youssef committed on
Commit b841197 · 1 Parent(s): 11484b5

use cuda for ffmpeg

Files changed (3)
  1. Dockerfile +2 -0
  2. src/app.py +16 -9
  3. src/video_processor/processor.py +65 -21
Dockerfile CHANGED
@@ -23,6 +23,8 @@ RUN apt-get update && \
     liblzma-dev \
     # gradio dependencies \
     ffmpeg \
+    # NVIDIA Video Codec SDK \
+    libnvidia-encode-12-3 \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
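A quick way to confirm that the ffmpeg build inside the image can actually see the NVENC encoder enabled by libnvidia-encode-12-3 (a sanity-check sketch, not part of this commit; it assumes the container is started with GPU access):

import subprocess

# List the encoders compiled into ffmpeg and look for NVIDIA's H.264 encoder.
encoders = subprocess.run(
    ["ffmpeg", "-hide_banner", "-encoders"],
    capture_output=True, text=True, check=True,
).stdout
print("h264_nvenc available:", "h264_nvenc" in encoders)

Listing only proves compile-time support; a short test encode is still needed to verify that the driver and libnvidia-encode can open an NVENC session.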
 
src/app.py CHANGED
@@ -70,18 +70,21 @@ def on_process(video):
 
     # Process segments and show progress
     segments = []
-    total_processing_time = 0
+    total_ffmpeg_time = 0
+    total_inference_time = 0
 
     for i, segment in enumerate(analyzer.process_video(video)):
-        segment_start = time.time()
         segments.append(segment)
-        segment_time = time.time() - segment_start
-        total_processing_time += segment_time
+
+        # Update timing totals
+        total_ffmpeg_time += segment['processing_times']['ffmpeg']
+        total_inference_time += segment['processing_times']['inference']
 
         progress = int((i + 1) / total_segments * 100)
-        avg_time_per_segment = total_processing_time / (i + 1)
+        avg_ffmpeg_time = total_ffmpeg_time / (i + 1)
+        avg_inference_time = total_inference_time / (i + 1)
         remaining_segments = total_segments - (i + 1)
-        estimated_remaining = remaining_segments * avg_time_per_segment
+        estimated_remaining = remaining_segments * (avg_ffmpeg_time + avg_inference_time)
 
         # Format current segments
         formatted_desc = "### Video Analysis by Segments:\n\n"
@@ -90,8 +93,9 @@ def on_process(video):
 
         yield [
             f"Processing segments... {progress}% complete\n" +
-            f"Segment {i+1}/{total_segments} processed in {segment_time:.2f}s\n" +
-            f"Average time per segment: {avg_time_per_segment:.2f}s\n" +
+            f"Segment {i+1}/{total_segments}\n" +
+            f"FFmpeg processing: {segment['processing_times']['ffmpeg']:.2f}s (avg: {avg_ffmpeg_time:.2f}s)\n" +
+            f"Model inference: {segment['processing_times']['inference']:.2f}s (avg: {avg_inference_time:.2f}s)\n" +
             f"Estimated time remaining: {estimated_remaining:.2f}s",
             formatted_desc,
             gr.update(visible=True)
@@ -101,7 +105,10 @@ def on_process(video):
     yield [
         f"Processing complete!\n" +
         f"Total processing time: {total_time:.2f}s\n" +
-        f"Average time per segment: {total_processing_time/total_segments:.2f}s",
+        f"Average per segment:\n" +
+        f"  - FFmpeg: {total_ffmpeg_time/total_segments:.2f}s\n" +
+        f"  - Inference: {total_inference_time/total_segments:.2f}s\n" +
+        f"  - Total: {(total_ffmpeg_time + total_inference_time)/total_segments:.2f}s",
        formatted_desc,
        gr.update(visible=True)
    ]
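The progress text above derives its time-remaining estimate from running averages of the per-segment ffmpeg and inference times. A minimal sketch of the same arithmetic with illustrative numbers (the function name is hypothetical, not part of app.py):

def estimate_remaining(ffmpeg_times, inference_times, total_segments):
    # Average each stage over the segments processed so far, then assume
    # the remaining segments cost the same on average.
    done = len(ffmpeg_times)
    avg_ffmpeg = sum(ffmpeg_times) / done
    avg_inference = sum(inference_times) / done
    return (total_segments - done) * (avg_ffmpeg + avg_inference)

# 2 of 5 segments done, ffmpeg averaging 1.5 s and inference 3.0 s per segment
# -> 3 remaining segments * 4.5 s = 13.5 s estimated.
print(estimate_remaining([1.4, 1.6], [2.9, 3.1], 5))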
src/video_processor/processor.py CHANGED
@@ -1,11 +1,12 @@
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText
-from typing import List, Dict
+from typing import List, Dict, Generator
 import logging
 import os
 import subprocess
 import json
 import tempfile
+import time
 
 logger = logging.getLogger(__name__)
 
@@ -44,7 +45,7 @@ class VideoAnalyzer:
             raise RuntimeError("CUDA is required but not available!")
 
         logger.info("Initializing VideoAnalyzer")
-        self.model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+        self.model_path = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
         logger.info(f"Loading model from {self.model_path} - Using device: {DEVICE}")
 
         # Load processor and model
@@ -53,6 +54,7 @@ class VideoAnalyzer:
         self.model = AutoModelForImageTextToText.from_pretrained(
             self.model_path,
             torch_dtype=torch.bfloat16,
+            device_map=DEVICE,
             _attn_implementation="flash_attention_2"
         ).to(DEVICE)
         logger.info(f"Model loaded on device: {self.model.device}")
@@ -101,20 +103,19 @@ Be specific about visual details but stay concise."""}
         )
         return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
 
-    def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
+    def process_video(self, video_path: str, segment_length: int = 10) -> Generator[Dict, None, None]:
         try:
             # Create temp directory for segments
             temp_dir = tempfile.mkdtemp()
-            segments_info = []
 
             # Get video duration
             duration = get_video_duration_seconds(video_path)
-            segments_processed = 0
-            total_segments = int(duration / segment_length)
+            total_segments = (int(duration) + segment_length - 1) // segment_length
             logger.info(f"Processing {total_segments} segments for video of length {duration:.2f} seconds")
 
             # Process video in segments
             for segment_idx in range(total_segments):
+                segment_start_time = time.time()
                 start_time = segment_idx * segment_length
                 end_time = min(start_time + segment_length, duration)
 
@@ -122,40 +123,83 @@ Be specific about visual details but stay concise."""}
                 if start_time >= duration:
                     break
 
-                # Create segment
+                # Create segment - Optimized ffmpeg settings
                 segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
                 cmd = [
                     "ffmpeg",
-                    "-y",
+                    "-y",                              # Overwrite output files
+                    "-hwaccel", "cuda",                # Use CUDA hardware acceleration
+                    "-hwaccel_output_format", "cuda",  # Keep frames in GPU memory
+                    "-threads", "0",                   # Use all available CPU threads
+                    "-thread_type", "frame",           # Frame-level multi-threading
                     "-i", video_path,
-                    "-ss", str(start_time),
-                    "-t", str(end_time - start_time), # Duration of this segment
-                    "-c:v", "libx264",
-                    "-preset", "ultrafast",
-                    "-pix_fmt", "yuv420p",
+                    "-ss", str(start_time),            # Seek position
+                    "-t", str(end_time - start_time),  # Duration
+                    "-c:v", "h264_nvenc",              # Use NVIDIA hardware encoder
+                    "-preset", "p1",                   # Lowest latency preset for NVENC
+                    "-tune", "ll",                     # Low latency tuning
+                    "-rc", "vbr",                      # Variable bitrate mode
+                    "-cq", "28",                       # Quality-based VBR
+                    "-b:v", "0",                       # Let VBR control bitrate
+                    "-vf", "scale_cuda=640:-2",        # GPU-accelerated scaling
+                    "-an",                             # Remove audio
                     segment_path
                 ]
-                subprocess.run(cmd, check=True)
+
+                ffmpeg_start = time.time()
+                try:
+                    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+                    logger.debug(f"FFmpeg output: {result.stderr}")
+                except subprocess.CalledProcessError as e:
+                    logger.error(f"FFmpeg error: {e.stderr}")
+                    # Fallback to CPU if GPU encoding fails
+                    logger.warning("Falling back to CPU encoding")
+                    cmd = [
+                        "ffmpeg",
+                        "-y",
+                        "-threads", "0",
+                        "-i", video_path,
+                        "-ss", str(start_time),
+                        "-t", str(end_time - start_time),
+                        "-c:v", "libx264",
+                        "-preset", "ultrafast",
+                        "-tune", "fastdecode",
+                        "-crf", "28",
+                        "-vf", "scale=640:-2",
+                        "-an",
+                        "-pix_fmt", "yuv420p",
+                        segment_path
+                    ]
+                    subprocess.run(cmd, check=True, capture_output=True)
+                ffmpeg_time = time.time() - ffmpeg_start
 
                 # Analyze segment
+                inference_start = time.time()
                 description = self.analyze_segment(segment_path, start_time)
+                inference_time = time.time() - inference_start
 
                 # Add segment info with timestamp
-                segments_info.append({
-                    "timestamp": format_duration(start_time),
-                    "description": description
-                })
+                yield {
+                    "timestamp": format_duration(int(start_time)),
+                    "description": description,
+                    "processing_times": {
+                        "ffmpeg": ffmpeg_time,
+                        "inference": inference_time,
+                        "total": time.time() - segment_start_time
+                    }
+                }
 
                 # Clean up segment file
                 os.remove(segment_path)
 
-                logger.info(f"Processed segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s)")
+                logger.info(
+                    f"Segment {segment_idx + 1}/{total_segments} ({start_time}-{end_time}s) - "
+                    f"FFmpeg: {ffmpeg_time:.2f}s, Inference: {inference_time:.2f}s"
+                )
 
             # Clean up temp directory
             os.rmdir(temp_dir)
 
-            return segments_info
-
         except Exception as e:
             logger.error(f"Error processing video: {str(e)}", exc_info=True)
             raise
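Note that total_segments now uses ceiling division, so a 25 s video with 10 s segments yields 3 segments instead of 2. The new process_video also builds each segment with NVENC first and falls back to libx264 only if the GPU command fails. The same idea can be exercised outside the class with a standalone helper (a sketch assuming ffmpeg is on PATH; clip_segment and its reduced flag set are illustrative, not taken from processor.py):

import subprocess

def clip_segment(src: str, dst: str, start: float, length: float) -> None:
    # Hypothetical helper mirroring the commit's GPU-first, CPU-fallback pattern.
    gpu_cmd = ["ffmpeg", "-y", "-hwaccel", "cuda", "-i", src,
               "-ss", str(start), "-t", str(length),
               "-c:v", "h264_nvenc", "-an", dst]
    cpu_cmd = ["ffmpeg", "-y", "-i", src,
               "-ss", str(start), "-t", str(length),
               "-c:v", "libx264", "-preset", "ultrafast", "-an", dst]
    try:
        subprocess.run(gpu_cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError:
        # NVENC unavailable (no GPU, missing driver, or ffmpeg built without nvenc).
        subprocess.run(cpu_cmd, check=True, capture_output=True)

# clip_segment("input.mp4", "segment_0.mp4", start=0, length=10)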