youssef committed on
Commit
0820857
·
1 Parent(s): fb1b414
Files changed (2) hide show
  1. src/app.py +19 -5
  2. src/video_processor/processor.py +106 -50
src/app.py CHANGED
@@ -50,11 +50,25 @@ def on_process(video):
50
  ]
51
 
52
  logger.info(f"Processing video: {video}")
53
- result = analyzer.process_video(video)
54
- description = result[0]["description"]
 
55
 
56
- # Format output
57
- formatted_desc = f"### Analysis:\n{description}"
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  yield [
60
  "Processing complete!",
@@ -76,7 +90,7 @@ def on_process(video):
76
  # Create Gradio interface
77
  with gr.Blocks() as demo:
78
  gr.Markdown("# SmolVLM Video Analyzer")
79
- gr.Markdown("Upload a video to get a detailed analysis of its content.")
80
 
81
  with gr.Row():
82
  with gr.Column(scale=1):
 
50
  ]
51
 
52
  logger.info(f"Processing video: {video}")
53
+ segments = []
54
+ duration = analyzer.get_video_duration_seconds(video)
55
+ total_segments = int(duration / 10) # Using default 10-second segments
56
 
57
+ # Process video segments
58
+ for i, segment in enumerate(analyzer.process_video(video)):
59
+ segments.append(segment)
60
+ progress = int((i + 1) / total_segments * 100)
61
+
62
+ # Format current segments
63
+ formatted_desc = "### Video Analysis by Segments:\n\n"
64
+ for seg in segments:
65
+ formatted_desc += f"**[{seg['timestamp']}]** {seg['description']}\n\n"
66
+
67
+ yield [
68
+ f"Processing segments... {progress}% complete",
69
+ formatted_desc,
70
+ gr.update(visible=True)
71
+ ]
72
 
73
  yield [
74
  "Processing complete!",
 
90
  # Create Gradio interface
91
  with gr.Blocks() as demo:
92
  gr.Markdown("# SmolVLM Video Analyzer")
93
+ gr.Markdown("Upload a video to get a detailed analysis of its content, split into segments with timestamps.")
94
 
95
  with gr.Row():
96
  with gr.Column(scale=1):
src/video_processor/processor.py CHANGED
@@ -2,6 +2,10 @@ import torch
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from typing import List, Dict
4
  import logging
 
 
 
 
5
 
6
  logger = logging.getLogger(__name__)
7
 
@@ -12,6 +16,24 @@ def _grab_best_device(use_gpu=True):
12
  device = "cpu"
13
  return device
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  DEVICE = _grab_best_device()
16
 
17
  logger.info(f"Using device: {DEVICE}")
@@ -35,60 +57,94 @@ class VideoAnalyzer:
35
  ).to(DEVICE)
36
  logger.info(f"Model loaded on device: {self.model.device}")
37
 
38
- def process_video(self, video_path: str, frame_interval: int = 30) -> List[Dict]:
39
- logger.info(f"Processing video: {video_path} with frame_interval={frame_interval}")
40
- try:
41
- # Create message for model with detailed system prompt
42
- messages = [
43
- {
44
- "role": "system",
45
- "content": [
46
- {
47
- "type": "text",
48
- "text": "You are a detailed video analysis assistant that can understand videos. Your task is to provide comprehensive descriptions including all events, actions, and important details with their timestamps. Focus on being specific and thorough."
49
- }
50
- ]
51
- },
52
- {
53
- "role": "user",
54
- "content": [
55
- {"type": "video", "path": video_path},
56
- {"type": "text", "text": "Please provide a detailed analysis of this video. Include:\n1. All significant actions and events\n2. Temporal information and timestamps\n3. Important visual details and context\n4. Any text or speech content if present\n5. Scene transitions and changes\nBe thorough and specific so the description can be used for detailed searching later."}
57
- ]
58
- }
59
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
- logger.info(f"Applying chat template - message: {messages}")
 
 
 
 
62
 
63
- # Process video using chat template
64
- inputs = self.processor.apply_chat_template(
65
- messages,
66
- add_generation_prompt=True,
67
- tokenize=True,
68
- return_dict=True,
69
- return_tensors="pt"
70
- ).to(DEVICE, dtype=torch.bfloat16)
71
 
72
- logger.info(f"Generating IDs")
73
- # Generate description with increased token limit
74
- generated_ids = self.model.generate(
75
- **inputs,
76
- do_sample=True,
77
- temperature=0.7,
78
- max_new_tokens=512 # Increased from 100 to get more detailed descriptions
79
- )
80
-
81
- logger.info(f"batch decoding...")
82
- description = self.processor.batch_decode(
83
- generated_ids,
84
- skip_special_tokens=True
85
- )[0]
86
-
87
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- return [{
90
- "description": description.split("Assistant: ")[-1] # Remove assistant prefix if present
91
- }]
92
 
93
  except Exception as e:
94
  logger.error(f"Error processing video: {str(e)}", exc_info=True)
 
2
  from transformers import AutoProcessor, AutoModelForImageTextToText
3
  from typing import List, Dict
4
  import logging
5
+ import os
6
+ import subprocess
7
+ import json
8
+ import tempfile
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
16
  device = "cpu"
17
  return device
18
 
19
def get_video_duration_seconds(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, as reported by ffprobe.

    Args:
        video_path: Path to a media file readable by ffprobe.

    Returns:
        Duration in seconds.

    Raises:
        RuntimeError: If ffprobe exits non-zero or its output lacks a
            duration. (Previously a failed probe surfaced as an opaque
            JSONDecodeError/KeyError because the return code was ignored.)
    """
    cmd = [
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        video_path
    ]
    # shell=False with an argument list: video_path is never shell-interpreted.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"ffprobe failed for {video_path!r}: {result.stderr.strip()}"
        )
    try:
        info = json.loads(result.stdout)
        return float(info["format"]["duration"])
    except (json.JSONDecodeError, KeyError, ValueError) as e:
        raise RuntimeError(
            f"Could not parse ffprobe output for {video_path!r}"
        ) from e
31
+
32
def format_duration(seconds: int) -> str:
    """Format a duration as a zero-padded "MM:SS" timestamp string.

    Args:
        seconds: Duration in seconds. Floats are accepted and truncated
            toward zero, so raw ffprobe durations can be passed directly
            (generalization; int input behaves exactly as before).

    Returns:
        A "MM:SS" string. Minutes are not wrapped at 60, so long videos
        render as e.g. "75:30" — acceptable for segment timestamps here.
    """
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
36
+
37
  DEVICE = _grab_best_device()
38
 
39
  logger.info(f"Using device: {DEVICE}")
 
57
  ).to(DEVICE)
58
  logger.info(f"Model loaded on device: {self.model.device}")
59
 
60
+ def analyze_segment(self, video_path: str, start_time: float) -> str:
61
+ """Analyze a single video segment."""
62
+ messages = [
63
+ {
64
+ "role": "system",
65
+ "content": [{"type": "text", "text": """You are a detailed video analysis assistant with expertise in scene description. Your task is to:
66
+ 1. Describe the visual content with precise details
67
+ 2. Note any significant actions or movements
68
+ 3. Describe important objects, people, or elements in the scene
69
+ 4. Capture the mood, atmosphere, or emotional content if present
70
+ 5. Mention any scene transitions or camera movements
71
+ Be specific and thorough, but focus only on what is visually present in this segment."""}]
72
+ },
73
+ {
74
+ "role": "user",
75
+ "content": [
76
+ {"type": "video", "path": video_path},
77
+ {"type": "text", "text": """Describe this video segment in detail. Focus on:
78
+ - What objects, people, or elements are visible?
79
+ - What actions or movements are occurring?
80
+ - What is the setting or environment?
81
+ - Are there any notable visual effects or transitions?
82
+ - What is the overall mood or atmosphere?
83
+ Be specific about visual details but stay concise."""}
84
+ ]
85
+ }
86
+ ]
87
+
88
+ inputs = self.processor.apply_chat_template(
89
+ messages,
90
+ add_generation_prompt=True,
91
+ tokenize=True,
92
+ return_dict=True,
93
+ return_tensors="pt"
94
+ ).to(DEVICE, dtype=torch.bfloat16)
95
+
96
+ outputs = self.model.generate(
97
+ **inputs,
98
+ do_sample=True,
99
+ temperature=0.7,
100
+ max_new_tokens=256
101
+ )
102
+ return self.processor.batch_decode(outputs, skip_special_tokens=True)[0].split("Assistant: ")[-1]
103
 
104
+ def process_video(self, video_path: str, segment_length: int = 10) -> List[Dict]:
105
+ try:
106
+ # Create temp directory for segments
107
+ temp_dir = tempfile.mkdtemp()
108
+ segments_info = []
109
 
110
+ # Get video duration
111
+ duration = get_video_duration_seconds(video_path)
 
 
 
 
 
 
112
 
113
+ # Process video in segments
114
+ for start_time in range(0, int(duration), segment_length):
115
+ end_time = min(start_time + segment_length, duration)
116
+
117
+ # Create segment
118
+ segment_path = os.path.join(temp_dir, f"segment_{start_time}.mp4")
119
+ cmd = [
120
+ "ffmpeg",
121
+ "-y",
122
+ "-i", video_path,
123
+ "-ss", str(start_time),
124
+ "-t", str(segment_length),
125
+ "-c:v", "libx264",
126
+ "-preset", "ultrafast",
127
+ "-pix_fmt", "yuv420p",
128
+ segment_path
129
+ ]
130
+ subprocess.run(cmd, check=True)
131
+
132
+ # Analyze segment
133
+ description = self.analyze_segment(segment_path, start_time)
134
+
135
+ # Add segment info with timestamp
136
+ segments_info.append({
137
+ "timestamp": format_duration(start_time),
138
+ "description": description
139
+ })
140
+
141
+ # Clean up segment file
142
+ os.remove(segment_path)
143
+
144
+ # Clean up temp directory
145
+ os.rmdir(temp_dir)
146
 
147
+ return segments_info
 
 
148
 
149
  except Exception as e:
150
  logger.error(f"Error processing video: {str(e)}", exc_info=True)