jschwab21 committed
Commit 7963d98 · verified · 1 Parent(s): 058b473

Update video_processing.py

Files changed (1)
  1. video_processing.py +48 -60
video_processing.py CHANGED
@@ -105,31 +105,26 @@ def convert_timestamp_to_seconds(timestamp):
     h, m, s = map(float, timestamp.split(':'))
     return int(h) * 3600 + int(m) * 60 + s
 
-def extract_frame_at_time(video_clip, t):
-    return video_clip.get_frame(t / video_clip.fps)
-
 def extract_frames(video_path, start_time, end_time):
     video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
-    frame_times = range(0, int(video_clip.duration * video_clip.fps), int(video_clip.fps / 10))
-
-    frames = []
-    with ThreadPoolExecutor() as executor:
-        # Using threads to handle frame extraction
-        frames = list(executor.map(lambda t: extract_frame_at_time(video_clip, t), frame_times))
-
-    return frames
-
-def analyze_scene(params):
-    video_path, start_time, end_time, description = params
-    frames = extract_frames(video_path, start_time, end_time)
-    if not frames:
-        print(f"Scene: Start={start_time}, End={end_time} - No frames extracted")
-        return (start_time, end_time, None)  # Adjust as needed for error handling
-
-    scene_prob = 0.0
-    sentiment_distributions = np.zeros(8)  # Assuming there are 8 sentiments
-
-    # Preparing text inputs and features once per scene
+    return [video_clip.get_frame(t / video_clip.fps) for t in range(0, int(video_clip.duration * video_clip.fps), int(video_clip.fps / 10))]
+
+
+def analyze_frame(args):
+    frame, positive_feature, negative_features = args
+    image = Image.fromarray(frame[..., ::-1])
+    image_input = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        image_features = model.get_image_features(**image_input).detach()
+    positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
+    negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
+
+    scene_prob = positive_similarity - negative_similarities
+    frame_sentiments = classify_frame(frame)
+    return scene_prob, frame_sentiments
+
+def analyze_scenes(video_path, scenes, description):
+    scene_scores = []
     negative_descriptions = [
         "black screen",
         "Intro text for a video",
@@ -138,54 +133,47 @@ def analyze_scene(params):
         "A still shot of natural scenery",
         "Still-camera shot of a person's face"
     ]
+
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]
 
-    for frame in frames:
-        image = Image.fromarray(frame[..., ::-1])
-        image_input = processor(images=image, return_tensors="pt").to(device)
-        with torch.no_grad():
-            image_features = model.get_image_features(**image_input).detach()
-        positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
-        negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
-        scene_prob += positive_similarity - negative_similarities
-
-        frame_sentiments = classify_frame(frame)
-        sentiment_distributions += np.array(frame_sentiments)
-
-    if len(frames) > 0:
-        sentiment_distributions /= len(frames)  # Normalize to get average probabilities
-        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
-        scene_prob /= len(frames)
-        scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
-        print(f"Scene: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
-        return (start_time, end_time, scene_prob, scene_duration, sentiment_percentages)
-
-    return (start_time, end_time, None)  # Adjust as needed for error handling
+    tasks = []
+    for start_time, end_time in scenes:
+        frames = extract_frames(video_path, start_time, end_time)
+        for frame in frames:
+            tasks.append((frame, positive_feature, negative_features))
 
-from concurrent.futures import ProcessPoolExecutor
-
-def analyze_scenes(video_path, scenes, description):
-    scene_params = [(video_path, start, end, description) for start, end in scenes]
-
-    # Use ProcessPoolExecutor to handle multiprocessing
+    scene_results = {}
     with ProcessPoolExecutor() as executor:
-        results = list(executor.map(analyze_scene, scene_params))
-
-    # Process results to find the best scene
-    scene_scores = [result for result in results if result[2] is not None]  # Filter out scenes with no data
-    if scene_scores:
-        scene_scores.sort(reverse=True, key=lambda x: x[2])  # Sort scenes by confidence, highest first
-        top_3_scenes = scene_scores[:3]  # Get the top 3 scenes
-        best_scene = max(top_3_scenes, key=lambda x: x[3])  # Find the longest scene from these top 3
-        if best_scene:
-            print(f"Best Scene: Start={best_scene[0]}, End={best_scene[1]}, Probability={best_scene[2]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
-            return (best_scene[0], best_scene[1]), best_scene[4]  # Returning a tuple with scene times and sentiments
+        results = list(executor.map(analyze_frame, tasks))
+
+    for ((start_time, end_time), (scene_prob, sentiments)) in zip(scenes, results):
+        if (start_time, end_time) not in scene_results:
+            scene_results[(start_time, end_time)] = {
+                'probabilities': [],
+                'sentiments': np.zeros(8)
+            }
+        scene_results[(start_time, end_time)]['probabilities'].append(scene_prob)
+        scene_results[(start_time, end_time)]['sentiments'] += sentiments
+
+    # Calculate averages and prepare the final scores
+    for (start_time, end_time), data in scene_results.items():
+        avg_prob = np.mean(data['probabilities'])
+        avg_sentiments = data['sentiments'] / len(data['probabilities'])
+        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, avg_sentiments)}
+        scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
+        scene_scores.append((avg_prob, start_time, end_time, scene_duration, sentiment_percentages))
 
-    print("No suitable scene found")
-    return None, {}
+    # Sort and select the best scene
+    scene_scores.sort(reverse=True, key=lambda x: x[0])
+    top_3_scenes = scene_scores[:3]
+    best_scene = max(top_3_scenes, key=lambda x: x[3])
+
+    if best_scene:
+        return (best_scene[1], best_scene[2]), best_scene[4]
+    else:
+        return None, {}
 
 
 def extract_best_scene(video_path, scene):
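
One aggregation detail worth flagging in the new `analyze_scenes`: `executor.map(analyze_frame, tasks)` yields one result per frame, but `zip(scenes, results)` pairs those frame-level results with scenes one-to-one, so each scene is credited with only a single frame's score and the remaining frame results are dropped. A minimal sketch of one way to preserve the frame-to-scene mapping; `task_scenes` and `group_results_by_scene` are hypothetical names introduced here for illustration, not part of the commit:

```python
import numpy as np

# Hypothetical fix sketch: while building `tasks`, also record one
# (start_time, end_time) key per frame in a parallel list, e.g.
#     task_scenes.append((start_time, end_time))
# and then group the per-frame results back to their scene afterwards.
def group_results_by_scene(task_scenes, results, n_sentiments=8):
    scene_results = {}
    for scene_key, (scene_prob, sentiments) in zip(task_scenes, results):
        # setdefault creates the accumulator the first time a scene appears
        entry = scene_results.setdefault(
            scene_key, {'probabilities': [], 'sentiments': np.zeros(n_sentiments)})
        entry['probabilities'].append(scene_prob)
        entry['sentiments'] += np.array(sentiments)
    return scene_results
```

The averaging loop in the commit would then work unchanged on this dictionary. It is also worth double-checking that the globals `processor`, `model`, `device`, and `classify_frame` are importable in the worker processes and that the task tuples pickle cleanly, since `ProcessPoolExecutor` serializes every mapped argument.
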