Update video_processing.py
video_processing.py  CHANGED  (+48 -60)
@@ -105,31 +105,26 @@ def convert_timestamp_to_seconds(timestamp):
     h, m, s = map(float, timestamp.split(':'))
     return int(h) * 3600 + int(m) * 60 + s

-def extract_frame_at_time(video_clip, t):
-    return video_clip.get_frame(t / video_clip.fps)
-
 def extract_frames(video_path, start_time, end_time):
     video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
-    ...
-    frames = []
-    with ThreadPoolExecutor() as executor:
-        # Using threads to handle frame extraction
-        frames = list(executor.map(lambda t: extract_frame_at_time(video_clip, t), frame_times))
+    return [video_clip.get_frame(t / video_clip.fps) for t in range(0, int(video_clip.duration * video_clip.fps), int(video_clip.fps / 10))]

-    return frames

-def analyze_scene(params):
-    ...
+def analyze_frame(args):
+    frame, positive_feature, negative_features = args
+    image = Image.fromarray(frame[..., ::-1])
+    image_input = processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        image_features = model.get_image_features(**image_input).detach()
+    positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
+    negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
+
+    scene_prob = positive_similarity - negative_similarities
+    frame_sentiments = classify_frame(frame)
+    return scene_prob, frame_sentiments

-    ...
+def analyze_scenes(video_path, scenes, description):
+    scene_scores = []
     negative_descriptions = [
         "black screen",
         "Intro text for a video",
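A quick sanity check on the sampling stride used by the new extract_frames (a minimal sketch; the 30 fps frame rate and 4 s duration are assumed example values, not anything this commit specifies):

fps = 30.0      # assumed example frame rate
duration = 4.0  # assumed example clip length in seconds
step = int(fps / 10)                                     # stride used by the new extract_frames
sample_count = len(range(0, int(duration * fps), step))  # number of frames it would return
print(step, sample_count)                                # 3 40 -> roughly ten sampled frames per second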
@@ -138,54 +133,47 @@ def analyze_scene(params):
         "A still shot of natural scenery",
         "Still-camera shot of a person's face"
     ]
+
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]

-    ...
-        positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
-        negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
-        scene_prob += positive_similarity - negative_similarities
-
-        frame_sentiments = classify_frame(frame)
-        sentiment_distributions += np.array(frame_sentiments)
-
-    if len(frames) > 0:
-        sentiment_distributions /= len(frames)  # Normalize to get average probabilities
-        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
-        scene_prob /= len(frames)
-        scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
-        print(f"Scene: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
-        return (start_time, end_time, scene_prob, scene_duration, sentiment_percentages)
-
-    return (start_time, end_time, None)  # Adjust as needed for error handling
+    tasks = []
+    task_scenes = []  # track which (start, end) scene each frame-level task belongs to
+    for start_time, end_time in scenes:
+        frames = extract_frames(video_path, start_time, end_time)
+        for frame in frames:
+            tasks.append((frame, positive_feature, negative_features))
+            task_scenes.append((start_time, end_time))

-
-
-def analyze_scenes(video_path, scenes, description):
-    scene_params = [(video_path, start, end, description) for start, end in scenes]
-
-    # Use ProcessPoolExecutor to handle multiprocessing
+    scene_results = {}
     with ProcessPoolExecutor() as executor:
-        results = list(executor.map(
-            ...
+        results = list(executor.map(analyze_frame, tasks))
+
+    # Aggregate the per-frame results back onto their scenes
+    for ((start_time, end_time), (scene_prob, sentiments)) in zip(task_scenes, results):
+        if (start_time, end_time) not in scene_results:
+            scene_results[(start_time, end_time)] = {
+                'probabilities': [],
+                'sentiments': np.zeros(8)
+            }
+        scene_results[(start_time, end_time)]['probabilities'].append(scene_prob)
+        scene_results[(start_time, end_time)]['sentiments'] += sentiments
+
+    # Calculate averages and prepare the final scores
+    for (start_time, end_time), data in scene_results.items():
+        avg_prob = np.mean(data['probabilities'])
+        avg_sentiments = data['sentiments'] / len(data['probabilities'])
+        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, avg_sentiments)}
+        scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
+        scene_scores.append((avg_prob, start_time, end_time, scene_duration, sentiment_percentages))

-    ...
+    # Sort and select the best scene
+    scene_scores.sort(reverse=True, key=lambda x: x[0])
+    top_3_scenes = scene_scores[:3]
+    best_scene = max(top_3_scenes, key=lambda x: x[3]) if top_3_scenes else None

+    if best_scene:
+        return (best_scene[1], best_scene[2]), best_scene[4]
+    else:
+        return None, {}


 def extract_best_scene(video_path, scene):
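For reference, a minimal usage sketch of the reworked analyze_scenes entry point, assuming the module-level processor, model, device, classify_frame, and categories objects referenced above are already initialised; the file name, timestamps, and description below are placeholders:

scenes = [("00:00:05", "00:00:12"), ("00:01:30", "00:01:45")]  # (start, end) timestamp pairs
best_scene, sentiment_percentages = analyze_scenes(
    "example.mp4",                        # placeholder video path
    scenes,
    "A person speaking to the camera"     # placeholder positive description
)
if best_scene:
    start_time, end_time = best_scene
    print(f"Best scene: {start_time} -> {end_time}", sentiment_percentages)

With this change analyze_scenes returns a single ((start, end), sentiment_percentages) pair, or (None, {}) when no scene qualifies.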