Update video_processing.py

video_processing.py  CHANGED  +13 -18
@@ -104,15 +104,15 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
         #"Still-camera shot of a person's face"
     ]
     preprocess = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Resize((224, 224)),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        transforms.ToTensor(),
+        transforms.Resize((224, 224)),
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
     ])
 
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]
-    print("Negative features shape:", negative_features)
+    print("Negative features shape:", negative_features.shape)
     video = VideoFileClip(video_path)
 
     for scene_num, (start_time, end_time) in enumerate(scenes):
@@ -121,10 +121,9 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
            print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time} - No frames extracted")
            continue
 
-        # Create batches of frames for processing
        batches = [frames[i:i + batch_size] for i in range(0, len(frames), batch_size)]
        scene_prob = 0.0
-        sentiment_distributions = np.zeros(8)
+        sentiment_distributions = np.zeros(8)
 
        for batch in batches:
            batch_tensors = torch.stack([preprocess(frame) for frame in batch]).to(device)
@@ -132,33 +131,29 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
            image_features = model.get_image_features(pixel_values=batch_tensors).detach()
            print("Image Features Shape:", image_features.shape)
 
-            positive_similarities = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0))
-
-
+            positive_similarities = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0).expand_as(image_features))
+            negative_mean = negative_features.mean(dim=0).unsqueeze(0).expand_as(image_features)
+            negative_similarities = torch.cosine_similarity(image_features, negative_mean)
+            scene_prob += (positive_similarities.mean().item() - negative_similarities.mean().item())
 
-            # Sum up the sentiments for all frames in the batch
            for frame in batch:
                frame_sentiments = classify_frame(frame)
                sentiment_distributions += np.array(frame_sentiments)
 
-        sentiment_distributions /= len(frames)
+        sentiment_distributions /= len(frames)
        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
        scene_prob /= len(frames)
        scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
        print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
 
        scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
-
-    # Sort scenes by confidence, highest first
    scene_scores.sort(reverse=True, key=lambda x: x[0])
-
-
-    top_3_scenes = scene_scores[:3]  # Get the top 3 scenes
-    best_scene = max(top_3_scenes, key=lambda x: x[3])  # Find the longest scene from these top 3
+    top_3_scenes = scene_scores[:3]
+    best_scene = max(top_3_scenes, key=lambda x: x[3])
 
    if best_scene:
        print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
-        return (best_scene[1], best_scene[2]), best_scene[4]
+        return (best_scene[1], best_scene[2]), best_scene[4]
    else:
        print("No suitable scene found")
        return None, {}
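The substantive change is the scoring step: each batch of frames is now scored by its cosine similarity to the positive description minus its similarity to the mean of the negative descriptions, accumulated into scene_prob. Below is a minimal standalone sketch of that step; the helper name clip_scene_score and the random tensors are illustrative only, and the 512-dim shapes assume CLIP-style embeddings such as those returned by transformers' CLIPModel.

import torch

def clip_scene_score(image_features, positive_feature, negative_features):
    # image_features:    (B, D) frame embeddings for one batch
    # positive_feature:  (D,)   embedding of the target description
    # negative_features: (N, D) embeddings of the negative descriptions
    positive_similarities = torch.cosine_similarity(
        image_features, positive_feature.unsqueeze(0).expand_as(image_features))
    negative_mean = negative_features.mean(dim=0).unsqueeze(0).expand_as(image_features)
    negative_similarities = torch.cosine_similarity(image_features, negative_mean)
    # Higher when frames match the description and not the negatives.
    return positive_similarities.mean().item() - negative_similarities.mean().item()

# Illustrative call: 4 frames, 512-dim embeddings, 3 negative prompts.
print(clip_scene_score(torch.randn(4, 512), torch.randn(512), torch.randn(3, 512)))

After all scenes are scored, the function sorts by this score and returns the longest of the top three scenes, trading a little confidence for duration.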