Update video_processing.py
video_processing.py (CHANGED: +19, -13)
@@ -11,6 +11,8 @@ import uuid
 from torchvision import models, transforms
 from torch.nn import functional as F
 
+categories = ["Joy", "Trust", "Fear", "Surprise", "Sadness", "Disgust", "Anger", "Anticipation"]
+
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
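The new categories list is Plutchik's eight primary emotions. Note that processor is used later in the file but its construction sits outside the diff context; a minimal sketch of the module setup this commit assumes (the CLIPProcessor line is an assumption, not shown in the diff):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

categories = ["Joy", "Trust", "Fear", "Surprise", "Sadness", "Disgust", "Anger", "Anticipation"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
# Assumed: the file creates the matching processor somewhere above the diff.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")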
@@ -87,9 +89,10 @@ def extract_frames(video_path, start_time, end_time):
         frames.append(frame)
     return frames
 
+import numpy as np
+
 def analyze_scenes(video_path, scenes, description):
     scene_scores = []
-
     negative_descriptions = [
         "black screen",
         "Intro text for a video",
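extract_frames and convert_timestamp_to_seconds are called by this function but lie outside the diff. A hypothetical sketch of both, assuming OpenCV BGR frames (the frame[..., ::-1] channel flip below implies BGR) and "HH:MM:SS"-style timestamps; the num_frames parameter is an invention for the sketch:

import cv2

def convert_timestamp_to_seconds(timestamp):
    # Assumption: timestamps look like "HH:MM:SS" or "HH:MM:SS.mmm".
    h, m, s = timestamp.split(":")
    return int(h) * 3600 + int(m) * 60 + float(s)

def extract_frames(video_path, start_time, end_time, num_frames=10):
    # Sample up to num_frames BGR frames evenly across the clip.
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    start = int(convert_timestamp_to_seconds(start_time) * fps)
    end = int(convert_timestamp_to_seconds(end_time) * fps)
    frames = []
    step = max((end - start) // num_frames, 1)
    for idx in range(start, end, step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            break
        frames.append(frame)
    cap.release()
    return frames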
@@ -99,7 +102,6 @@ def analyze_scenes(video_path, scenes, description):
         "Still-camera shot of a person's face"
     ]
 
-    # Tokenize and encode the description text
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]
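The text batch packs the user description first, so text_features[0] is the positive prompt and the remaining rows are the negatives. A toy illustration of the margin scoring used below, with random tensors standing in for real CLIP embeddings (512 is ViT-B/32's projection width; the negative count is illustrative):

import torch

image_features = torch.randn(1, 512)  # one frame embedding
text_features = torch.randn(7, 512)   # row 0: description, rows 1..6: negatives
positive_feature, negative_features = text_features[0], text_features[1:]

pos = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).item()
neg = torch.cosine_similarity(image_features, negative_features).mean().item()
margin = pos - neg  # higher = frame matches the description more than the negatives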
@@ -111,6 +113,7 @@ def analyze_scenes(video_path, scenes, description):
             continue
 
         scene_prob = 0.0
+        sentiment_distributions = np.zeros(8)  # Assuming there are 8 sentiments
         for frame in frames:
             image = Image.fromarray(frame[..., ::-1])
             image_input = processor(images=image, return_tensors="pt").to(device)
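classify_frame, whose output is accumulated into sentiment_distributions in the next hunk, is also outside the diff. A plausible zero-shot sketch reusing the same CLIP model over the eight categories; the prompt template is an assumption:

def classify_frame(frame):
    # Hypothetical: zero-shot CLIP over the eight emotion categories.
    labels = [f"a scene conveying {c.lower()}" for c in categories]
    image = Image.fromarray(frame[..., ::-1])  # BGR -> RGB, as elsewhere in the file
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        logits_per_image = model(**inputs).logits_per_image  # shape (1, 8)
    return logits_per_image.softmax(dim=-1).squeeze(0).tolist()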
@@ -119,27 +122,30 @@ def analyze_scenes(video_path, scenes, description):
             positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
             negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
             scene_prob += positive_similarity - negative_similarities
-
+
+            frame_sentiments = classify_frame(frame)
+            sentiment_distributions += np.array(frame_sentiments)
 
+        sentiment_distributions /= len(frames)  # Normalize to get average probabilities
+        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
         scene_prob /= len(frames)
         scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
-        print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}")
+        print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
 
-        scene_scores.append((scene_prob, start_time, end_time, scene_duration))
+        scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
 
-    # Sort scenes by probability
+    # Sort scenes by probability and select the best scene
     scene_scores.sort(reverse=True, key=lambda x: x[0])
-
+    best_scene = max(scene_scores, key=lambda x: x[3])  # Select based on duration among the top scenes
 
-
-
-
-    if longest_scene:
-        print(f"Longest Scene: Start={longest_scene[1]}, End={longest_scene[2]}, Probability={longest_scene[0]}, Duration={longest_scene[3]}")
+    if best_scene:
+        print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
     else:
         print("No suitable scene found")
 
-    return
+    return best_scene[1:3] if best_scene else None
+
+
 
 def extract_best_scene(video_path, scene):
     if scene is None:
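Two caveats in the selection logic above: the inline comment says the duration pick happens "among the top scenes", but max(scene_scores, ...) scans every scene, so the preceding sort has no effect on the result; and max raises ValueError on an empty list, so the else branch can never run (a non-empty 5-tuple is always truthy anyway). A guarded variant that actually restricts to the top-N by probability (N=3 is an arbitrary choice for the sketch):

top_scenes = scene_scores[:3]  # already sorted by probability, descending
best_scene = max(top_scenes, key=lambda x: x[3]) if top_scenes else None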
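For context, a hypothetical call site, assuming scenes arrive as (start, end) timestamp pairs and that extract_best_scene accepts the pair returned here:

scenes = [("00:00:05", "00:00:12"), ("00:00:40", "00:01:02")]
best = analyze_scenes("movie.mp4", scenes, "a person dancing on stage")
if best is not None:
    start_time, end_time = best  # analyze_scenes returns (start, end) or None
    extract_best_scene("movie.mp4", (start_time, end_time))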