Spaces:
Sleeping
Sleeping
Update video_processing.py
Browse files
- video_processing.py +9 -4
video_processing.py
CHANGED
@@ -104,6 +104,12 @@ def analyze_scenes(video_path, scenes, description):
|
|
104 |
#"Still-camera shot of a person's face"
|
105 |
]
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
|
108 |
text_features = model.get_text_features(**text_inputs).detach()
|
109 |
positive_feature, negative_features = text_features[0], text_features[1:]
|
@@ -119,10 +125,10 @@ def analyze_scenes(video_path, scenes, description):
|
|
119 |
scene_prob = 0.0
|
120 |
sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
|
121 |
for frame in frames:
|
122 |
-
|
123 |
-
|
124 |
with torch.no_grad():
|
125 |
-
image_features = model.get_image_features(
|
126 |
positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
|
127 |
negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
|
128 |
scene_prob += positive_similarity - negative_similarities
|
@@ -145,7 +151,6 @@ def analyze_scenes(video_path, scenes, description):
|
|
145 |
top_3_scenes = scene_scores[:3] # Get the top 3 scenes
|
146 |
best_scene = max(top_3_scenes, key=lambda x: x[3]) # Find the longest scene from these top 3
|
147 |
|
148 |
-
|
149 |
if best_scene:
|
150 |
print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
|
151 |
return (best_scene[1], best_scene[2]), best_scene[4] # Returning a tuple with scene times and sentiments
|
|
|
104 |
#"Still-camera shot of a person's face"
|
105 |
]
|
106 |
|
107 |
+
preprocess = transforms.Compose([
|
108 |
+
transforms.ToTensor(), # Directly convert numpy arrays to tensors
|
109 |
+
transforms.Resize((224, 224)), # Resize the tensor
|
110 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize the tensor
|
111 |
+
])
|
112 |
+
|
113 |
text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
|
114 |
text_features = model.get_text_features(**text_inputs).detach()
|
115 |
positive_feature, negative_features = text_features[0], text_features[1:]
|
|
|
125 |
scene_prob = 0.0
|
126 |
sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
|
127 |
for frame in frames:
|
128 |
+
# Directly preprocess the frame
|
129 |
+
frame_tensor = preprocess(frame).unsqueeze(0).to(device) # Add batch dimension and send to device
|
130 |
with torch.no_grad():
|
131 |
+
image_features = model.get_image_features(pixel_values=frame_tensor).detach()
|
132 |
positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
|
133 |
negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
|
134 |
scene_prob += positive_similarity - negative_similarities
|
|
|
151 |
top_3_scenes = scene_scores[:3] # Get the top 3 scenes
|
152 |
best_scene = max(top_3_scenes, key=lambda x: x[3]) # Find the longest scene from these top 3
|
153 |
|
|
|
154 |
if best_scene:
|
155 |
print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
|
156 |
return (best_scene[1], best_scene[2]), best_scene[4] # Returning a tuple with scene times and sentiments
|