Spaces:

jschwab21
/

SickstadiumAI

Sleeping

App Files Files Community

jschwab21 commited on Jun 6, 2024

Commit

da8565f

verified ·

1 Parent(s): f9d282c

Update video_processing.py

Browse files

Files changed (1) hide show

video_processing.py +32 -20

video_processing.py CHANGED Viewed

@@ -104,6 +104,7 @@ def analyze_scenes(video_path, scenes, description):
         "A still shot of natural scenery",
         "Still-camera shot of a person's face"
     ]
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]
@@ -115,34 +116,45 @@ def analyze_scenes(video_path, scenes, description):
             continue
         scene_prob = 0.0
-        sentiment_distributions = np.zeros(8)  # Assuming 8 sentiments
         for frame in frames:
             frame_sentiments = classify_frame(frame)
             sentiment_distributions += np.array(frame_sentiments)
-        sentiment_distributions /= len(frames)  # Average probabilities
         scene_prob /= len(frames)
         scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
-        sentiment_percentages = {categories[i]: round(sentiment_distributions[i] * 100, 2) for i in range(len(categories))}
-        scene_scores.append({
-            'probability': scene_prob,
-            'start_time': start_time,
-            'end_time': end_time,
-            'duration': scene_duration,
-            'sentiments': sentiment_percentages
-        })
-    best_scene = max(scene_scores, key=lambda x: (x['probability'], x['duration'])) if scene_scores else None
-    return best_scene
-def extract_best_scene(video_path, scene_data):
-    if not scene_data:
         return None
-    start_time = scene_data['start_time']
-    end_time = scene_data['end_time']
     start_seconds = convert_timestamp_to_seconds(start_time)
     end_seconds = convert_timestamp_to_seconds(end_time)
     video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)

         "A still shot of natural scenery",
         "Still-camera shot of a person's face"
     ]
     text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
     text_features = model.get_text_features(**text_inputs).detach()
     positive_feature, negative_features = text_features[0], text_features[1:]
             continue
         scene_prob = 0.0
+        sentiment_distributions = np.zeros(8)  # Assuming there are 8 sentiments
         for frame in frames:
+            image = Image.fromarray(frame[..., ::-1])
+            image_input = processor(images=image, return_tensors="pt").to(device)
+            with torch.no_grad():
+                image_features = model.get_image_features(**image_input).detach()
+                positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
+                negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
+                scene_prob += positive_similarity - negative_similarities
             frame_sentiments = classify_frame(frame)
             sentiment_distributions += np.array(frame_sentiments)
+        sentiment_distributions /= len(frames)  # Normalize to get average probabilities
+        sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
         scene_prob /= len(frames)
         scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
+        print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
+        scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
+    # Sort scenes by probability and select the best scene
+    scene_scores.sort(reverse=True, key=lambda x: x[0])
+    best_scene = max(scene_scores, key=lambda x: x[3])  # Select based on duration among the top scenes
+    if best_scene:
+        print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
+    else:
+        print("No suitable scene found")
+    return best_scene[1:3] if best_scene else None
+def extract_best_scene(video_path, scene):
+    if scene is None:
         return None
+    start_time, end_time = scene
     start_seconds = convert_timestamp_to_seconds(start_time)
     end_seconds = convert_timestamp_to_seconds(end_time)
     video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)