jschwab21 committed on
Commit
bdb3f22
·
verified ·
1 Parent(s): 40ddb04

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +19 -13
video_processing.py CHANGED
@@ -11,6 +11,8 @@ import uuid
11
  from torchvision import models, transforms
12
  from torch.nn import functional as F
13
 
 
 
14
 
15
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
  model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
@@ -87,9 +89,10 @@ def extract_frames(video_path, start_time, end_time):
87
  frames.append(frame)
88
  return frames
89
 
 
 
90
  def analyze_scenes(video_path, scenes, description):
91
  scene_scores = []
92
-
93
  negative_descriptions = [
94
  "black screen",
95
  "Intro text for a video",
@@ -99,7 +102,6 @@ def analyze_scenes(video_path, scenes, description):
99
  "Still-camera shot of a person's face"
100
  ]
101
 
102
- # Tokenize and encode the description text
103
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
104
  text_features = model.get_text_features(**text_inputs).detach()
105
  positive_feature, negative_features = text_features[0], text_features[1:]
@@ -111,6 +113,7 @@ def analyze_scenes(video_path, scenes, description):
111
  continue
112
 
113
  scene_prob = 0.0
 
114
  for frame in frames:
115
  image = Image.fromarray(frame[..., ::-1])
116
  image_input = processor(images=image, return_tensors="pt").to(device)
@@ -119,27 +122,30 @@ def analyze_scenes(video_path, scenes, description):
119
  positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
120
  negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
121
  scene_prob += positive_similarity - negative_similarities
122
- print(classify_frame(frame))
 
 
123
 
 
 
124
  scene_prob /= len(frames)
125
  scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
126
- print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}")
127
 
128
- scene_scores.append((scene_prob, start_time, end_time, scene_duration))
129
 
130
- # Sort scenes by probability in descending order and select the top 5
131
  scene_scores.sort(reverse=True, key=lambda x: x[0])
132
- top_scenes = scene_scores[:5]
133
 
134
- # Find the longest scene among the top 5
135
- longest_scene = max(top_scenes, key=lambda x: x[3])
136
-
137
- if longest_scene:
138
- print(f"Longest Scene: Start={longest_scene[1]}, End={longest_scene[2]}, Probability={longest_scene[0]}, Duration={longest_scene[3]}")
139
  else:
140
  print("No suitable scene found")
141
 
142
- return longest_scene[1:3] if longest_scene else None
 
 
143
 
144
  def extract_best_scene(video_path, scene):
145
  if scene is None:
 
11
  from torchvision import models, transforms
12
  from torch.nn import functional as F
13
 
14
+ categories = ["Joy", "Trust", "Fear", "Surprise", "Sadness", "Disgust", "Anger", "Anticipation"]
15
+
16
 
17
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
  model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
 
89
  frames.append(frame)
90
  return frames
91
 
92
+ import numpy as np
93
+
94
  def analyze_scenes(video_path, scenes, description):
95
  scene_scores = []
 
96
  negative_descriptions = [
97
  "black screen",
98
  "Intro text for a video",
 
102
  "Still-camera shot of a person's face"
103
  ]
104
 
 
105
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
106
  text_features = model.get_text_features(**text_inputs).detach()
107
  positive_feature, negative_features = text_features[0], text_features[1:]
 
113
  continue
114
 
115
  scene_prob = 0.0
116
+ sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
117
  for frame in frames:
118
  image = Image.fromarray(frame[..., ::-1])
119
  image_input = processor(images=image, return_tensors="pt").to(device)
 
122
  positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
123
  negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
124
  scene_prob += positive_similarity - negative_similarities
125
+
126
+ frame_sentiments = classify_frame(frame)
127
+ sentiment_distributions += np.array(frame_sentiments)
128
 
129
+ sentiment_distributions /= len(frames) # Normalize to get average probabilities
130
+ sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
131
  scene_prob /= len(frames)
132
  scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
133
+ print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
134
 
135
+ scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
136
 
137
+ # Sort scenes by probability and select the best scene
138
  scene_scores.sort(reverse=True, key=lambda x: x[0])
139
+ best_scene = max(scene_scores, key=lambda x: x[3]) # Select based on duration among the top scenes
140
 
141
+ if best_scene:
142
+ print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
 
 
 
143
  else:
144
  print("No suitable scene found")
145
 
146
+ return best_scene[1:3] if best_scene else None
147
+
148
+
149
 
150
  def extract_best_scene(video_path, scene):
151
  if scene is None: