jschwab21 committed on
Commit
c751763
·
verified ·
1 Parent(s): 1876e9f

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +13 -18
video_processing.py CHANGED
@@ -104,15 +104,15 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
104
  #"Still-camera shot of a person's face"
105
  ]
106
  preprocess = transforms.Compose([
107
- transforms.ToTensor(), # Convert numpy arrays directly to tensors
108
- transforms.Resize((224, 224)), # Resize the tensor to fit model input
109
- transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize the tensor
110
  ])
111
 
112
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
113
  text_features = model.get_text_features(**text_inputs).detach()
114
  positive_feature, negative_features = text_features[0], text_features[1:]
115
- print("Negative features shape:", negative_features)
116
  video = VideoFileClip(video_path)
117
 
118
  for scene_num, (start_time, end_time) in enumerate(scenes):
@@ -121,10 +121,9 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
121
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time} - No frames extracted")
122
  continue
123
 
124
- # Create batches of frames for processing
125
  batches = [frames[i:i + batch_size] for i in range(0, len(frames), batch_size)]
126
  scene_prob = 0.0
127
- sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
128
 
129
  for batch in batches:
130
  batch_tensors = torch.stack([preprocess(frame) for frame in batch]).to(device)
@@ -132,33 +131,29 @@ def analyze_scenes(video_path, scenes, description, batch_size=4):
132
  image_features = model.get_image_features(pixel_values=batch_tensors).detach()
133
  print("Image Features Shape:", image_features.shape)
134
 
135
- positive_similarities = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0))
136
- negative_similarities = torch.cosine_similarity(image_features, negative_features.unsqueeze(0).mean(dim=0, keepdim=True))
137
- scene_prob += positive_similarities.mean().item() - negative_similarities.mean().item()
 
138
 
139
- # Sum up the sentiments for all frames in the batch
140
  for frame in batch:
141
  frame_sentiments = classify_frame(frame)
142
  sentiment_distributions += np.array(frame_sentiments)
143
 
144
- sentiment_distributions /= len(frames) # Normalize to get average probabilities
145
  sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
146
  scene_prob /= len(frames)
147
  scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
148
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
149
 
150
  scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
151
-
152
- # Sort scenes by confidence, highest first
153
  scene_scores.sort(reverse=True, key=lambda x: x[0])
154
-
155
- # Select the longest scene from the top 3 highest confidence scenes
156
- top_3_scenes = scene_scores[:3] # Get the top 3 scenes
157
- best_scene = max(top_3_scenes, key=lambda x: x[3]) # Find the longest scene from these top 3
158
 
159
  if best_scene:
160
  print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
161
- return (best_scene[1], best_scene[2]), best_scene[4] # Returning a tuple with scene times and sentiments
162
  else:
163
  print("No suitable scene found")
164
  return None, {}
 
104
  #"Still-camera shot of a person's face"
105
  ]
106
  preprocess = transforms.Compose([
107
+ transforms.ToTensor(),
108
+ transforms.Resize((224, 224)),
109
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
110
  ])
111
 
112
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
113
  text_features = model.get_text_features(**text_inputs).detach()
114
  positive_feature, negative_features = text_features[0], text_features[1:]
115
+ print("Negative features shape:", negative_features.shape)
116
  video = VideoFileClip(video_path)
117
 
118
  for scene_num, (start_time, end_time) in enumerate(scenes):
 
121
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time} - No frames extracted")
122
  continue
123
 
 
124
  batches = [frames[i:i + batch_size] for i in range(0, len(frames), batch_size)]
125
  scene_prob = 0.0
126
+ sentiment_distributions = np.zeros(8)
127
 
128
  for batch in batches:
129
  batch_tensors = torch.stack([preprocess(frame) for frame in batch]).to(device)
 
131
  image_features = model.get_image_features(pixel_values=batch_tensors).detach()
132
  print("Image Features Shape:", image_features.shape)
133
 
134
+ positive_similarities = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0).expand_as(image_features))
135
+ negative_mean = negative_features.mean(dim=0).unsqueeze(0).expand_as(image_features)
136
+ negative_similarities = torch.cosine_similarity(image_features, negative_mean)
137
+ scene_prob += (positive_similarities.mean().item() - negative_similarities.mean().item())
138
 
 
139
  for frame in batch:
140
  frame_sentiments = classify_frame(frame)
141
  sentiment_distributions += np.array(frame_sentiments)
142
 
143
+ sentiment_distributions /= len(frames)
144
  sentiment_percentages = {category: round(prob * 100, 2) for category, prob in zip(categories, sentiment_distributions)}
145
  scene_prob /= len(frames)
146
  scene_duration = convert_timestamp_to_seconds(end_time) - convert_timestamp_to_seconds(start_time)
147
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}, Sentiments: {sentiment_percentages}")
148
 
149
  scene_scores.append((scene_prob, start_time, end_time, scene_duration, sentiment_percentages))
 
 
150
  scene_scores.sort(reverse=True, key=lambda x: x[0])
151
+ top_3_scenes = scene_scores[:3]
152
+ best_scene = max(top_3_scenes, key=lambda x: x[3])
 
 
153
 
154
  if best_scene:
155
  print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
156
+ return (best_scene[1], best_scene[2]), best_scene[4]
157
  else:
158
  print("No suitable scene found")
159
  return None, {}