jschwab21 committed on
Commit
c25bcaf
·
verified ·
1 Parent(s): 79e5fc0

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +9 -4
video_processing.py CHANGED
@@ -104,6 +104,12 @@ def analyze_scenes(video_path, scenes, description):
104
  #"Still-camera shot of a person's face"
105
  ]
106
 
 
 
 
 
 
 
107
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
108
  text_features = model.get_text_features(**text_inputs).detach()
109
  positive_feature, negative_features = text_features[0], text_features[1:]
@@ -119,10 +125,10 @@ def analyze_scenes(video_path, scenes, description):
119
  scene_prob = 0.0
120
  sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
121
  for frame in frames:
122
- image = Image.fromarray(frame[..., ::-1])
123
- image_input = processor(images=image, return_tensors="pt").to(device)
124
  with torch.no_grad():
125
- image_features = model.get_image_features(**image_input).detach()
126
  positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
127
  negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
128
  scene_prob += positive_similarity - negative_similarities
@@ -145,7 +151,6 @@ def analyze_scenes(video_path, scenes, description):
145
  top_3_scenes = scene_scores[:3] # Get the top 3 scenes
146
  best_scene = max(top_3_scenes, key=lambda x: x[3]) # Find the longest scene from these top 3
147
 
148
-
149
  if best_scene:
150
  print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
151
  return (best_scene[1], best_scene[2]), best_scene[4] # Returning a tuple with scene times and sentiments
 
104
  #"Still-camera shot of a person's face"
105
  ]
106
 
107
+ preprocess = transforms.Compose([
108
+ transforms.ToTensor(), # Directly convert numpy arrays to tensors
109
+ transforms.Resize((224, 224)), # Resize the tensor
110
+ transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize the tensor
111
+ ])
112
+
113
  text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
114
  text_features = model.get_text_features(**text_inputs).detach()
115
  positive_feature, negative_features = text_features[0], text_features[1:]
 
125
  scene_prob = 0.0
126
  sentiment_distributions = np.zeros(8) # Assuming there are 8 sentiments
127
  for frame in frames:
128
+ # Directly preprocess the frame
129
+ frame_tensor = preprocess(frame).unsqueeze(0).to(device) # Add batch dimension and send to device
130
  with torch.no_grad():
131
+ image_features = model.get_image_features(pixel_values=frame_tensor).detach()
132
  positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
133
  negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
134
  scene_prob += positive_similarity - negative_similarities
 
151
  top_3_scenes = scene_scores[:3] # Get the top 3 scenes
152
  best_scene = max(top_3_scenes, key=lambda x: x[3]) # Find the longest scene from these top 3
153
 
 
154
  if best_scene:
155
  print(f"Best Scene: Start={best_scene[1]}, End={best_scene[2]}, Probability={best_scene[0]}, Duration={best_scene[3]}, Sentiments: {best_scene[4]}")
156
  return (best_scene[1], best_scene[2]), best_scene[4] # Returning a tuple with scene times and sentiments