jschwab21 committed on
Commit
614e074
·
verified ·
1 Parent(s): cf9a3ea

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +4 -3
video_processing.py CHANGED
@@ -61,7 +61,7 @@ def analyze_scenes(video_path, scenes, description):
61
  best_scene = None
62
 
63
  # Tokenize and encode the description text
64
- text_inputs = processor(text=[description], return_tensors="pt", padding=True).to(device)
65
  text_features = model.get_text_features(**text_inputs).detach()
66
 
67
  for scene_num, (start_time, end_time) in enumerate(scenes):
@@ -76,8 +76,9 @@ def analyze_scenes(video_path, scenes, description):
76
  image_input = processor(images=image, return_tensors="pt").to(device)
77
  with torch.no_grad():
78
  image_features = model.get_image_features(**image_input).detach()
79
- logits_per_image = torch.cosine_similarity(image_features, text_features)
80
- scene_prob += logits_per_image.item()
 
81
 
82
  scene_prob /= len(frames)
83
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")
 
61
  best_scene = None
62
 
63
  # Tokenize and encode the description text
64
+ text_inputs = processor(text=description, return_tensors="pt").to(device)
65
  text_features = model.get_text_features(**text_inputs).detach()
66
 
67
  for scene_num, (start_time, end_time) in enumerate(scenes):
 
76
  image_input = processor(images=image, return_tensors="pt").to(device)
77
  with torch.no_grad():
78
  image_features = model.get_image_features(**image_input).detach()
79
+ logits = (image_features @ text_features.T).squeeze()
80
+ probs = logits.softmax(dim=0)
81
+ scene_prob += max(probs).item()
82
 
83
  scene_prob /= len(frames)
84
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")