jschwab21 committed on
Commit
cf4ffba
·
verified ·
1 Parent(s): cdf47a5

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +13 -5
video_processing.py CHANGED
@@ -33,7 +33,7 @@ def sanitize_filename(filename):
33
  def find_scenes(video_path):
34
  video_manager = VideoManager([video_path])
35
  scene_manager = SceneManager()
36
- scene_manager.add_detector(ContentDetector(threshold=30)) # Adjusted threshold for finer segmentation
37
  video_manager.set_downscale_factor()
38
  video_manager.start()
39
  scene_manager.detect_scenes(frame_source=video_manager)
@@ -58,12 +58,19 @@ def extract_frames(video_path, start_time, end_time):
58
  return frames
59
 
60
  def analyze_scenes(video_path, scenes, description):
61
- highest_prob = 0.0
62
  best_scene = None
63
 
 
 
 
 
 
 
64
  # Tokenize and encode the description text
65
- text_inputs = processor(text=[description], return_tensors="pt", padding=True).to(device)
66
  text_features = model.get_text_features(**text_inputs).detach()
 
67
 
68
  for scene_num, (start_time, end_time) in enumerate(scenes):
69
  frames = extract_frames(video_path, start_time, end_time)
@@ -77,8 +84,9 @@ def analyze_scenes(video_path, scenes, description):
77
  image_input = processor(images=image, return_tensors="pt").to(device)
78
  with torch.no_grad():
79
  image_features = model.get_image_features(**image_input).detach()
80
- logits = torch.cosine_similarity(image_features, text_features).squeeze().item()
81
- scene_prob += logits
 
82
 
83
  scene_prob /= len(frames)
84
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")
 
33
  def find_scenes(video_path):
34
  video_manager = VideoManager([video_path])
35
  scene_manager = SceneManager()
36
+ scene_manager.add_detector(ContentDetector(threshold=20)) # Adjusted threshold for finer segmentation
37
  video_manager.set_downscale_factor()
38
  video_manager.start()
39
  scene_manager.detect_scenes(frame_source=video_manager)
 
58
  return frames
59
 
60
  def analyze_scenes(video_path, scenes, description):
61
+ highest_prob = float('-inf')
62
  best_scene = None
63
 
64
+ negative_descriptions = [
65
+ "black screen",
66
+ "Intro text for a video",
67
+ "dark scene without much contrast"
68
+ ]
69
+
70
  # Tokenize and encode the description text
71
+ text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
72
  text_features = model.get_text_features(**text_inputs).detach()
73
+ positive_feature, negative_features = text_features[0], text_features[1:]
74
 
75
  for scene_num, (start_time, end_time) in enumerate(scenes):
76
  frames = extract_frames(video_path, start_time, end_time)
 
84
  image_input = processor(images=image, return_tensors="pt").to(device)
85
  with torch.no_grad():
86
  image_features = model.get_image_features(**image_input).detach()
87
+ positive_similarity = torch.cosine_similarity(image_features, positive_feature.unsqueeze(0)).squeeze().item()
88
+ negative_similarities = torch.cosine_similarity(image_features, negative_features).squeeze().mean().item()
89
+ scene_prob += positive_similarity - negative_similarities
90
 
91
  scene_prob /= len(frames)
92
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")