jschwab21 committed on
Commit
72a3e3b
·
verified ·
1 Parent(s): f8656a1

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +9 -5
video_processing.py CHANGED
@@ -59,6 +59,10 @@ def analyze_scenes(video_path, scenes, description):
59
  highest_prob = 0.0
60
  best_scene = None
61
 
 
 
 
 
62
  for scene_num, (start_time, end_time) in enumerate(scenes):
63
  frames = extract_frames(video_path, start_time, end_time)
64
  if not frames:
@@ -68,12 +72,11 @@ def analyze_scenes(video_path, scenes, description):
68
  scene_prob = 0.0
69
  for frame in frames:
70
  image = Image.fromarray(frame[..., ::-1])
71
- inputs = processor(text=description, images=image, return_tensors="pt", padding=True).to(device)
72
  with torch.no_grad():
73
- outputs = model(**inputs)
74
- logits_per_image = outputs.logits_per_image
75
- probs = logits_per_image.softmax(dim=1)
76
- scene_prob += probs[0][0].item() # Get the probability of the first class
77
 
78
  scene_prob /= len(frames)
79
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")
@@ -112,3 +115,4 @@ def process_video(video_url, description):
112
  final_clip.write_videofile(final_clip_path, codec='libx264', audio_codec='aac')
113
  return final_clip_path
114
  return None
 
 
59
  highest_prob = 0.0
60
  best_scene = None
61
 
62
+ # Tokenize and encode the description text
63
+ text_inputs = processor(text=[description], return_tensors="pt", padding=True).to(device)
64
+ text_features = model.get_text_features(**text_inputs).detach()
65
+
66
  for scene_num, (start_time, end_time) in enumerate(scenes):
67
  frames = extract_frames(video_path, start_time, end_time)
68
  if not frames:
 
72
  scene_prob = 0.0
73
  for frame in frames:
74
  image = Image.fromarray(frame[..., ::-1])
75
+ image_input = processor(images=image, return_tensors="pt").to(device)
76
  with torch.no_grad():
77
+ image_features = model.get_image_features(**image_input).detach()
78
+ logits_per_image = torch.cosine_similarity(image_features, text_features)
79
+ scene_prob += logits_per_image.item()
 
80
 
81
  scene_prob /= len(frames)
82
  print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}")
 
115
  final_clip.write_videofile(final_clip_path, codec='libx264', audio_codec='aac')
116
  return final_clip_path
117
  return None
118
+