jschwab21 committed on
Commit
c31ee40
·
verified ·
1 Parent(s): 33428af

Update video_processing.py

Browse files
Files changed (1) hide show
  1. video_processing.py +45 -93
video_processing.py CHANGED
@@ -37,109 +37,61 @@ def find_scenes(video_path):
37
  scene_manager.detect_scenes(frame_source=video_manager)
38
  scene_list = scene_manager.get_scene_list()
39
  video_manager.release()
40
- return scene_list
41
-
42
- def extract_frames(video_path, scene_list):
43
- scene_frames = {}
44
- cap = cv2.VideoCapture(video_path)
45
- for i, (start_time, end_time) in enumerate(scene_list):
46
- frames = []
47
- first_frame = None
48
- start_frame = start_time.get_frames()
49
- end_frame = end_time.get_frames()
50
- cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
51
- while cap.get(cv2.CAP_PROP_POS_FRAMES) < end_frame:
52
- ret, frame = cap.read()
53
- if ret:
54
- if first_frame is None:
55
- first_frame = frame
56
- if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % 5 == 0:
57
- frames.append(frame)
58
- scene_frames[i] = (start_time, end_time, frames, first_frame)
59
- cap.release()
60
- return scene_frames
61
 
62
  def convert_timestamp_to_seconds(timestamp):
63
  h, m, s = map(float, timestamp.split(':'))
64
  return int(h) * 3600 + int(m) * 60 + s
65
 
66
- def classify_and_categorize_scenes(scene_frames, description_phrases):
67
- scene_categories = {}
68
- description_texts = description_phrases
69
-
70
- action_indices = [0]
71
- context_indices = list(set(range(len(description_texts))) - set(action_indices))
 
 
 
72
 
73
- for scene_id, (start_time, end_time, frames, first_frame) in scene_frames.items():
74
- scene_scores = [0] * len(description_texts)
75
- valid_frames = 0
76
 
 
 
77
  for frame in frames:
78
- image = Image.fromarray(frame[..., ::-1])
79
- image_input = processor(images=image, return_tensors="pt").to(device)
80
- with torch.no_grad():
81
- text_inputs = processor(text=description_texts, return_tensors="pt", padding=True).to(device)
82
- text_features = model.get_text_features(**text_inputs)
83
- image_features = model.get_image_features(**image_input)
84
- logits = (image_features @ text_features.T).squeeze()
85
- probs = logits.softmax(dim=0)
86
- scene_scores = [sum(x) for x in zip(scene_scores, probs.tolist())]
87
- valid_frames += 1
88
-
89
- if valid_frames > 0:
90
- scene_scores = [score / valid_frames for score in scene_scores]
91
- action_confidence = sum(scene_scores[i] for i in action_indices) / len(action_indices)
92
- context_confidence = sum(scene_scores[i] for i in context_indices) / len(context_indices)
93
-
94
- best_description_index = scene_scores.index(max(scene_scores))
95
- best_description = description_texts[best_description_index]
96
-
97
- if action_confidence > context_confidence:
98
- category = "Action Scene"
99
- confidence = action_confidence
100
- else:
101
- category = "Context Scene"
102
- confidence = context_confidence
103
-
104
- duration = end_time.get_seconds() - start_time.get_seconds()
105
- scene_categories[scene_id] = {
106
- "category": category,
107
- "confidence": confidence,
108
- "start_time": str(start_time),
109
- "end_time": str(end_time),
110
- "duration": duration,
111
- "first_frame": first_frame,
112
- "best_description": best_description
113
- }
114
-
115
- return scene_categories
116
-
117
- def save_clip(video_path, scene_info, output_directory, scene_id):
118
- output_filename = f"scene_{scene_id+1}_{scene_info['category'].replace(' ', '_')}.mp4"
119
- output_filepath = os.path.join(output_directory, output_filename)
120
-
121
- start_seconds = convert_timestamp_to_seconds(scene_info['start_time'])
122
- end_seconds = convert_timestamp_to_seconds(scene_info['end_time'])
123
-
124
  video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)
125
-
126
- video_clip.write_videofile(output_filepath, codec='libx264', audio_codec='aac')
127
- video_clip.close()
128
-
129
- return output_filepath, scene_info['first_frame']
130
 
131
  def process_video(video_url, description):
132
- output_directory = "output"
133
- os.makedirs(output_directory, exist_ok=True)
134
-
135
  video_path = download_video(video_url)
136
  scenes = find_scenes(video_path)
137
- scene_frames = extract_frames(video_path, scenes)
138
- description_phrases = [description] # Modify if multiple descriptions are needed
139
- scene_categories = classify_and_categorize_scenes(scene_frames, description_phrases)
140
-
141
- best_scene = max(scene_categories.items(), key=lambda x: x[1]['confidence'])[1]
142
- clip_path, first_frame = save_clip(video_path, best_scene, output_directory, 0)
143
-
144
- return clip_path
145
-
 
 
37
  scene_manager.detect_scenes(frame_source=video_manager)
38
  scene_list = scene_manager.get_scene_list()
39
  video_manager.release()
40
+ scenes = [(start.get_timecode(), end.get_timecode()) for start, end in scene_list]
41
+ return scenes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
def convert_timestamp_to_seconds(timestamp):
    """Convert a colon-separated timestamp string to seconds.

    Accepts ``HH:MM:SS[.fff]`` (the format this module has always handled)
    and, additionally, the shorter ``MM:SS[.fff]`` and ``SS[.fff]`` forms.

    Args:
        timestamp: Timestamp string whose fields are separated by ``:``.

    Returns:
        The timestamp in seconds as a float. Hours and minutes are
        truncated to whole numbers; the seconds field keeps its fraction.

    Raises:
        ValueError: If the string has more than three fields or a field is
            not numeric.
    """
    fields = timestamp.split(':')
    if len(fields) > 3:
        raise ValueError(
            f"Invalid timestamp {timestamp!r}: expected [[HH:]MM:]SS"
        )
    *leading, seconds = map(float, fields)
    # Left-pad the missing hour/minute fields so "MM:SS" and "SS" also work.
    hours, minutes = [0.0] * (2 - len(leading)) + leading
    return int(hours) * 3600 + int(minutes) * 60 + seconds
46
 
47
def extract_frames(video_path, start_time, end_time, interval=5):
    """Sample frames from one scene of a video at a fixed time step.

    Args:
        video_path: Path of the video file, opened with ``VideoFileClip``.
        start_time: Scene start, as a timestamp string accepted by
            ``convert_timestamp_to_seconds``.
        end_time: Scene end, in the same format.
        interval: Seconds between sampled frames. Defaults to 5, the step
            this function previously hard-coded.

    Returns:
        A list with one ``get_frame`` result per sampled offset
        (0, interval, 2 * interval, ... while below the scene duration).

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError("interval must be positive")

    start_seconds = convert_timestamp_to_seconds(start_time)
    end_seconds = convert_timestamp_to_seconds(end_time)

    source_clip = VideoFileClip(video_path)
    try:
        scene_clip = source_clip.subclip(start_seconds, end_seconds)
        duration = scene_clip.duration

        frames = []
        offset = 0
        # Compare against the real (fractional) duration rather than
        # int(duration): scenes shorter than one second previously produced
        # no frames at all and were therefore never analyzed.
        while offset < duration:
            frames.append(scene_clip.get_frame(offset))
            offset += interval
        return frames
    finally:
        # Release the underlying reader; one clip is opened per scene, so
        # leaving them open accumulates readers over a long video.
        source_clip.close()
56
 
57
def analyze_scenes(video_path, scenes, description):
    """Pick the scene whose sampled frames best match ``description``.

    Every frame returned by ``extract_frames`` is scored against the
    description with the module-level ``processor`` and ``model``, and the
    scene containing the single highest-scoring frame wins.

    Args:
        video_path: Path of the video file the scenes belong to.
        scenes: Iterable of ``(start_time, end_time)`` timestamp pairs.
        description: Text (or texts) passed to the processor as ``text``.

    Returns:
        The ``(start_time, end_time)`` pair of the best scene, or ``None``
        when no frame could be scored (no scenes, or no frames sampled).
    """
    # Scores are raw similarity logits, which may be negative, so start
    # below any possible value instead of at 0.0.
    best_score = float("-inf")
    best_scene = None

    for start_time, end_time in scenes:
        frames = extract_frames(video_path, start_time, end_time)
        for frame in frames:
            # NOTE(review): `device` and `torch` are the module-level names
            # the previous implementation of this file used; confirm they
            # are still defined at the top of the module.
            inputs = processor(
                text=description,
                images=frame,
                return_tensors="pt",
                padding=True,
            ).to(device)
            # Inference only: skip gradient bookkeeping.
            with torch.no_grad():
                outputs = model(**inputs)
            # Rank by the raw image-text logit. A softmax across texts is
            # always exactly 1.0 when there is a single description, which
            # made every frame tie and the first scene always win.
            score = outputs.logits_per_image.max().item()
            if score > best_score:
                best_score = score
                best_scene = (start_time, end_time)

    return best_scene
74
+
75
def extract_best_scene(video_path, scene):
    """Return the portion of the video covered by ``scene`` as a clip.

    Args:
        video_path: Path of the video file to open with ``VideoFileClip``.
        scene: ``(start_time, end_time)`` timestamp pair, or ``None``.

    Returns:
        The sub-clip between the two timestamps, or ``None`` when ``scene``
        is ``None``. The clip is returned open; the caller owns it.
    """
    if scene is None:
        return None

    start_time, end_time = scene
    # Resolve both bounds before opening the file, as the original did.
    bounds = [
        convert_timestamp_to_seconds(stamp)
        for stamp in (start_time, end_time)
    ]
    return VideoFileClip(video_path).subclip(*bounds)
 
 
 
 
84
 
85
def process_video(video_url, description):
    """Download a video and export the scene that best matches a description.

    Runs the pipeline ``download_video`` -> ``find_scenes`` ->
    ``analyze_scenes`` -> ``extract_best_scene`` and writes the selected
    clip to ``output/final_clip.mp4``.

    Args:
        video_url: URL handed to ``download_video``.
        description: Text used by ``analyze_scenes`` to score the scenes.

    Returns:
        The path of the written clip, or ``None`` when no scene was selected.
    """
    video_path = download_video(video_url)
    scenes = find_scenes(video_path)
    best_scene = analyze_scenes(video_path, scenes, description)
    final_clip = extract_best_scene(video_path, best_scene)

    # Explicit None check: do not rely on the truthiness of a clip object.
    if final_clip is None:
        return None

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    final_clip_path = os.path.join(output_dir, "final_clip.mp4")
    try:
        final_clip.write_videofile(final_clip_path, codec='libx264', audio_codec='aac')
    finally:
        # Always release the clip's readers, even if encoding fails.
        final_clip.close()
    return final_clip_path