Spaces:
Sleeping
Sleeping
Update video_processing.py
Browse files- video_processing.py +10 -23
video_processing.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
import cv2
|
3 |
-
from scenedetect import
|
4 |
from scenedetect.detectors import ContentDetector
|
5 |
from moviepy.editor import VideoFileClip
|
6 |
from transformers import CLIPProcessor, CLIPModel
|
@@ -31,27 +31,20 @@ def sanitize_filename(filename):
|
|
31 |
return "".join([c if c.isalnum() or c in " .-_()" else "_" for c in filename])
|
32 |
|
33 |
def find_scenes(video_path):
|
34 |
-
video_manager =
|
35 |
scene_manager = SceneManager()
|
36 |
-
scene_manager.add_detector(ContentDetector(threshold=
|
37 |
-
|
38 |
-
video_manager.start()
|
39 |
-
scene_manager.detect_scenes(frame_source=video_manager)
|
40 |
scene_list = scene_manager.get_scene_list()
|
41 |
-
|
42 |
-
scenes = [(start.get_timecode(), end.get_timecode()) for start, end in scene_list]
|
43 |
return scenes
|
44 |
|
45 |
def convert_timestamp_to_seconds(timestamp):
|
46 |
-
|
47 |
-
return int(h) * 3600 + int(m) * 60 + s
|
48 |
|
49 |
def extract_frames(video_path, start_time, end_time):
|
50 |
frames = []
|
51 |
-
|
52 |
-
end_seconds = convert_timestamp_to_seconds(end_time)
|
53 |
-
video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)
|
54 |
-
# Extract more frames: every frame in the scene
|
55 |
for frame_time in range(0, int(video_clip.duration * video_clip.fps), int(video_clip.fps / 5)):
|
56 |
frame = video_clip.get_frame(frame_time / video_clip.fps)
|
57 |
frames.append(frame)
|
@@ -69,7 +62,6 @@ def analyze_scenes(video_path, scenes, description):
|
|
69 |
"Still-camera shot of a person's face"
|
70 |
]
|
71 |
|
72 |
-
# Tokenize and encode the description text
|
73 |
text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
|
74 |
text_features = model.get_text_features(**text_inputs).detach()
|
75 |
positive_feature, negative_features = text_features[0], text_features[1:]
|
@@ -91,16 +83,13 @@ def analyze_scenes(video_path, scenes, description):
|
|
91 |
scene_prob += positive_similarity - negative_similarities
|
92 |
|
93 |
scene_prob /= len(frames)
|
94 |
-
scene_duration =
|
95 |
print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}")
|
96 |
|
97 |
scene_scores.append((scene_prob, start_time, end_time, scene_duration))
|
98 |
|
99 |
-
# Sort scenes by probability in descending order and select the top 5
|
100 |
scene_scores.sort(reverse=True, key=lambda x: x[0])
|
101 |
top_scenes = scene_scores[:5]
|
102 |
-
|
103 |
-
# Find the longest scene among the top 5
|
104 |
longest_scene = max(top_scenes, key=lambda x: x[3])
|
105 |
|
106 |
if longest_scene:
|
@@ -115,16 +104,14 @@ def extract_best_scene(video_path, scene):
|
|
115 |
return None
|
116 |
|
117 |
start_time, end_time = scene
|
118 |
-
|
119 |
-
end_seconds = convert_timestamp_to_seconds(end_time)
|
120 |
-
video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)
|
121 |
return video_clip
|
122 |
|
123 |
def process_video(video_input, description, is_url=True):
|
124 |
if is_url:
|
125 |
video_path = download_video(video_input)
|
126 |
else:
|
127 |
-
video_path = video_input
|
128 |
|
129 |
scenes = find_scenes(video_path)
|
130 |
best_scene = analyze_scenes(video_path, scenes, description)
|
|
|
1 |
import os
|
2 |
import cv2
|
3 |
+
from scenedetect import SceneManager, open_video, split_video_ffmpeg
|
4 |
from scenedetect.detectors import ContentDetector
|
5 |
from moviepy.editor import VideoFileClip
|
6 |
from transformers import CLIPProcessor, CLIPModel
|
|
|
31 |
return "".join([c if c.isalnum() or c in " .-_()" else "_" for c in filename])
|
32 |
|
33 |
def find_scenes(video_path):
    """Detect scene cuts in the video at *video_path*.

    Returns:
        A list of ``(start_seconds, end_seconds)`` float pairs, one per
        detected scene, in playback order.
    """
    video = open_video(video_path)
    manager = SceneManager()
    # Content-based cut detection; 30.0 is the detector's conventional threshold.
    manager.add_detector(ContentDetector(threshold=30.0))
    manager.detect_scenes(video)
    return [
        (start.get_seconds(), end.get_seconds())
        for start, end in manager.get_scene_list()
    ]
|
41 |
|
42 |
def convert_timestamp_to_seconds(timestamp):
    """Convert a timestamp to a number of seconds.

    Accepts either a numeric value (int/float, or a plain numeric string,
    interpreted as seconds) or an ``HH:MM:SS[.fff]`` timecode string such
    as PySceneDetect's ``get_timecode()`` output.

    Args:
        timestamp: Seconds as int/float/str, or a colon-separated timecode.

    Returns:
        float: The timestamp expressed in seconds.

    Raises:
        ValueError: If the string is neither numeric nor a valid timecode.
    """
    if isinstance(timestamp, str) and ":" in timestamp:
        # Timecode form: hours and minutes are integers, seconds may be fractional.
        h, m, s = timestamp.split(":")
        return int(h) * 3600 + int(m) * 60 + float(s)
    # Numeric form (the behavior of the previous implementation).
    return float(timestamp)
|
|
|
44 |
|
45 |
def extract_frames(video_path, start_time, end_time):
|
46 |
frames = []
|
47 |
+
video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
|
|
|
|
|
|
|
48 |
for frame_time in range(0, int(video_clip.duration * video_clip.fps), int(video_clip.fps / 5)):
|
49 |
frame = video_clip.get_frame(frame_time / video_clip.fps)
|
50 |
frames.append(frame)
|
|
|
62 |
"Still-camera shot of a person's face"
|
63 |
]
|
64 |
|
|
|
65 |
text_inputs = processor(text=[description] + negative_descriptions, return_tensors="pt", padding=True).to(device)
|
66 |
text_features = model.get_text_features(**text_inputs).detach()
|
67 |
positive_feature, negative_features = text_features[0], text_features[1:]
|
|
|
83 |
scene_prob += positive_similarity - negative_similarities
|
84 |
|
85 |
scene_prob /= len(frames)
|
86 |
+
scene_duration = end_time - start_time
|
87 |
print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Probability={scene_prob}, Duration={scene_duration}")
|
88 |
|
89 |
scene_scores.append((scene_prob, start_time, end_time, scene_duration))
|
90 |
|
|
|
91 |
scene_scores.sort(reverse=True, key=lambda x: x[0])
|
92 |
top_scenes = scene_scores[:5]
|
|
|
|
|
93 |
longest_scene = max(top_scenes, key=lambda x: x[3])
|
94 |
|
95 |
if longest_scene:
|
|
|
104 |
return None
|
105 |
|
106 |
start_time, end_time = scene
|
107 |
+
video_clip = VideoFileClip(video_path).subclip(start_time, end_time)
|
|
|
|
|
108 |
return video_clip
|
109 |
|
110 |
def process_video(video_input, description, is_url=True):
|
111 |
if is_url:
|
112 |
video_path = download_video(video_input)
|
113 |
else:
|
114 |
+
video_path = video_input
|
115 |
|
116 |
scenes = find_scenes(video_path)
|
117 |
best_scene = analyze_scenes(video_path, scenes, description)
|