Spaces:
Sleeping
Sleeping
import cv2 | |
from scenedetect import open_video, SceneManager | |
from scenedetect.detectors import ContentDetector | |
from moviepy.editor import VideoFileClip, concatenate_videoclips | |
from transformers import CLIPProcessor, CLIPModel | |
import torch | |
import yt_dlp | |
import os | |
def process_video(video_url, description): | |
# Download or load the video from the URL | |
video_path = download_video(video_url) | |
# Segment video into scenes | |
scenes = detect_scenes(video_path) | |
# Extract frames and analyze with CLIP model | |
best_scene = analyze_scenes(video_path, scenes, description) | |
# Extract the best scene into a final clip | |
final_clip = extract_best_scene(video_path, best_scene) | |
# Ensure the output directory exists | |
output_dir = "output" | |
os.makedirs(output_dir, exist_ok=True) | |
final_clip_path = os.path.join(output_dir, "final_clip.mp4") | |
# Save and return the final clip | |
try: | |
final_clip.write_videofile(final_clip_path) | |
except Exception as e: | |
return str(e) | |
return final_clip_path | |
def detect_scenes(video_path): | |
video = open_video(video_path) | |
scene_manager = SceneManager() | |
scene_manager.add_detector(ContentDetector()) | |
scene_manager.detect_scenes(video) | |
scene_list = scene_manager.get_scene_list() | |
return scene_list | |
def analyze_scenes(video_path, scenes, description): | |
# Load CLIP model and processor | |
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") | |
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") | |
best_scene = None | |
highest_prob = 0.0 | |
for scene in scenes: | |
# Extract every 5th frame from the scene | |
frames = extract_frames(video_path, scene) | |
# Analyze frames with CLIP | |
for frame in frames: | |
inputs = processor(text=description, images=frame, return_tensors="pt", padding=True) | |
outputs = model(**inputs) | |
logits_per_image = outputs.logits_per_image | |
probs = logits_per_image.softmax(dim=1) | |
max_prob = max(probs[0]).item() | |
if max_prob > highest_prob: | |
highest_prob = max_prob | |
best_scene = scene | |
return best_scene | |
def extract_frames(video_path, scene): | |
frames = [] | |
start_frame, end_frame = scene[0].get_frames(), scene[1].get_frames() | |
video_clip = VideoFileClip(video_path) | |
for frame_num in range(start_frame, end_frame, 5): | |
frame = video_clip.get_frame(frame_num / video_clip.fps) | |
frames.append(frame) | |
return frames | |
def extract_best_scene(video_path, scene): | |
start_time = scene[0].get_seconds() | |
end_time = scene[1].get_seconds() | |
video_clip = VideoFileClip(video_path).subclip(start_time, end_time) | |
return video_clip | |
def download_video(video_url): | |
ydl_opts = { | |
'format': 'bestvideo[height<=1440]+bestaudio/best[height<=1440]', | |
'outtmpl': 'downloaded_video.%(ext)s', | |
'noplaylist': True, | |
} | |
with yt_dlp.YoutubeDL(ydl_opts) as ydl: | |
info_dict = ydl.extract_info(video_url, download=True) | |
video_file = ydl.prepare_filename(info_dict) | |
return video_file | |