from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector
from moviepy.editor import VideoFileClip, concatenate_videoclips
from transformers import CLIPProcessor, CLIPModel
import torch
import yt_dlp
import os

def process_video(video_url, description):
    # Download or load the video from the URL
    video_path = download_video(video_url)
    # Segment video into scenes
    scenes = detect_scenes(video_path)
    # Extract frames and analyze with CLIP model
    best_scenes = analyze_scenes(video_path, scenes, description)
    # Combine best scenes into a final clip
    final_clip = combine_scenes(video_path, best_scenes)
    # Ensure the output directory exists
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    final_clip_path = os.path.join(output_dir, "final_clip.mp4")
    # Save and return the final clip
    try:
        final_clip.write_videofile(final_clip_path)
    except Exception as e:
        return str(e)
    return final_clip_path

def detect_scenes(video_path):
    # Run PySceneDetect's content-based detector to find scene boundaries
    video = open_video(video_path)
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector())
    scene_manager.detect_scenes(video)
    scene_list = scene_manager.get_scene_list()
    return scene_list

def analyze_scenes(video_path, scenes, description):
    # Load CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    model.eval()

    # Score the description against a generic contrast prompt. With a single
    # text prompt, the softmax over texts is always 1.0 and the threshold below
    # would never filter anything; the contrast string is an arbitrary choice.
    text_prompts = [description, "unrelated content"]

    best_scenes = []
    for scene in scenes:
        # Extract every 5th frame from the scene
        frames = extract_frames(video_path, scene)
        # Analyze frames with CLIP
        for frame in frames:
            inputs = processor(text=text_prompts, images=frame, return_tensors="pt", padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=1)
            # Keep the scene if any frame matches the description more strongly
            # than the contrast prompt
            if probs[0][0].item() > 0.5:  # Adjust the threshold as needed
                best_scenes.append(scene)
                break
    return best_scenes

def extract_frames(video_path, scene):
    # Sample every 5th frame between the scene's start and end frames
    frames = []
    start_frame, end_frame = scene[0].get_frames(), scene[1].get_frames()
    video_clip = VideoFileClip(video_path)
    for frame_num in range(start_frame, end_frame, 5):
        frame = video_clip.get_frame(frame_num / video_clip.fps)
        frames.append(frame)
    video_clip.close()
    return frames

def combine_scenes(video_path, scenes):
    # Cut each selected scene from the source video and join them in order
    source_clip = VideoFileClip(video_path)
    subclips = [source_clip.subclip(scene[0].get_seconds(), scene[1].get_seconds()) for scene in scenes]
    final_clip = concatenate_videoclips(subclips)
    return final_clip

def download_video(video_url):
    # Download the video (capped at 1440p) with yt-dlp and return the local file path
    ydl_opts = {
        'format': 'bestvideo[height<=1440]+bestaudio/best[height<=1440]',
        'outtmpl': 'downloaded_video.%(ext)s',
        'noplaylist': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=True)
        video_file = ydl.prepare_filename(info_dict)
    return video_file
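
# Example usage (a minimal sketch, not part of the original module): the URL and
# description below are hypothetical placeholders. Running this downloads the video,
# detects scenes, keeps the ones CLIP matches to the description, and writes
# output/final_clip.mp4.
if __name__ == "__main__":
    result = process_video(
        "https://www.youtube.com/watch?v=EXAMPLE_ID",   # hypothetical video URL
        "a person riding a bicycle through a city",     # hypothetical scene description
    )
    print("Result:", result)  # either the saved clip path or an error message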