"""Build a highlight clip: select video scenes matching a text description.

Pipeline: download the video, split it into scenes with PySceneDetect,
score sampled frames against the description with CLIP, then concatenate
the matching scenes into a single output clip.
"""

import os

import cv2  # noqa: F401 — kept for downstream/interactive frame processing
import torch
import yt_dlp
from moviepy.editor import VideoFileClip, concatenate_videoclips
from scenedetect import SceneManager, VideoManager
from scenedetect.detectors import ContentDetector
from transformers import CLIPModel, CLIPProcessor

# CLIP's learned logit_scale multiplies image-text cosine similarity by
# ~100, so raw logits for a single caption typically land in the 15-35
# range.  NOTE(review): softmax over ONE caption is always 1.0 and cannot
# rank relevance, so we threshold the raw logit instead.
# TODO(review): tune this cutoff on real data.
CLIP_LOGIT_THRESHOLD = 25.0

# Analyze every 5th frame of each scene (was hard-coded inline).
FRAME_SAMPLE_STEP = 5


def process_video(video_url, description):
    """Download *video_url* and return the path of a clip assembled from
    the scenes that best match *description*.

    Args:
        video_url: URL understood by yt-dlp.
        description: Text prompt to score frames against with CLIP.

    Returns:
        Path of the written output video file.
    """
    video_path = download_video(video_url)
    scenes = detect_scenes(video_path)
    best_scenes = analyze_scenes(scenes, description, video_path)
    final_clip = combine_scenes(best_scenes, video_path)
    # write_videofile fails if the target directory does not exist.
    os.makedirs("output", exist_ok=True)
    final_clip_path = "output/final_clip.mp4"
    final_clip.write_videofile(final_clip_path)
    final_clip.close()
    return final_clip_path


def detect_scenes(video_path):
    """Split the video into scenes.

    Returns:
        List of ``(start, end)`` FrameTimecode pairs from PySceneDetect.
    """
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector())
    try:
        video_manager.start()
        scene_manager.detect_scenes(frame_source=video_manager)
        return scene_manager.get_scene_list()
    finally:
        # Always release the decoder, even if detection raises.
        video_manager.release()


def analyze_scenes(scenes, description, video_path=None):
    """Return the subset of *scenes* whose frames match *description*.

    A scene qualifies as soon as any sampled frame's CLIP image-text
    similarity logit exceeds ``CLIP_LOGIT_THRESHOLD``.

    Args:
        scenes: Scene list from :func:`detect_scenes`.
        description: Text prompt to score frames against.
        video_path: Path of the source video the scenes came from.
    """
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    model.eval()

    best_scenes = []
    for scene in scenes:
        frames = extract_frames(scene, video_path)
        for frame in frames:
            inputs = processor(
                text=description,
                images=frame,
                return_tensors="pt",
                padding=True,
            )
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                outputs = model(**inputs)
            # Raw similarity logit, not softmax: softmax over a single
            # caption is identically 1.0 and cannot discriminate.
            score = outputs.logits_per_image.max().item()
            if score > CLIP_LOGIT_THRESHOLD:
                best_scenes.append(scene)
                break  # one matching frame is enough for this scene
    return best_scenes


def extract_frames(scene, video_path=None):
    """Sample every ``FRAME_SAMPLE_STEP``-th frame of *scene*.

    Args:
        scene: ``(start, end)`` FrameTimecode pair.
        video_path: Path of the source video.  FrameTimecode objects do
            not carry the filename, so it must be supplied; the legacy
            ``scene[0].get_filename()`` fallback is kept only for callers
            that somehow provided such objects.

    Returns:
        List of RGB frame arrays (as produced by moviepy's ``get_frame``).
    """
    if video_path is None:
        video_path = scene[0].get_filename()  # legacy fallback
    start_frame, end_frame = scene[0].get_frames(), scene[1].get_frames()
    frames = []
    clip = VideoFileClip(video_path)
    try:
        for frame_num in range(start_frame, end_frame, FRAME_SAMPLE_STEP):
            frames.append(clip.get_frame(frame_num / clip.fps))
    finally:
        clip.close()  # release the underlying ffmpeg reader
    return frames


def combine_scenes(scenes, video_path=None):
    """Concatenate the given scenes of *video_path* into one moviepy clip.

    Args:
        scenes: ``(start, end)`` FrameTimecode pairs to keep, in order.
        video_path: Path of the source video (see :func:`extract_frames`).

    Returns:
        A moviepy clip; the caller is responsible for writing/closing it.
    """
    if video_path is None:
        video_path = scenes[0][0].get_filename()  # legacy fallback
    # Open the source once; subclips share its reader, so it must stay
    # open until the concatenated clip has been written.
    source = VideoFileClip(video_path)
    subclips = [
        source.subclip(start.get_seconds(), end.get_seconds())
        for start, end in scenes
    ]
    return concatenate_videoclips(subclips)


def download_video(video_url):
    """Download *video_url* (capped at 1440p) and return the local path."""
    ydl_opts = {
        'format': 'bestvideo[height<=1440]+bestaudio/best[height<=1440]',
        'outtmpl': 'downloaded_video.%(ext)s',
        'noplaylist': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=True)
        return ydl.prepare_filename(info_dict)