import os

import torch
import yt_dlp
from moviepy.editor import VideoFileClip
from PIL import Image
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from transformers import CLIPProcessor, CLIPModel

# Load CLIP once at import time; inference runs on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def sanitize_filename(filename):
    """Replace characters that are unsafe in filenames with underscores."""
    return "".join(c if c.isalnum() or c in " .-_()" else "_" for c in filename)


def download_video(url):
    """Download the video (capped at 1440p) with yt-dlp and return a safe local path."""
    ydl_opts = {
        'format': 'bestvideo[height<=1440]+bestaudio/best[height<=1440]',
        'outtmpl': 'downloaded_video.%(ext)s',
        'merge_output_format': 'mp4',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(url, download=True)
        video_filename = ydl.prepare_filename(result)
    safe_filename = sanitize_filename(video_filename)
    if os.path.exists(video_filename) and video_filename != safe_filename:
        os.rename(video_filename, safe_filename)
    return safe_filename


def find_scenes(video_path):
    """Run PySceneDetect's content detector and return (start, end) timecode pairs."""
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector(threshold=30))
    video_manager.set_downscale_factor()  # downscale automatically for faster detection
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list()
    video_manager.release()
    return [(start.get_timecode(), end.get_timecode()) for start, end in scene_list]


def convert_timestamp_to_seconds(timestamp):
    """Convert an 'HH:MM:SS.mmm' timecode string to seconds."""
    h, m, s = map(float, timestamp.split(':'))
    return int(h) * 3600 + int(m) * 60 + s


def extract_frames(video_path, start_time, end_time):
    """Sample roughly two frames per second from the scene as RGB arrays."""
    frames = []
    start_seconds = convert_timestamp_to_seconds(start_time)
    end_seconds = convert_timestamp_to_seconds(end_time)
    video_clip = VideoFileClip(video_path).subclip(start_seconds, end_seconds)
    # Step through the clip in increments of half a second's worth of frames;
    # max(1, ...) guards against a zero step for very low frame rates.
    step = max(1, int(video_clip.fps / 2))
    for frame_index in range(0, int(video_clip.duration * video_clip.fps), step):
        frames.append(video_clip.get_frame(frame_index / video_clip.fps))
    video_clip.close()
    return frames


def analyze_scenes(video_path, scenes, description):
    """Score each scene against the description by mean CLIP cosine similarity."""
    best_score = float('-inf')  # cosine similarity can be negative
    best_scene = None

    # Tokenize and encode the description text once, outside the scene loop.
    text_inputs = processor(text=[description], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)

    for scene_num, (start_time, end_time) in enumerate(scenes):
        frames = extract_frames(video_path, start_time, end_time)
        if not frames:
            print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time} - No frames extracted")
            continue

        scene_score = 0.0
        for frame in frames:
            # MoviePy returns RGB frames, so no channel reordering is needed for PIL.
            image = Image.fromarray(frame)
            image_input = processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                image_features = model.get_image_features(**image_input)
            similarity = torch.cosine_similarity(image_features, text_features)
            scene_score += similarity.item()
        scene_score /= len(frames)

        print(f"Scene {scene_num + 1}: Start={start_time}, End={end_time}, Similarity={scene_score}")
        if scene_score > best_score:
            best_score = scene_score
            best_scene = (start_time, end_time)

    if best_scene:
        print(f"Best Scene: Start={best_scene[0]}, End={best_scene[1]}, Similarity={best_score}")
    else:
        print("No suitable scene found")
    return best_scene


def extract_best_scene(video_path, scene):
    """Return a subclip covering the best-matching scene, or None."""
    if scene is None:
        return None
    start_time, end_time = scene
    start_seconds = convert_timestamp_to_seconds(start_time)
    end_seconds = convert_timestamp_to_seconds(end_time)
    return VideoFileClip(video_path).subclip(start_seconds, end_seconds)


def process_video(video_url, description):
    """Download a video, find the scene best matching the description, and save it."""
    video_path = download_video(video_url)
    scenes = find_scenes(video_path)
    best_scene = analyze_scenes(video_path, scenes, description)
    final_clip = extract_best_scene(video_path, best_scene)
    if final_clip:
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)
        final_clip_path = os.path.join(output_dir, "final_clip.mp4")
        final_clip.write_videofile(final_clip_path, codec='libx264', audio_codec='aac')
        return final_clip_path
    return None
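
# Minimal usage sketch for the pipeline above. The URL and description below are
# hypothetical placeholders, not part of the original script; substitute a real
# video URL and a text query describing the moment you want to extract.
if __name__ == "__main__":
    url = "https://www.youtube.com/watch?v=example"  # hypothetical URL
    description = "a dog catching a frisbee on a beach"  # hypothetical query text
    clip_path = process_video(url, description)
    if clip_path:
        print(f"Saved best-matching scene to {clip_path}")
    else:
        print("No matching scene could be extracted")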