import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class VideoRetrieval:
    def __init__(self):
        # Text encoder used to embed natural-language queries
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.load_data()

    def load_data(self):
        # Load pre-computed features
        # In practice, these would be loaded from your actual feature files
        self.features = {
            'visual_features': np.load('path_to_visual_features.npy'),
            'scene_features': np.load('path_to_scene_features.npy'),
            'object_features': np.load('path_to_object_features.npy')
        }
        # Load clip metadata
        self.clips_df = pd.read_csv('clips_metadata.csv')

    def encode_query(self, query_text):
        """Encode the text query into an embedding vector."""
        return self.text_model.encode(query_text)

    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute cosine similarity between the query and one feature matrix."""
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]

    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve the top-k most relevant clips for a text query."""
        # Encode query
        query_embedding = self.encode_query(query_text)

        # Compute similarities for each feature type, scaled by its weight
        similarities = {}
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3
        }
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight

        # Combine the weighted similarities into a single score per clip
        combined_similarities = sum(similarities.values())

        # Get top-k indices, highest score first
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]

        # Return clip information
        results = []
        for idx in top_indices:
            results.append({
                'clip_id': self.clips_df.iloc[idx]['clip_id'],
                'movie_title': self.clips_df.iloc[idx]['movie_title'],
                'description': self.clips_df.iloc[idx]['description'],
                'timestamp': self.clips_df.iloc[idx]['timestamp'],
                'similarity_score': combined_similarities[idx]
            })

        return results


# Streamlit UI
def main():
    st.title("Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.
    """)

    # Initialize the retrieval system once and cache it in session state
    if 'retrieval_system' not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval()
    retrieval_system = st.session_state.retrieval_system

    # Search interface
    query = st.text_input("Enter your scene description:",
                          "A dramatic confrontation between two characters in a dark room")

    num_results = st.slider("Number of results to show:",
                            min_value=1, max_value=5, value=3)

    if st.button("Search"):
        with st.spinner("Searching for relevant clips..."):
            results = retrieval_system.retrieve_clips(query, top_k=num_results)

            for i, result in enumerate(results, 1):
                st.subheader(f"Result {i}: {result['movie_title']}")

                col1, col2 = st.columns([2, 1])

                with col1:
                    st.write("**Scene Description:**")
                    st.write(result['description'])
                    st.write(f"**Timestamp:** {result['timestamp']}")

                with col2:
                    st.write("**Similarity Score:**")
                    # Cosine similarity can be negative; clamp to [0, 1] for st.progress
                    st.progress(min(max(float(result['similarity_score']), 0.0), 1.0))

                # In practice, you would have a way to play the video clip here
                st.write("---")

    # Additional features
    with st.sidebar:
        st.header("About")
        st.write("""
        This system uses pre-computed visual features from several expert models
        to retrieve relevant movie clips based on natural language descriptions.

        Features include:
        - Visual scene understanding
        - Character interaction analysis
        - Object detection
        - Action recognition
        """)

        st.header("Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- Visual Features: 40%")
        st.write("- Scene Features: 30%")
        st.write("- Object Features: 30%")


if __name__ == "__main__":
    main()


# Requirements.txt
'''
streamlit==1.22.0
pandas==1.5.3
numpy==1.23.5
sentence-transformers==2.2.2
scikit-learn==1.2.2
torch==2.0.0
'''
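

# --- Example: precomputing the feature files (a minimal sketch) ---
# The app above expects three .npy feature matrices and a clips_metadata.csv on
# disk. The sketch below, intended as a separate one-off script run before
# launching the Streamlit app, shows one way those files could be produced. It
# assumes each row of clips_metadata.csv has a 'description' column and reuses
# the same all-MiniLM-L6-v2 text encoder as a stand-in for the visual/scene/
# object expert models; the function name and this fallback are illustrative
# assumptions, not part of the original system.
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


def precompute_features(metadata_csv='clips_metadata.csv'):
    """Build placeholder feature matrices aligned row-for-row with the metadata."""
    clips = pd.read_csv(metadata_csv)  # must contain a 'description' column
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # One embedding per clip; shape (n_clips, 384) for all-MiniLM-L6-v2
    embeddings = model.encode(clips['description'].tolist())
    # A real pipeline would run a separate expert model per feature type;
    # here the same text embeddings are saved under each expected file name.
    np.save('path_to_visual_features.npy', embeddings)
    np.save('path_to_scene_features.npy', embeddings)
    np.save('path_to_object_features.npy', embeddings)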