import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

class VideoRetrieval:
    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.load_data()
        
    def load_data(self):
        # Load precomputed per-clip feature matrices. The paths are
        # placeholders; each array must have shape (num_clips, d), where d
        # matches the query embedding dimension (384 for all-MiniLM-L6-v2),
        # or cosine_similarity below will fail.
        self.features = {
            'visual_features': np.load('path_to_visual_features.npy'),
            'scene_features': np.load('path_to_scene_features.npy'),
            'object_features': np.load('path_to_object_features.npy')
        }
        
        # Load clip metadata
        self.clips_df = pd.read_csv('clips_metadata.csv')
        
    def encode_query(self, query_text):
        """Encode the text query into embeddings"""
        return self.text_model.encode(query_text)
    
    def compute_similarity(self, query_embedding, feature_type='visual_features'):
        """Compute similarity between query and video features"""
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.features[feature_type]
        )
        return similarities[0]
    
    def retrieve_clips(self, query_text, top_k=3):
        """Retrieve top-k most relevant clips based on query"""
        # Encode query
        query_embedding = self.encode_query(query_text)
        
        # Late fusion: each feature type's cosine similarities are scaled by
        # a fixed weight; the weights sum to 1 and match the sidebar text.
        similarities = {}
        weights = {
            'visual_features': 0.4,
            'scene_features': 0.3,
            'object_features': 0.3
        }
        
        for feat_type, weight in weights.items():
            similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight
            
        # Combine similarities
        combined_similarities = sum(similarities.values())
        
        # Get top-k indices
        top_indices = np.argsort(combined_similarities)[-top_k:][::-1]
        
        # Return clip information
        results = []
        for idx in top_indices:
            row = self.clips_df.iloc[idx]
            results.append({
                'clip_id': row['clip_id'],
                'movie_title': row['movie_title'],
                'description': row['description'],
                'timestamp': row['timestamp'],
                'similarity_score': combined_similarities[idx]
            })
        
        return results

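# --- Offline feature precomputation (hedged sketch) -------------------------
# The .npy files loaded in VideoRetrieval.load_data() are assumed to exist
# already. One plausible way to produce 'visual_features' is to encode one
# keyframe per clip with a CLIP-style model from sentence-transformers, so
# that frames live in a joint image/text embedding space. The keyframe layout
# (keyframes/<clip_id>.jpg) and the function name are illustrative
# assumptions, not part of the original pipeline. Note that CLIP embeddings
# are 512-dim: to search them, the query would also have to be encoded with
# the same CLIP model, since all-MiniLM-L6-v2 produces an incompatible
# 384-dim space.
def build_visual_features_sketch(clips_df, keyframe_dir='keyframes',
                                 out_path='path_to_visual_features.npy'):
    from PIL import Image  # Pillow is a sentence-transformers dependency

    clip_model = SentenceTransformer('clip-ViT-B-32')
    images = [Image.open(Path(keyframe_dir) / f"{clip_id}.jpg")
              for clip_id in clips_df['clip_id']]
    embeddings = clip_model.encode(images)  # ndarray, shape (num_clips, 512)
    np.save(out_path, embeddings)
    return embeddings
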
# Streamlit UI
def main():
    st.title("Movie Scene Retrieval System")
    st.write("""
    Search for movie scenes using natural language descriptions.
    The system will retrieve the most relevant 2-3 minute clips based on your query.
    """)
    
    # Initialize the retrieval system once per session; Streamlit reruns the
    # whole script on every interaction, so cache it in session_state.
    if 'retrieval_system' not in st.session_state:
        st.session_state.retrieval_system = VideoRetrieval()
    retrieval_system = st.session_state.retrieval_system
    
    # Search interface
    query = st.text_input("Enter your scene description:", 
                         "A dramatic confrontation between two characters in a dark room")
    
    num_results = st.slider("Number of results to show:", min_value=1, max_value=5, value=3)
    
    if st.button("Search"):
        with st.spinner("Searching for relevant clips..."):
            results = retrieval_system.retrieve_clips(query, top_k=num_results)
            
            for i, result in enumerate(results, 1):
                st.subheader(f"Result {i}: {result['movie_title']}")
                col1, col2 = st.columns([2, 1])
                
                with col1:
                    st.write("**Scene Description:**")
                    st.write(result['description'])
                    st.write(f"**Timestamp:** {result['timestamp']}")
                
                with col2:
                    st.write("**Similarity Score:**")
                    # st.progress expects a value in [0, 1]; a weighted sum of
                    # cosine similarities can be negative, so clamp it.
                    st.progress(min(max(float(result['similarity_score']), 0.0), 1.0))
                
                # In practice, you would have a way to play the video clip here
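                # A hedged sketch: if each clip were exported to, say,
                # clips/<clip_id>.mp4 (a hypothetical layout, not part of
                # this pipeline), Streamlit could play it inline:
                #
                #     clip_path = Path('clips') / f"{result['clip_id']}.mp4"
                #     if clip_path.exists():
                #         st.video(str(clip_path))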
                st.write("---")
    
    # Additional features
    with st.sidebar:
        st.header("About")
        st.write("""
        This system uses pre-computed visual features from several expert models to retrieve
        relevant movie clips based on natural language descriptions. Features include:
        
        - Visual scene understanding
        - Character interaction analysis
        - Object detection
        - Action recognition
        """)
        
        st.header("Feature Weights")
        st.write("Current weights used for similarity computation:")
        st.write("- Visual Features: 40%")
        st.write("- Scene Features: 30%")
        st.write("- Object Features: 30%")

if __name__ == "__main__":
    main()
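
# Example (non-Streamlit) usage, assuming the feature files and
# clips_metadata.csv exist; the query text and top_k are arbitrary:
#
#     vr = VideoRetrieval()
#     for hit in vr.retrieve_clips("a car chase at night", top_k=3):
#         print(hit['movie_title'], round(float(hit['similarity_score']), 3))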

# requirements.txt
'''
streamlit==1.22.0
pandas==1.5.3
numpy==1.23.5
sentence-transformers==2.2.2
scikit-learn==1.2.2
torch==2.0.0
'''