awacke1 committed
Commit 7e7e4f5 · verified · 1 Parent(s): 4e27092

Create app.py

Files changed (1): app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ class VideoRetrieval:
+     def __init__(self):
+         self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
+         self.load_data()
+
+     def load_data(self):
+         # Load pre-computed features.
+         # In practice, these would be loaded from your actual feature files.
+         self.features = {
+             'visual_features': np.load('path_to_visual_features.npy'),
+             'scene_features': np.load('path_to_scene_features.npy'),
+             'object_features': np.load('path_to_object_features.npy')
+         }
+
+         # Load clip metadata
+         self.clips_df = pd.read_csv('clips_metadata.csv')
+
+     def encode_query(self, query_text):
+         """Encode the text query into embeddings"""
+         return self.text_model.encode(query_text)
+
+     def compute_similarity(self, query_embedding, feature_type='visual_features'):
+         """Compute similarity between query and video features"""
+         similarities = cosine_similarity(
+             query_embedding.reshape(1, -1),
+             self.features[feature_type]
+         )
+         return similarities[0]
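+
+     # Note (assumption, not stated in the original): each feature matrix
+     # must have one row per clip and the same dimensionality as the query
+     # embedding (384 for all-MiniLM-L6-v2); otherwise cosine_similarity
+     # raises a shape-mismatch error.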
+
+     def retrieve_clips(self, query_text, top_k=3):
+         """Retrieve top-k most relevant clips based on query"""
+         # Encode query
+         query_embedding = self.encode_query(query_text)
+
+         # Compute weighted similarities for each feature type
+         similarities = {}
+         weights = {
+             'visual_features': 0.4,
+             'scene_features': 0.3,
+             'object_features': 0.3
+         }
+         for feat_type, weight in weights.items():
+             similarities[feat_type] = self.compute_similarity(query_embedding, feat_type) * weight
+
+         # Combine similarities
+         combined_similarities = sum(similarities.values())
+
+         # Get top-k indices, highest score first
+         top_indices = np.argsort(combined_similarities)[-top_k:][::-1]
+
+         # Return clip information
+         results = []
+         for idx in top_indices:
+             results.append({
+                 'clip_id': self.clips_df.iloc[idx]['clip_id'],
+                 'movie_title': self.clips_df.iloc[idx]['movie_title'],
+                 'description': self.clips_df.iloc[idx]['description'],
+                 'timestamp': self.clips_df.iloc[idx]['timestamp'],
+                 'similarity_score': combined_similarities[idx]
+             })
+
+         return results
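+
+     # Example (hypothetical values): retrieve_clips("car chase at night", top_k=2)
+     # returns two dicts like {'clip_id': 'c2', 'movie_title': 'Movie B', ...,
+     # 'similarity_score': 0.41}, ordered by descending weighted score.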
+
+ # Streamlit UI
+ def main():
+     st.title("Movie Scene Retrieval System")
+     st.write("""
+     Search for movie scenes using natural language descriptions.
+     The system will retrieve the most relevant 2-3 minute clips based on your query.
+     """)
+
+     # Initialize the retrieval system once per session
+     if 'retrieval_system' not in st.session_state:
+         st.session_state.retrieval_system = VideoRetrieval()
+     retrieval_system = st.session_state.retrieval_system
+
+     # Search interface
+     query = st.text_input(
+         "Enter your scene description:",
+         "A dramatic confrontation between two characters in a dark room"
+     )
+
+     num_results = st.slider("Number of results to show:", min_value=1, max_value=5, value=3)
+
+     if st.button("Search"):
+         with st.spinner("Searching for relevant clips..."):
+             results = retrieval_system.retrieve_clips(query, top_k=num_results)
+
+         for i, result in enumerate(results, 1):
+             st.subheader(f"Result {i}: {result['movie_title']}")
+             col1, col2 = st.columns([2, 1])
+
+             with col1:
+                 st.write("**Scene Description:**")
+                 st.write(result['description'])
+                 st.write(f"**Timestamp:** {result['timestamp']}")
+
+             with col2:
+                 st.write("**Similarity Score:**")
+                 # st.progress expects a value in [0, 1]; clamp the weighted
+                 # cosine score, which can fall slightly outside that range
+                 st.progress(min(max(float(result['similarity_score']), 0.0), 1.0))
+
+             # In practice, you would have a way to play the video clip here
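+             # For example, with Streamlit's built-in player, assuming each
+             # clip is stored as an MP4 named by clip_id (hypothetical path):
+             #     st.video(f"clips/{result['clip_id']}.mp4")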
+             st.write("---")
+
+     # Additional features
+     with st.sidebar:
+         st.header("About")
+         st.write("""
+         This system uses pre-computed visual features from several expert models to retrieve
+         relevant movie clips based on natural language descriptions. Features include:
+
+         - Visual scene understanding
+         - Character interaction analysis
+         - Object detection
+         - Action recognition
+         """)
+
+         st.header("Feature Weights")
+         st.write("Current weights used for similarity computation:")
+         st.write("- Visual Features: 40%")
+         st.write("- Scene Features: 30%")
+         st.write("- Object Features: 30%")
+
+ if __name__ == "__main__":
+     main()
+
+ # requirements.txt
+ '''
+ streamlit==1.22.0
+ pandas==1.5.3
+ numpy==1.23.5
+ sentence-transformers==2.2.2
+ scikit-learn==1.2.2
+ torch==2.0.0
+ '''
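
Note: app.py assumes the three .npy feature files and clips_metadata.csv already exist on disk. Below is a minimal sketch for generating compatible placeholder files, assuming each feature matrix is simply a MiniLM embedding of per-clip text so that it shares the query's 384-dimensional space; the toy rows are hypothetical, while the file and column names match those read by app.py:

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Toy metadata: one row per clip; row order must match the feature matrices
clips = pd.DataFrame({
    'clip_id': ['c1', 'c2'],
    'movie_title': ['Movie A', 'Movie B'],
    'description': ['Two characters argue in a dark room',
                    'A car chase through city streets at night'],
    'timestamp': ['00:12:30', '01:05:10'],
})
clips.to_csv('clips_metadata.csv', index=False)

# Encode descriptions with the same model app.py uses for queries, so
# cosine similarity is computed in a shared 384-dimensional space
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(clips['description'].tolist())  # shape (2, 384)
for name in ('visual', 'scene', 'object'):
    np.save(f'path_to_{name}_features.npy', embeddings)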