awacke1 committed
Commit 6afbfac · verified · 1 Parent(s): 46076c4

Create app.py

Files changed (1)
  app.py +266 -0
app.py ADDED
# Imports used throughout this file. The committed file omitted them, so it
# fails at runtime without this block.
import asyncio
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# The committed file calls generate_speech() below but never defines it. This
# helper is an assumed minimal implementation using the edge-tts package; the
# voice choice is arbitrary.
async def generate_speech(text, voice="en-US-AriaNeural"):
    """Convert text to speech and return the path to the saved MP3."""
    try:
        import edge_tts
        communicate = edge_tts.Communicate(text, voice)
        output_file = "speech_summary.mp3"
        await communicate.save(output_file)
        return output_file
    except Exception as e:
        st.warning(f"Error generating speech: {e}")
        return None
def fetch_dataset_info_auth(dataset_id, hf_token):
    """Fetch dataset information with authentication"""
    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    try:
        response = requests.get(info_url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        st.warning(f"Error fetching dataset info: {e}")
    return None

def fetch_dataset_splits_auth(dataset_id, hf_token):
    """Fetch available splits for the dataset"""
    splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    try:
        response = requests.get(splits_url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.json().get('splits', [])
    except Exception as e:
        st.warning(f"Error fetching splits: {e}")
    return []

def fetch_parquet_urls_auth(dataset_id, config, split, hf_token):
    """Fetch Parquet file URLs for a specific split"""
    parquet_url = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/{config}/{split}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    try:
        response = requests.get(parquet_url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        st.warning(f"Error fetching parquet URLs: {e}")
    return []

def fetch_rows_auth(dataset_id, config, split, offset, length, hf_token):
    """Fetch rows with authentication"""
    url = f"https://datasets-server.huggingface.co/rows?dataset={dataset_id}&config={config}&split={split}&offset={offset}&length={length}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        st.warning(f"Error fetching rows: {e}")
    return None
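These helpers wrap two public Hub endpoints that compose naturally: /splits enumerates a dataset's config/split pairs, and /rows pages through one split. A standalone sketch of that flow follows; the token value is a placeholder, and the dataset id is the one the app uses below.

```python
import requests

HF_TOKEN = "hf_..."  # placeholder: substitute a real token
HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
DATASET = "tomg-group-umd/cinepile"

# 1) Enumerate config/split pairs for the dataset.
resp = requests.get(
    "https://datasets-server.huggingface.co/splits",
    params={"dataset": DATASET}, headers=HEADERS, timeout=30,
)
splits = resp.json().get("splits", []) if resp.status_code == 200 else []

# 2) Page through the first split, 100 rows per request.
if splits:
    first = splits[0]
    resp = requests.get(
        "https://datasets-server.huggingface.co/rows",
        params={"dataset": DATASET, "config": first["config"],
                "split": first["split"], "offset": 0, "length": 100},
        headers=HEADERS, timeout=30,
    )
    if resp.status_code == 200:
        print(len(resp.json().get("rows", [])), "rows fetched")
```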
class ParquetVideoSearch:
    def __init__(self, hf_token):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dataset_id = "tomg-group-umd/cinepile"
        self.config = "v2"
        self.hf_token = hf_token
        self.load_dataset()

    def load_dataset(self):
        """Load initial dataset sample"""
        try:
            rows_data = fetch_rows_auth(
                self.dataset_id,
                self.config,
                "train",
                0,
                100,
                self.hf_token
            )

            if rows_data and 'rows' in rows_data:
                processed_rows = []
                for row_data in rows_data['rows']:
                    row = row_data.get('row', row_data)
                    processed_rows.append(row)

                self.dataset = pd.DataFrame(processed_rows)
                st.session_state['search_columns'] = [
                    col for col in self.dataset.columns
                    if not any(term in col.lower() for term in ['embed', 'vector', 'encoding'])
                ]
            else:
                self.dataset = self.load_example_data()

        except Exception as e:
            st.warning(f"Error loading dataset: {e}")
            self.dataset = self.load_example_data()

        self.prepare_features()

    def load_example_data(self):
        """Load example data as fallback"""
        return pd.DataFrame([{
            "video_id": "example",
            "title": "Example Video",
            "description": "Example video content",
            "duration": 120,
            "start_time": 0,
            "end_time": 120
        }])

    def prepare_features(self):
        """Prepare text features for search"""
        try:
            # Combine whichever of the relevant text fields actually exist,
            # so a missing column cannot raise a KeyError.
            text_fields = [c for c in ('title', 'description') if c in self.dataset.columns]
            combined_text = self.dataset[text_fields].fillna('').agg(' '.join, axis=1)
            self.text_embeds = self.text_model.encode(combined_text.tolist())

        except Exception as e:
            st.warning(f"Error preparing features: {e}")
            # all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the
            # random fallback matches that shape.
            self.text_embeds = np.random.randn(len(self.dataset), 384)
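One detail worth noting in prepare_features: the random fallback uses width 384 because that is the embedding dimension all-MiniLM-L6-v2 produces. A quick check, assuming sentence-transformers is installed:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
vecs = model.encode(["example video about cooking", "soccer match highlights"])
print(vecs.shape)  # (2, 384): one 384-dimensional vector per input string
```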
    def search(self, query, column=None, top_k=20):
        """Search using text embeddings and optional column filtering"""
        query_embedding = self.text_model.encode([query])[0]
        similarities = cosine_similarity([query_embedding], self.text_embeds)[0]

        # Column filtering: down-weight rows whose chosen column does not
        # contain the query as a plain substring (regex=False so characters
        # like '?' or '+' in the query cannot trigger a regex error).
        if column and column in self.dataset.columns and column != "All Fields":
            mask = self.dataset[column].astype(str).str.contains(query, case=False, regex=False)
            similarities[~mask] *= 0.5

        top_k = min(top_k, len(similarities))
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            result = {
                'relevance_score': float(similarities[idx]),
                **self.dataset.iloc[idx].to_dict()
            }
            results.append(result)

        return results
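The ranking rule in search() is simple enough to verify by hand: cosine similarity against the query embedding, halved for rows that fail the column filter, then top-k by argsort. A minimal sketch with toy vectors (the numbers are illustrative, not from the dataset):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

query_vec = np.array([[1.0, 0.0]])
row_vecs = np.array([[0.9, 0.1], [0.5, 0.5], [0.0, 1.0]])

scores = cosine_similarity(query_vec, row_vecs)[0]
mask = np.array([True, False, True])   # pretend only rows 0 and 2 match the column filter
scores[~mask] *= 0.5                   # down-weight non-matching rows, as in search()

top_k = 2
top = np.argsort(scores)[-top_k:][::-1]
print(top, scores[top])                # highest-scoring indices first: [0 1]
```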
def render_video_result(result):
    """Render a video result with enhanced display"""
    col1, col2 = st.columns([2, 1])

    with col1:
        if 'title' in result:
            st.markdown(f"**Title:** {result['title']}")
        st.markdown("**Description:**")
        st.write(result.get('description', 'No description available'))

        # Show timing information
        start_time = result.get('start_time', 0)
        end_time = result.get('end_time', result.get('duration', 0))
        st.markdown(f"**Time Range:** {start_time}s - {end_time}s")

        # Show additional metadata
        for key, value in result.items():
            if key not in ['title', 'description', 'start_time', 'end_time', 'duration',
                           'relevance_score', 'video_id', '_config', '_split']:
                st.markdown(f"**{key.replace('_', ' ').title()}:** {value}")

    with col2:
        st.markdown(f"**Relevance Score:** {result['relevance_score']:.2%}")

        # Display video if URL is available
        video_url = None
        if 'video_url' in result:
            video_url = result['video_url']
        elif 'youtube_id' in result:
            video_url = f"https://youtube.com/watch?v={result['youtube_id']}&t={start_time}"

        if video_url:
            st.video(video_url)
        if st.button("🔊 Audio Summary", key=f"audio_{result.get('video_id', '')}"):
            summary = f"Video summary: {result.get('title', '')}. {result.get('description', '')[:200]}"
            audio_file = asyncio.run(generate_speech(summary))
            if audio_file:
                st.audio(audio_file)
def main():
    st.title("🎥 Enhanced Video Search with Parquet Support")

    # Session-state keys read later in this function; created up front so
    # first-run access does not raise a KeyError.
    if 'search_history' not in st.session_state:
        st.session_state['search_history'] = []
    if 'search_columns' not in st.session_state:
        st.session_state['search_columns'] = []
    if 'initial_search_done' not in st.session_state:
        st.session_state['initial_search_done'] = False

    # Get HF token from secrets or user input
    if 'hf_token' not in st.session_state:
        st.session_state['hf_token'] = st.secrets.get("HF_TOKEN", None)

    if not st.session_state['hf_token']:
        hf_token = st.text_input("Enter your Hugging Face API token:", type="password")
        if hf_token:
            st.session_state['hf_token'] = hf_token

    if not st.session_state.get('hf_token'):
        st.warning("Please provide a Hugging Face API token to access the dataset.")
        return

    # Initialize search class
    search = ParquetVideoSearch(st.session_state['hf_token'])

    # Create tabs
    tab1, tab2 = st.tabs(["🔍 Video Search", "📊 Dataset Info"])

    # ---- Tab 1: Video Search ----
    with tab1:
        st.subheader("Search Videos")
        col1, col2 = st.columns([3, 1])

        with col1:
            query = st.text_input("Enter your search query:")
        with col2:
            search_column = st.selectbox("Search in field:",
                                         ["All Fields"] + st.session_state['search_columns'])

        col3, col4 = st.columns(2)
        with col3:
            num_results = st.slider("Number of results:", 1, 100, 20)
        with col4:
            search_button = st.button("🔍 Search")

        if search_button and query:
            st.session_state['initial_search_done'] = True
            selected_column = None if search_column == "All Fields" else search_column

            with st.spinner("Searching..."):
                results = search.search(query, selected_column, num_results)

            st.session_state['search_history'].append({
                'query': query,
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'results': results[:5]
            })

            for i, result in enumerate(results, 1):
                with st.expander(
                    f"Result {i}: {result.get('title', result.get('description', 'No title'))[:100]}...",
                    expanded=(i == 1)
                ):
                    render_video_result(result)

    # ---- Tab 2: Dataset Info ----
    with tab2:
        st.subheader("Dataset Information")

        # Show available splits
        splits = fetch_dataset_splits_auth(search.dataset_id, st.session_state['hf_token'])
        if splits:
            st.write("### Available Splits")
            for split in splits:
                st.write(f"- {split['split']}: {split.get('num_rows', 'unknown')} rows")

        # Show dataset statistics
        st.write("### Dataset Statistics")
        st.write(f"- Loaded rows: {len(search.dataset)}")
        st.write(f"- Available columns: {', '.join(search.dataset.columns)}")

        # Show sample data
        st.write("### Sample Data")
        st.dataframe(search.dataset.head())

    # Sidebar
    with st.sidebar:
        st.subheader("⚙️ Settings & History")
        if st.button("🗑️ Clear History"):
            st.session_state['search_history'] = []
            st.rerun()

        st.markdown("### Recent Searches")
        for entry in reversed(st.session_state['search_history'][-5:]):
            with st.expander(f"{entry['timestamp']}: {entry['query']}"):
                for i, result in enumerate(entry['results'], 1):
                    st.write(f"{i}. {result.get('title', result.get('description', 'No title'))[:100]}...")

if __name__ == "__main__":
    main()
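To run the app locally, install the packages the file imports (streamlit, requests, pandas, numpy, scikit-learn, sentence-transformers, plus edge-tts if you keep the assumed generate_speech helper) and launch it with streamlit run app.py. On a Hugging Face Space, setting HF_TOKEN in the Space secrets lets st.secrets.get("HF_TOKEN") resolve without prompting the user for a token.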