awacke1 committed
Commit ceb3a99 · verified · 1 Parent(s): 5503ac5

Create app.py

Files changed (1)
  1. app.py +420 -0
app.py ADDED
@@ -0,0 +1,420 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ import torch
+ import json
+ import os
+ import glob
+ from pathlib import Path
+ from datetime import datetime
+ import edge_tts
+ import asyncio
+ import base64
+ import requests
+ import plotly.graph_objects as go
+ from gradio_client import Client
+ from collections import defaultdict
+ from bs4 import BeautifulSoup
+ from audio_recorder_streamlit import audio_recorder
+ import streamlit.components.v1 as components
+
+ # Page configuration
+ st.set_page_config(
+     page_title="Video Search & Research Assistant",
+     page_icon="🎥",
+     layout="wide",
+     initial_sidebar_state="auto",
+ )
+
+ # Initialize session state
+ if 'search_history' not in st.session_state:
+     st.session_state['search_history'] = []
+ if 'last_voice_input' not in st.session_state:
+     st.session_state['last_voice_input'] = ""
+ if 'transcript_history' not in st.session_state:
+     st.session_state['transcript_history'] = []
+ if 'should_rerun' not in st.session_state:
+     st.session_state['should_rerun'] = False
+
+ # Custom styling
+ st.markdown("""
+ <style>
+ .main { background: linear-gradient(to right, #1a1a1a, #2d2d2d); color: #fff; }
+ .stMarkdown { font-family: 'Helvetica Neue', sans-serif; }
+ .stButton>button { margin-right: 0.5rem; }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize components (expects a custom speech-recognition component in a local "mycomponent" directory)
+ speech_component = components.declare_component("speech_recognition", path="mycomponent")
+
+ class VideoSearch:
+     def __init__(self):
+         self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
+         self.load_dataset()
+
+     def fetch_dataset_rows(self):
+         """Fetch dataset rows from the Hugging Face datasets-server API, with debug output"""
+         try:
+             st.info("Fetching from Hugging Face API...")
+             url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
+
+             response = requests.get(url, timeout=30)
+             st.write(f"Response status: {response.status_code}")
+
+             if response.status_code == 200:
+                 data = response.json()
+
+                 if 'rows' in data:
+                     # Extract the actual row data from the nested structure
+                     processed_rows = []
+                     for row_data in data['rows']:
+                         if 'row' in row_data:  # Access the nested 'row' data
+                             processed_rows.append(row_data['row'])
+
+                     df = pd.DataFrame(processed_rows)
+
+                     # Debug output
+                     st.write("DataFrame columns after processing:", list(df.columns))
+                     st.write("Number of rows:", len(df))
+
+                     return df
+                 else:
+                     st.error("No 'rows' found in API response")
+                     st.write("Raw API Response:", data)
+                     return self.load_example_data()
+             else:
+                 st.error(f"API request failed with status code: {response.status_code}")
+                 return self.load_example_data()
+
+         except Exception as e:
+             st.error(f"Error fetching dataset: {str(e)}")
+             return self.load_example_data()
+
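+     # Note: fetch_dataset_rows is repeated on every Streamlit rerun. One possible
+     # optimization (a sketch, assuming Streamlit >= 1.18, where st.cache_data is
+     # available): memoize the HTTP call in a module-level helper keyed by URL.
+     #
+     #     @st.cache_data(ttl=3600)
+     #     def cached_first_rows(url: str) -> dict:
+     #         return requests.get(url, timeout=30).json()
+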
+     def load_example_data(self):
+         """Load example data as fallback"""
+         example_data = [
+             {
+                 "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
+                 "youtube_id": "IO-vwtyicn4",
+                 "description": "This video shows a close-up of an ancient text carved into a surface, with the text appearing to be in a cursive script.",
+                 "views": 45489,
+                 "start_time": 1452,
+                 "end_time": 1458,
+                 "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
+                 "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
+             },
+             {
+                 "video_id": "a8ebde7d-d717-4c1e-8be4-bdb4bc0c544f",
+                 "youtube_id": "mo4rEyF7gTE",
+                 "description": "This video shows a close-up view of a classical architectural structure, featuring stone statues with ornate details.",
+                 "views": 4468,
+                 "start_time": 318,
+                 "end_time": 324,
+                 "video_embed": [0.015160037972033024, -0.004111184574663639, -0.017604168340563774],
+                 "description_embed": [-0.06835828185081482, 0.03589797042310238, 0.12952091753482819]
+             },
+             {
+                 "video_id": "d1be64a6-22e2-4fbd-a176-20749e7c3d8a",
+                 "youtube_id": "IO-vwtyicn4",
+                 "description": "This video shows a weathered ancient painting depicting figures in classical style with vibrant colors preserved.",
+                 "views": 45489,
+                 "start_time": 1698,
+                 "end_time": 1704,
+                 "video_embed": [0.016160037972033024, -0.005111184574663639, -0.018604168340563774],
+                 "description_embed": [-0.07835828185081482, 0.04589797042310238, 0.13952091753482819]
+             }
+         ]
+         return pd.DataFrame(example_data)
+
+     def prepare_features(self):
+         """Prepare and cache embeddings"""
+         try:
+             if 'video_embed' not in self.dataset.columns:
+                 st.warning("Using example data embeddings")
+                 self.dataset = self.load_example_data()
+
+             # Debug: Show raw data types and first row
+             st.write("Data Types:", self.dataset.dtypes)
+             st.write("\nFirst row of embeddings:")
+             st.write("video_embed type:", type(self.dataset['video_embed'].iloc[0]))
+             st.write("video_embed content:", self.dataset['video_embed'].iloc[0])
+             st.write("\ndescription_embed type:", type(self.dataset['description_embed'].iloc[0]))
+             st.write("description_embed content:", self.dataset['description_embed'].iloc[0])
+
+             # Convert string representations of embeddings back to lists of floats
+             def safe_eval_list(s):
+                 try:
+                     if isinstance(s, str):
+                         # Clean the string representation: drop brackets,
+                         # normalize commas to whitespace, then split
+                         s = s.replace('[', '').replace(']', '').replace(',', ' ').strip()
+                         numbers = [float(x.strip()) for x in s.split() if x.strip()]
+                         return numbers
+                     elif isinstance(s, list):
+                         return [float(x) for x in s]
+                     else:
+                         st.error(f"Unexpected type for embedding: {type(s)}")
+                         return None
+                 except Exception as e:
+                     st.error(f"Error parsing embedding: {str(e)}")
+                     st.write("Problematic string:", s)
+                     return None
+
+             # Process embeddings with detailed error reporting
+             video_embeds = []
+             text_embeds = []
+
+             for idx in range(len(self.dataset)):
+                 try:
+                     video_embed = safe_eval_list(self.dataset['video_embed'].iloc[idx])
+                     desc_embed = safe_eval_list(self.dataset['description_embed'].iloc[idx])
+
+                     if video_embed is not None and desc_embed is not None:
+                         video_embeds.append(video_embed)
+                         text_embeds.append(desc_embed)
+                     else:
+                         st.warning(f"Skipping row {idx} due to parsing failure")
+                 except Exception as e:
+                     st.error(f"Error processing row {idx}: {str(e)}")
+                     st.write("Row data:", self.dataset.iloc[idx])
+
+             if video_embeds and text_embeds:
+                 try:
+                     self.video_embeds = np.array(video_embeds)
+                     self.text_embeds = np.array(text_embeds)
+                     st.success(f"Successfully processed {len(video_embeds)} embeddings")
+                     st.write("Video embeddings shape:", self.video_embeds.shape)
+                     st.write("Text embeddings shape:", self.text_embeds.shape)
+                 except Exception as e:
+                     st.error(f"Error converting to numpy arrays: {str(e)}")
+             else:
+                 st.warning("No valid embeddings found, using random embeddings")
+                 num_rows = len(self.dataset)
+                 self.video_embeds = np.random.randn(num_rows, 384)
+                 self.text_embeds = np.random.randn(num_rows, 384)
+
+         except Exception as e:
+             st.error(f"Error preparing features: {str(e)}")
+             import traceback
+             st.write("Traceback:", traceback.format_exc())
+             # Create random embeddings as fallback
+             num_rows = len(self.dataset)
+             self.video_embeds = np.random.randn(num_rows, 384)
+             self.text_embeds = np.random.randn(num_rows, 384)
+
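+     # Illustrative behavior of safe_eval_list above (hypothetical inputs):
+     # both "[0.1, 0.2, 0.3]" and "0.1 0.2 0.3" parse to [0.1, 0.2, 0.3].
+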
+     def load_dataset(self):
+         try:
+             self.dataset = self.fetch_dataset_rows()
+             if self.dataset is not None:
+                 self.prepare_features()
+             else:
+                 # Fall back to the bundled example rows
+                 self.dataset = self.load_example_data()
+                 self.prepare_features()
+         except Exception as e:
+             st.error(f"Error loading dataset: {e}")
+             self.dataset = self.load_example_data()
+             self.prepare_features()
+
+     def search(self, query, top_k=5):
+         query_embedding = self.text_model.encode([query])[0]
+
+         video_sims = cosine_similarity([query_embedding], self.video_embeds)[0]
+         text_sims = cosine_similarity([query_embedding], self.text_embeds)[0]
+
+         # Weight the video and text modalities equally
+         combined_sims = 0.5 * video_sims + 0.5 * text_sims
+         top_indices = np.argsort(combined_sims)[-top_k:][::-1]
+
+         results = []
+         for idx in top_indices:
+             results.append({
+                 'video_id': self.dataset.iloc[idx]['video_id'],
+                 'youtube_id': self.dataset.iloc[idx]['youtube_id'],
+                 'description': self.dataset.iloc[idx]['description'],
+                 'start_time': self.dataset.iloc[idx]['start_time'],
+                 'end_time': self.dataset.iloc[idx]['end_time'],
+                 'relevance_score': float(combined_sims[idx]),
+                 'views': self.dataset.iloc[idx]['views']
+             })
+         return results
+
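+ # Worked example of the 50/50 weighting in VideoSearch.search (hypothetical
+ # numbers): a clip with video similarity 0.8 and description similarity 0.4
+ # gets a combined relevance score of 0.5 * 0.8 + 0.5 * 0.4 = 0.6.
+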
+ def perform_arxiv_search(query, vocal_summary=True, extended_refs=False):
+     """Perform an Arxiv search with an optional spoken summary (extended_refs is currently unused)"""
+     try:
+         client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
+         refs = client.predict(query, 20, "Semantic Search",
+                               "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                               api_name="/update_with_rag_md")[0]
+         response = client.predict(query, "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                                   True, api_name="/ask_llm")
+
+         result = f"### 🔎 {query}\n\n{response}\n\n{refs}"
+         st.markdown(result)
+
+         if vocal_summary:
+             audio_file = asyncio.run(generate_speech(response[:500]))
+             if audio_file:
+                 st.audio(audio_file)
+                 os.remove(audio_file)
+
+         return result
+     except Exception as e:
+         st.error(f"Error in Arxiv search: {e}")
+         return None
+
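+ # The two client.predict calls above target named endpoints ("/update_with_rag_md",
+ # "/ask_llm") exposed by the referenced Hugging Face Space; the positional
+ # arguments must match that Space's Gradio API signature.
+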
+ async def generate_speech(text, voice=None):
+     """Generate speech using Edge TTS (voice falls back to the sidebar setting)"""
+     if not text.strip():
+         return None
+
+     # Use the sidebar voice selection when no voice is passed explicitly
+     voice = voice or st.session_state.get("tts_voice", "en-US-AriaNeural")
+     try:
+         communicate = edge_tts.Communicate(text, voice)
+         audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
+         await communicate.save(audio_file)
+         return audio_file
+     except Exception as e:
+         st.error(f"Error generating speech: {e}")
+         return None
+
+ def process_audio_input(audio_bytes):
+     """Process audio input from recorder"""
+     if audio_bytes:
+         # Save temporary file
+         audio_path = f"temp_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
+         with open(audio_path, "wb") as f:
+             f.write(audio_bytes)
+
+         # Here you would typically use a speech-to-text service
+         # For now, we'll just acknowledge the recording
+         st.success("Audio recorded successfully!")
+
+         # Cleanup
+         if os.path.exists(audio_path):
+             os.remove(audio_path)
+
+         return True
+     return False
+
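+ # One way to fill in the speech-to-text step above (a sketch, assuming the
+ # third-party SpeechRecognition package is installed; it is not imported by
+ # this app):
+ #
+ #     import speech_recognition as sr
+ #     recognizer = sr.Recognizer()
+ #     with sr.AudioFile(audio_path) as source:
+ #         audio = recognizer.record(source)
+ #     text = recognizer.recognize_google(audio)  # sends audio to Google's free API
+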
+ def main():
+     st.title("🎥 Video Search & Research Assistant")
+
+     # Initialize search
+     search = VideoSearch()
+
+     # Create main tabs
+     tab1, tab2, tab3 = st.tabs(["🔍 Video Search", "🎙️ Voice & Audio", "📚 Arxiv Research"])
+
+     with tab1:
+         st.subheader("Search Video Dataset")
+
+         # Text search
+         query = st.text_input("Enter your search query:")
+         col1, col2 = st.columns(2)
+
+         with col1:
+             search_button = st.button("🔍 Search")
+         with col2:
+             num_results = st.slider("Number of results:", 1, 10, 5)
+
+         if search_button and query:
+             results = search.search(query, num_results)
+             st.session_state['search_history'].append({
+                 'query': query,
+                 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                 'results': results
+             })
+
+             for i, result in enumerate(results, 1):
+                 with st.expander(f"Result {i}: {result['description'][:100]}...", expanded=(i == 1)):
+                     cols = st.columns([2, 1])
+
+                     with cols[0]:
+                         st.markdown("**Full Description:**")
+                         st.write(result['description'])
+                         st.markdown(f"**Time Range:** {result['start_time']}s - {result['end_time']}s")
+                         st.markdown(f"**Views:** {result['views']:,}")
+
+                     with cols[1]:
+                         st.markdown(f"**Relevance Score:** {result['relevance_score']:.2%}")
+                         if result['youtube_id']:
+                             st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
+
+                     # Generate audio summary
+                     if st.button("🔊 Generate Audio Summary", key=f"audio_{i}"):
+                         summary = f"Video summary: {result['description'][:200]}"
+                         audio_file = asyncio.run(generate_speech(summary))
+                         if audio_file:
+                             st.audio(audio_file)
+                             os.remove(audio_file)
+
+     with tab2:
+         st.subheader("Voice Input & Audio Recording")
+
+         col1, col2 = st.columns(2)
+         with col1:
+             st.write("🎙️ Speech Recognition")
+             voice_input = speech_component()
+
+             if voice_input and voice_input != st.session_state['last_voice_input']:
+                 st.session_state['last_voice_input'] = voice_input
+                 st.markdown("**Transcribed Text:**")
+                 st.write(voice_input)
+
+                 if st.button("🔍 Search Videos"):
+                     results = search.search(voice_input, num_results)
+                     for i, result in enumerate(results, 1):
+                         with st.expander(f"Result {i}", expanded=(i == 1)):
+                             st.write(result['description'])
+                             if result['youtube_id']:
+                                 st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
+
+         with col2:
+             st.write("🎵 Audio Recorder")
+             audio_bytes = audio_recorder()
+             if audio_bytes:
+                 process_audio_input(audio_bytes)
+
+     with tab3:
+         st.subheader("Arxiv Research")
+         arxiv_query = st.text_input("🔍 Research Query:")
+
+         col1, col2 = st.columns(2)
+         with col1:
+             vocal_summary = st.checkbox("Generate Audio Summary", value=True)
+         with col2:
+             extended_refs = st.checkbox("Include Extended References", value=False)
+
+         if st.button("🔍 Search Arxiv") and arxiv_query:
+             perform_arxiv_search(arxiv_query, vocal_summary, extended_refs)
+
+     # Sidebar for history and settings
+     with st.sidebar:
+         st.subheader("⚙️ Settings & History")
+
+         if st.button("🗑️ Clear History"):
+             st.session_state['search_history'] = []
+             st.rerun()  # use st.experimental_rerun() on Streamlit < 1.27
+
+         st.markdown("### Recent Searches")
+         for entry in reversed(st.session_state['search_history'][-5:]):
+             st.markdown(f"**{entry['timestamp']}**: {entry['query']}")
+
+         st.markdown("### Voice Settings")
+         st.selectbox("TTS Voice:",
+                      ["en-US-AriaNeural", "en-US-GuyNeural", "en-GB-SoniaNeural"],
+                      key="tts_voice")
+
+ if __name__ == "__main__":
+     main()