awacke1 committed on
Commit 0b7e2f0 · verified · 1 Parent(s): ae1d609

Update app.py

Files changed (1):
  1. app.py +220 -48

app.py CHANGED
@@ -37,39 +37,153 @@ if 'tts_voice' not in st.session_state:
 if 'arxiv_last_query' not in st.session_state:
     st.session_state['arxiv_last_query'] = ""
 
+def fetch_dataset_info(dataset_id):
+    """Fetch dataset information including all available configs and splits"""
+    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
+    try:
+        response = requests.get(info_url, timeout=30)
+        if response.status_code == 200:
+            return response.json()
+    except Exception as e:
+        st.warning(f"Error fetching dataset info: {e}")
+    return None
+
+def fetch_dataset_rows(dataset_id, config="default", split="train", max_rows=100):
+    """Fetch rows from a specific config and split of a dataset"""
+    url = f"https://datasets-server.huggingface.co/first-rows?dataset={dataset_id}&config={config}&split={split}"
+    try:
+        response = requests.get(url, timeout=30)
+        if response.status_code == 200:
+            data = response.json()
+            if 'rows' in data:
+                processed_rows = []
+                for row_data in data['rows']:
+                    row = row_data.get('row', row_data)
+                    # Process embeddings if present
+                    for key in row:
+                        if any(term in key.lower() for term in ['embed', 'vector', 'encoding']):
+                            if isinstance(row[key], str):
+                                try:
+                                    row[key] = [float(x.strip()) for x in row[key].strip('[]').split(',') if x.strip()]
+                                except:
+                                    continue
+                    row['_config'] = config
+                    row['_split'] = split
+                    processed_rows.append(row)
+                return processed_rows
+    except Exception as e:
+        st.warning(f"Error fetching rows for {config}/{split}: {e}")
+    return []
+
+def search_dataset(dataset_id, search_text, include_configs=None, include_splits=None):
+    """
+    Search across all configurations and splits of a dataset
+
+    Args:
+        dataset_id (str): The Hugging Face dataset ID
+        search_text (str): Text to search for in descriptions and queries
+        include_configs (list): List of specific configs to search, or None for all
+        include_splits (list): List of specific splits to search, or None for all
+
+    Returns:
+        tuple: (DataFrame of results, list of available configs, list of available splits)
+    """
+    # Get dataset info
+    dataset_info = fetch_dataset_info(dataset_id)
+    if not dataset_info:
+        return pd.DataFrame(), [], []
+
+    # Get available configs and splits
+    configs = include_configs if include_configs else dataset_info.get('config_names', ['default'])
+    all_rows = []
+    available_splits = set()
+
+    # Search across configs and splits
+    for config in configs:
+        try:
+            # First fetch split info for this config
+            splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}&config={config}"
+            splits_response = requests.get(splits_url, timeout=30)
+            if splits_response.status_code == 200:
+                splits_data = splits_response.json()
+                splits = [split['split'] for split in splits_data.get('splits', [])]
+                if not splits:
+                    splits = ['train']  # fallback to train if no splits found
+
+                # Filter splits if specified
+                if include_splits:
+                    splits = [s for s in splits if s in include_splits]
+
+                available_splits.update(splits)
+
+                # Fetch and search rows for each split
+                for split in splits:
+                    rows = fetch_dataset_rows(dataset_id, config, split)
+                    for row in rows:
+                        # Search in all text fields
+                        text_content = ' '.join(str(v) for v in row.values() if isinstance(v, (str, int, float)))
+                        if search_text.lower() in text_content.lower():
+                            row['_matched_text'] = text_content
+                            row['_relevance_score'] = text_content.lower().count(search_text.lower())
+                            all_rows.append(row)
+
+        except Exception as e:
+            st.warning(f"Error processing config {config}: {e}")
+            continue
+
+    # Convert to DataFrame and sort by relevance
+    if all_rows:
+        df = pd.DataFrame(all_rows)
+        df = df.sort_values('_relevance_score', ascending=False)
+        return df, configs, list(available_splits)
+
+    return pd.DataFrame(), configs, list(available_splits)
+
 class VideoSearch:
     def __init__(self):
         self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
+        self.dataset_id = "omegalabsinc/omega-multimodal"
         self.load_dataset()
 
     def fetch_dataset_rows(self):
-        """Fetch dataset from Hugging Face API"""
+        """Fetch dataset with enhanced search capabilities"""
         try:
-            url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
-            response = requests.get(url, timeout=30)
-            if response.status_code == 200:
-                data = response.json()
-                if 'rows' in data:
-                    processed_rows = []
-                    for row_data in data['rows']:
-                        row = row_data.get('row', row_data)
-                        for key in row:
-                            if any(term in key.lower() for term in ['embed', 'vector', 'encoding']):
-                                if isinstance(row[key], str):
-                                    try:
-                                        row[key] = [float(x.strip()) for x in row[key].strip('[]').split(',') if x.strip()]
-                                    except:
-                                        continue
-                        processed_rows.append(row)
-
-                    df = pd.DataFrame(processed_rows)
-                    st.session_state['search_columns'] = [col for col in df.columns
-                                                          if col not in ['video_embed', 'description_embed', 'audio_embed']]
-                    return df
+            # First try to get all available data
+            df, configs, splits = search_dataset(
+                self.dataset_id,
+                "",  # empty search text to get all data
+                include_configs=None,  # all configs
+                include_splits=None  # all splits
+            )
+
+            if not df.empty:
+                st.session_state['search_columns'] = [col for col in df.columns
+                                                      if col not in ['video_embed', 'description_embed', 'audio_embed']
+                                                      and not col.startswith('_')]
+                return df
+
             return self.load_example_data()
-        except:
+
+        except Exception as e:
+            st.warning(f"Error loading dataset: {e}")
             return self.load_example_data()
 
+    def load_example_data(self):
+        """Load example data as fallback"""
+        example_data = [
+            {
+                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
+                "youtube_id": "IO-vwtyicn4",
+                "description": "This video shows a close-up of an ancient text carved into a surface.",
+                "views": 45489,
+                "start_time": 1452,
+                "end_time": 1458,
+                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
+                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
+            }
+        ]
+        return pd.DataFrame(example_data)
+
     def prepare_features(self):
         """Prepare embeddings with adaptive field detection"""
         try:
@@ -110,22 +224,6 @@ class VideoSearch:
             num_rows = len(self.dataset)
             self.video_embeds = np.random.randn(num_rows, 384)
             self.text_embeds = np.random.randn(num_rows, 384)
-
-    def load_example_data(self):
-        """Load example data as fallback"""
-        example_data = [
-            {
-                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
-                "youtube_id": "IO-vwtyicn4",
-                "description": "This video shows a close-up of an ancient text carved into a surface.",
-                "views": 45489,
-                "start_time": 1452,
-                "end_time": 1458,
-                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
-                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
-            }
-        ]
-        return pd.DataFrame(example_data)
 
     def load_dataset(self):
         self.dataset = self.fetch_dataset_rows()
@@ -174,9 +272,7 @@ async def generate_speech(text, voice=None):
         return None
 
 def transcribe_audio(audio_path):
-    """Placeholder for ASR transcription (no OpenAI/Anthropic).
-    Integrate your own ASR model or API here."""
-    # For now, just return a message:
+    """Placeholder for ASR transcription"""
     return "ASR not implemented. Integrate a local model or another service here."
 
 def show_file_manager():
@@ -215,12 +311,10 @@ def show_file_manager():
 def arxiv_search(query, max_results=5):
     """Perform a simple Arxiv search using their API and return top results."""
     base_url = "http://export.arxiv.org/api/query?"
-    # Encode the query
     search_url = base_url + f"search_query={quote(query)}&start=0&max_results={max_results}"
     r = requests.get(search_url)
     if r.status_code == 200:
         root = ET.fromstring(r.text)
-        # Namespace handling
         ns = {'atom': 'http://www.w3.org/2005/Atom'}
         entries = root.findall('atom:entry', ns)
         results = []
@@ -248,7 +342,6 @@ def perform_arxiv_lookup(q, vocal_summary=True, titles_summary=True, full_audio=
         if link:
             st.markdown(f"[View Paper]({link})")
 
-    # TTS Options
     if vocal_summary:
         spoken_text = f"Here are some Arxiv results for {q}. "
         if titles_summary:
@@ -278,7 +371,7 @@ def main():
     search = VideoSearch()
 
     # Create tabs
-    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Search", "🎙️ Voice Input", "📚 Arxiv", "📂 Files"])
+    tab1, tab2, tab3, tab4, tab5 = st.tabs(["🔍 Search", "🎙️ Voice Input", "📚 Arxiv", "📂 Files", "🔍 Advanced Search"])
 
     # ---- Tab 1: Video Search ----
     with tab1:
@@ -332,7 +425,6 @@ def main():
     # ---- Tab 2: Voice Input ----
     with tab2:
         st.subheader("Voice Input")
-
         st.write("🎙️ Record your voice:")
         audio_bytes = audio_recorder()
         if audio_bytes:
@@ -373,6 +465,86 @@ def main():
     with tab4:
         show_file_manager()
 
+    # ---- Tab 5: Advanced Dataset Search ----
+    with tab5:
+        st.subheader("Advanced Dataset Search")
+
+        # Dataset input
+        dataset_id = st.text_input("Dataset ID:", value="omegalabsinc/omega-multimodal")
+
+        # Search configuration
+        col1, col2 = st.columns([2, 1])
+        with col1:
+            search_text = st.text_input("Search text:",
+                                        placeholder="Enter text to search across all fields")
+
+        # Get available configs and splits
+        if dataset_id:
+            dataset_info = fetch_dataset_info(dataset_id)
+            if dataset_info:
+                configs = dataset_info.get('config_names', ['default'])
+                with col2:
+                    selected_configs = st.multiselect(
+                        "Configurations:",
+                        options=configs,
+                        default=['default'] if 'default' in configs else None
+                    )
+
+                # Fetch available splits
+                if selected_configs:
+                    all_splits = set()
+                    for config in selected_configs:
+                        splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}&config={config}"
+                        try:
+                            response = requests.get(splits_url, timeout=30)
+                            if response.status_code == 200:
+                                splits_data = response.json()
+                                splits = [split['split'] for split in splits_data.get('splits', [])]
+                                all_splits.update(splits)
+                        except Exception as e:
+                            st.warning(f"Error fetching splits for {config}: {e}")
+
+                    selected_splits = st.multiselect(
+                        "Splits:",
+                        options=list(all_splits),
+                        default=['train'] if 'train' in all_splits else None
+                    )
+
+                    if st.button("🔍 Search Dataset"):
+                        with st.spinner("Searching dataset..."):
+                            results_df, _, _ = search_dataset(
+                                dataset_id,
+                                search_text,
+                                include_configs=selected_configs,
+                                include_splits=selected_splits
+                            )
+
+                            if not results_df.empty:
+                                st.write(f"Found {len(results_df)} results")
+
+                                # Display results in expandable sections
+                                for idx, row in results_df.iterrows():
+                                    with st.expander(
+                                        f"Result {idx+1} (Config: {row['_config']}, Split: {row['_split']}, Score: {row['_relevance_score']})"
+                                    ):
+                                        # Display all fields except internal ones
+                                        for col in row.index:
+                                            if not col.startswith('_') and not any(
+                                                term in col.lower()
+                                                for term in ['embed', 'vector', 'encoding']
+                                            ):
+                                                st.write(f"**{col}:** {row[col]}")
+
+                                        # Add buttons for audio/video if available
+                                        if 'youtube_id' in row:
+                                            st.video(
+                                                f"https://youtube.com/watch?v={row['youtube_id']}&t={row.get('start_time', 0)}"
+                                            )
+                            else:
+                                st.warning("No results found.")
+            else:
+                st.error("Unable to fetch dataset information.")
+
     # Sidebar
     with st.sidebar:
         st.subheader("⚙️ Settings & History")
@@ -392,4 +564,4 @@ def main():
                               key="tts_voice")
 
 if __name__ == "__main__":
-    main()
+    main()
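
Note on the new dataset-search helpers: fetch_dataset_info, fetch_dataset_rows, and search_dataset in this commit wrap three public Hugging Face endpoints that appear in the diff (https://huggingface.co/api/datasets/{id}, https://datasets-server.huggingface.co/splits, and https://datasets-server.huggingface.co/first-rows). The following is a minimal standalone sketch of that flow outside Streamlit, assuming only the requests package and network access; the "default"/"train" values mirror the defaults in fetch_dataset_rows, and the keyword value is an illustrative placeholder, not part of the commit.

import requests

DATASET_ID = "omegalabsinc/omega-multimodal"  # dataset hard-coded in VideoSearch.__init__

# 1. Dataset metadata (what fetch_dataset_info wraps)
info = requests.get(f"https://huggingface.co/api/datasets/{DATASET_ID}", timeout=30).json()
print("tags:", info.get("tags", []))

# 2. Available config/split pairs (what search_dataset enumerates per config)
splits = requests.get(
    "https://datasets-server.huggingface.co/splits",
    params={"dataset": DATASET_ID},
    timeout=30,
).json()
for entry in splits.get("splits", []):
    print("config:", entry["config"], "split:", entry["split"])

# 3. Preview rows of one config/split (what fetch_dataset_rows wraps)
first = requests.get(
    "https://datasets-server.huggingface.co/first-rows",
    params={"dataset": DATASET_ID, "config": "default", "split": "train"},
    timeout=30,
).json()
rows = [r.get("row", r) for r in first.get("rows", [])]

# Keyword filter with the same occurrence-count relevance score as search_dataset
keyword = "ancient"  # illustrative placeholder
for row in rows:
    text = " ".join(str(v) for v in row.values() if isinstance(v, (str, int, float)))
    if keyword.lower() in text.lower():
        print(row.get("video_id"), "score:", text.lower().count(keyword.lower()))

One design caveat: first-rows returns only a small preview of each split (on the order of the first hundred rows), so a search built on it sees the head of each split rather than the full dataset.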
 