awacke1 committed on
Commit
8427637
·
verified ·
1 Parent(s): c6b9b5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +451 -166
app.py CHANGED
@@ -3,218 +3,383 @@ import pandas as pd
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
- import requests
7
- from datetime import datetime
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
# Hugging Face access token, read from the DATASET_KEY environment variable
# (None when the variable is not set).
HF_KEY = os.getenv('DATASET_KEY')

# Seed st.session_state once; keys that already exist keep their current value
# so user state survives Streamlit reruns.
for _state_key, _state_default in (
    ('search_history', []),
    ('search_columns', []),
    ('initial_search_done', False),
    ('hf_token', HF_KEY),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
 
 
21
 
22
def fetch_dataset_info_auth(dataset_id, hf_token):
    """Request the Hub API record for a dataset using a bearer token.

    Args:
        dataset_id: Dataset identifier interpolated into the API URL.
        hf_token: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        The decoded JSON body when the server answers 200; otherwise None.
        Any exception is reported through ``st.warning`` and also yields None.
    """
    endpoint = f"https://huggingface.co/api/datasets/{dataset_id}"
    auth_header = {"Authorization": f"Bearer {hf_token}"}
    payload = None
    try:
        reply = requests.get(endpoint, headers=auth_header, timeout=30)
        if reply.status_code == 200:
            payload = reply.json()
    except Exception as e:
        st.warning(f"Error fetching dataset info: {e}")
    return payload
33
 
34
def fetch_dataset_splits_auth(dataset_id, hf_token):
    """List the splits the datasets-server reports for a dataset.

    Args:
        dataset_id: Dataset identifier placed in the ``dataset`` query parameter.
        hf_token: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        The ``'splits'`` entry of the JSON body on a 200 response (an empty
        list when that key is absent); an empty list on any other status or
        after an exception, which is reported through ``st.warning``.
    """
    endpoint = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}"
    auth_header = {"Authorization": f"Bearer {hf_token}"}
    found = []
    try:
        reply = requests.get(endpoint, headers=auth_header, timeout=30)
        if reply.status_code == 200:
            found = reply.json().get('splits', [])
    except Exception as e:
        st.warning(f"Error fetching splits: {e}")
    return found
45
-
46
def fetch_parquet_urls_auth(dataset_id, config, split, hf_token):
    """Ask the Hub API for the Parquet listing of one config/split.

    Args:
        dataset_id: Dataset identifier interpolated into the API URL.
        config: Configuration name (URL path segment).
        split: Split name (URL path segment).
        hf_token: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        The decoded JSON body on a 200 response; an empty list on any other
        status or after an exception, which is reported through ``st.warning``.
    """
    endpoint = f"https://huggingface.co/api/datasets/{dataset_id}/parquet/{config}/{split}"
    auth_header = {"Authorization": f"Bearer {hf_token}"}
    listing = []
    try:
        reply = requests.get(endpoint, headers=auth_header, timeout=30)
        if reply.status_code == 200:
            listing = reply.json()
    except Exception as e:
        st.warning(f"Error fetching parquet URLs: {e}")
    return listing
57
 
58
def fetch_rows_auth(dataset_id, config, split, offset, length, hf_token):
    """Fetch a window of rows from the datasets-server ``/rows`` endpoint.

    Args:
        dataset_id: Dataset identifier (``dataset`` query parameter).
        config: Configuration name (``config`` query parameter).
        split: Split name (``split`` query parameter).
        offset: Value sent as the ``offset`` query parameter.
        length: Value sent as the ``length`` query parameter.
        hf_token: Token sent in the ``Authorization: Bearer`` header.

    Returns:
        The decoded JSON body on a 200 response; otherwise None. Exceptions
        are reported through ``st.warning`` and also yield None.
    """
    url = f"https://datasets-server.huggingface.co/rows?dataset={dataset_id}&config={config}&split={split}&offset={offset}&length={length}"
    auth_header = {"Authorization": f"Bearer {hf_token}"}
    payload = None
    try:
        reply = requests.get(url, headers=auth_header, timeout=30)
        if reply.status_code == 200:
            payload = reply.json()
    except Exception as e:
        st.warning(f"Error fetching rows: {e}")
    return payload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
class ParquetVideoSearch:
    """Text-embedding search over a sample of an authenticated Hub dataset.

    On construction the first 100 rows of the ``train`` split are requested
    through ``fetch_rows_auth``; when that fails a one-row example frame is
    used instead. The ``title``/``description`` text of every row is encoded
    once with the sentence-transformer model and reused for every query.
    """

    # Columns whose text is encoded for search, in the order they are joined.
    _TEXT_FIELDS = ('title', 'description')

    def __init__(self, hf_token):
        """Load the encoder, remember the token and load the dataset sample."""
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dataset_id = "tomg-group-umd/cinepile"
        self.config = "v2"
        self.hf_token = hf_token
        self.load_dataset()

    def load_dataset(self):
        """Populate ``self.dataset`` and then compute the search features.

        Also publishes the non-embedding column names in
        ``st.session_state['search_columns']`` when real rows were loaded.
        """
        try:
            rows_data = fetch_rows_auth(
                self.dataset_id,
                self.config,
                "train",
                0,
                100,
                self.hf_token
            )

            if rows_data and 'rows' in rows_data:
                # Each entry may wrap the record under 'row'; fall back to the entry itself.
                self.dataset = pd.DataFrame(
                    [entry.get('row', entry) for entry in rows_data['rows']]
                )
                st.session_state['search_columns'] = [
                    col for col in self.dataset.columns
                    if not any(term in str(col).lower() for term in ['embed', 'vector', 'encoding'])
                ]
            else:
                self.dataset = self.load_example_data()

        except Exception as e:
            st.warning(f"Error loading dataset: {e}")
            self.dataset = self.load_example_data()

        self.prepare_features()

    def load_example_data(self):
        """Load example data as fallback"""
        return pd.DataFrame([{
            "video_id": "example",
            "title": "Example Video",
            "description": "Example video content",
            "duration": 120,
            "start_time": 0,
            "end_time": 120
        }])

    def prepare_features(self):
        """Encode the available text columns into ``self.text_embeds``.

        Only the text columns that actually exist are used, and every cell is
        converted to ``str`` first, so a missing ``description`` column or a
        non-string value no longer sends the method into the random fallback.
        If encoding still fails, a warning is shown and random vectors of
        width 384 are stored so that ``search`` keeps working.
        """
        try:
            text_fields = [name for name in self._TEXT_FIELDS if name in self.dataset.columns]
            if not text_fields:
                raise ValueError("dataset has no 'title' or 'description' column")
            cells = self.dataset[text_fields].fillna('').astype(str)
            combined_text = [' '.join(row) for row in cells.to_numpy().tolist()]
            self.text_embeds = self.text_model.encode(combined_text)

        except Exception as e:
            st.warning(f"Error preparing features: {e}")
            self.text_embeds = np.random.randn(len(self.dataset), 384)

    def search(self, query, column=None, top_k=20):
        """Rank rows by cosine similarity between ``query`` and the row text.

        Args:
            query: Free-text query.
            column: Optional column name; rows whose value in that column does
                not contain ``query`` (case-insensitive, literal match) have
                their score halved. ``None`` or ``"All Fields"`` disables it.
            top_k: Maximum number of results; values below 1 yield no results.

        Returns:
            A list of dicts, best match first, each holding ``relevance_score``
            plus every column of the matching row.
        """
        query_embedding = self.text_model.encode([query])[0]
        similarities = cosine_similarity([query_embedding], self.text_embeds)[0]

        # Column filtering
        if column and column in self.dataset.columns and column != "All Fields":
            # regex=False: the query is user text, not a pattern ("(" must not raise).
            mask = self.dataset[column].astype(str).str.contains(
                query, case=False, regex=False
            ).to_numpy()
            similarities[~mask] *= 0.5

        # Clamp so that 0/negative top_k cannot turn the slice into "everything".
        top_k = max(0, min(top_k, len(similarities)))
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [
            {
                'relevance_score': float(similarities[idx]),
                **self.dataset.iloc[idx].to_dict()
            }
            for idx in top_indices
        ]
153
 
154
def render_video_result(result):
    """Draw one search result in a two-column Streamlit layout.

    The wider left column shows the title (when present), description, time
    range and every remaining field; the right column shows the relevance
    score and, when ``video_url`` or ``youtube_id`` is present, a player.

    Args:
        result: Mapping for one hit; must contain ``relevance_score``.
    """
    # Fields rendered explicitly (or deliberately hidden) and therefore
    # excluded from the generic metadata listing.
    already_shown = ['title', 'description', 'start_time', 'end_time', 'duration',
                     'relevance_score', 'video_id', '_config', '_split']

    details_col, media_col = st.columns([2, 1])

    with details_col:
        if 'title' in result:
            st.markdown(f"**Title:** {result['title']}")
        st.markdown("**Description:**")
        st.write(result.get('description', 'No description available'))

        # Timing: end falls back to the clip duration, then to 0.
        start_time = result.get('start_time', 0)
        end_time = result.get('end_time', result.get('duration', 0))
        st.markdown(f"**Time Range:** {start_time}s - {end_time}s")

        for field, content in result.items():
            if field in already_shown:
                continue
            st.markdown(f"**{field.replace('_', ' ').title()}:** {content}")

    with media_col:
        st.markdown(f"**Relevance Score:** {result['relevance_score']:.2%}")

        # A direct URL wins over a YouTube link built from the id.
        if 'video_url' in result:
            video_url = result['video_url']
        elif 'youtube_id' in result:
            video_url = f"https://youtube.com/watch?v={result['youtube_id']}&t={start_time}"
        else:
            video_url = None

        if video_url:
            st.video(video_url)
187
-
188
- def main():
189
- st.title("πŸŽ₯ Video Dataset Search")
190
-
191
- # Get HF token from secrets or user input
192
- if not st.session_state['hf_token']:
193
- st.session_state['hf_token'] = HF_KEY
194
-
195
- if not st.session_state['hf_token']:
196
- hf_token = st.text_input("Enter your Hugging Face API token:", type="password")
197
- if hf_token:
198
- st.session_state['hf_token'] = hf_token
199
 
200
- if not st.session_state.get('hf_token'):
201
- st.warning("Please provide a Hugging Face API token to access the dataset.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # Initialize search class
205
- search = ParquetVideoSearch(st.session_state['hf_token'])
206
 
207
  # Create tabs
208
- tab1, tab2 = st.tabs(["πŸ” Video Search", "πŸ“Š Dataset Info"])
209
 
210
  # ---- Tab 1: Video Search ----
211
  with tab1:
212
  st.subheader("Search Videos")
213
  col1, col2 = st.columns([3, 1])
214
-
215
  with col1:
216
  query = st.text_input("Enter your search query:",
217
- value="" if st.session_state['initial_search_done'] else "")
218
  with col2:
219
  search_column = st.selectbox("Search in field:",
220
  ["All Fields"] + st.session_state['search_columns'])
@@ -225,10 +390,9 @@ def main():
225
  with col4:
226
  search_button = st.button("πŸ” Search")
227
 
228
- if search_button and query:
229
  st.session_state['initial_search_done'] = True
230
  selected_column = None if search_column == "All Fields" else search_column
231
-
232
  with st.spinner("Searching..."):
233
  results = search.search(query, selected_column, num_results)
234
 
@@ -239,35 +403,151 @@ def main():
239
  })
240
 
241
  for i, result in enumerate(results, 1):
242
- with st.expander(
243
- f"Result {i}: {result.get('title', result.get('description', 'No title'))[:100]}...",
244
- expanded=(i==1)
245
- ):
246
- render_video_result(result)
247
-
248
- # ---- Tab 2: Dataset Info ----
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  with tab2:
250
- st.subheader("Dataset Information")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- # Show available splits
253
- splits = fetch_dataset_splits_auth(search.dataset_id, st.session_state['hf_token'])
254
- if splits:
255
- st.write("### Available Splits")
256
- for split in splits:
257
- st.write(f"- {split['split']}: {split.get('num_rows', 'unknown')} rows")
 
 
 
 
 
258
 
259
- # Show dataset statistics
260
- st.write("### Dataset Statistics")
261
- st.write(f"- Loaded rows: {len(search.dataset)}")
262
- st.write(f"- Available columns: {', '.join(search.dataset.columns)}")
263
 
264
- # Show sample data
265
- st.write("### Sample Data")
266
- st.dataframe(search.dataset.head())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
  # Sidebar
269
  with st.sidebar:
270
- st.subheader("βš™οΈ Search History")
271
  if st.button("πŸ—‘οΈ Clear History"):
272
  st.session_state['search_history'] = []
273
  st.experimental_rerun()
@@ -276,7 +556,12 @@ def main():
276
  for entry in reversed(st.session_state['search_history'][-5:]):
277
  with st.expander(f"{entry['timestamp']}: {entry['query']}"):
278
  for i, result in enumerate(entry['results'], 1):
279
- st.write(f"{i}. {result.get('title', result.get('description', 'No title'))[:100]}...")
 
 
 
 
 
280
 
281
  if __name__ == "__main__":
282
  main()
 
3
  import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ import torch
7
+ import json
8
  import os
9
+ import glob
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+ import edge_tts
13
+ import asyncio
14
+ import base64
15
+ import requests
16
+ from collections import defaultdict
17
+ from audio_recorder_streamlit import audio_recorder
18
+ import streamlit.components.v1 as components
19
+ from urllib.parse import quote
20
+ from xml.etree import ElementTree as ET
21
 
22
# Seed st.session_state once; keys that already exist keep their current value
# so user state survives Streamlit reruns.
for _state_key, _state_default in (
    ('search_history', []),
    ('last_voice_input', ""),
    ('transcript_history', []),
    ('should_rerun', False),
    ('search_columns', []),
    ('initial_search_done', False),
    ('tts_voice', "en-US-AriaNeural"),
    ('arxiv_last_query', ""),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
39
 
40
def fetch_dataset_info(dataset_id):
    """Request the Hub API record for a dataset (no authentication).

    Args:
        dataset_id: Dataset identifier interpolated into the API URL.

    Returns:
        The decoded JSON body when the server answers 200; otherwise None.
        Any exception is reported through ``st.warning`` and also yields None.
    """
    endpoint = f"https://huggingface.co/api/datasets/{dataset_id}"
    payload = None
    try:
        reply = requests.get(endpoint, timeout=30)
        if reply.status_code == 200:
            payload = reply.json()
    except Exception as e:
        st.warning(f"Error fetching dataset info: {e}")
    return payload
50
 
51
def fetch_dataset_rows(dataset_id, config="default", split="train", max_rows=100):
    """Fetch the leading rows of one config/split from the datasets-server.

    Calls the ``/first-rows`` endpoint, unwraps each entry's ``'row'`` record,
    converts string-encoded embedding fields (``"[0.1, 0.2]"``) into lists of
    floats, and tags every row with ``_config`` and ``_split``.

    Args:
        dataset_id: Dataset identifier.
        config: Configuration name.
        split: Split name.
        max_rows: Upper bound on the number of rows returned; ``None`` means
            no bound.

    Returns:
        A list of row dicts; empty on a non-200 response, when the body has
        no ``'rows'``, or after an exception (reported through ``st.warning``).
    """
    embedding_terms = ('embed', 'vector', 'encoding')
    try:
        # ``params`` lets requests URL-encode the values (ids contain "/").
        response = requests.get(
            "https://datasets-server.huggingface.co/first-rows",
            params={"dataset": dataset_id, "config": config, "split": split},
            timeout=30,
        )
        if response.status_code != 200:
            return []

        processed_rows = []
        for row_data in response.json().get('rows', [])[:max_rows]:
            row = row_data.get('row', row_data)
            for key in row:
                value = row[key]
                if not isinstance(value, str):
                    continue
                if not any(term in key.lower() for term in embedding_terms):
                    continue
                try:
                    row[key] = [float(x) for x in value.strip('[]').split(',') if x.strip()]
                except ValueError:
                    # Not a numeric list after all; keep the original string.
                    continue
            row['_config'] = config
            row['_split'] = split
            processed_rows.append(row)
        return processed_rows
    except Exception as e:
        st.warning(f"Error fetching rows for {config}/{split}: {e}")
    return []
77
 
78
def search_dataset(dataset_id, search_text, include_configs=None, include_splits=None):
    """Substring-search the leading rows of every config/split of a dataset.

    For each configuration the available splits are looked up, the leading
    rows of each split are fetched with ``fetch_dataset_rows``, and rows whose
    scalar fields contain ``search_text`` (case-insensitive) are kept.

    Args:
        dataset_id (str): The Hugging Face dataset ID.
        search_text (str): Text to look for; an empty string matches every row.
        include_configs (list): Configs to search, or None/empty to use the
            ``config_names`` entry of the dataset info (``['default']`` when
            that entry is missing).
        include_splits (list): Splits to search, or None/empty for all.

    Returns:
        tuple: ``(DataFrame of matching rows, configs searched, splits seen)``.
        The frame is sorted by ``_relevance_score`` (occurrence count of the
        search text, 0 for an empty query), best first; ties keep fetch order.
        When the dataset info cannot be fetched the result is
        ``(empty DataFrame, [], [])``.
    """
    dataset_info = fetch_dataset_info(dataset_id)
    if not dataset_info:
        return pd.DataFrame(), [], []

    configs = include_configs if include_configs else dataset_info.get('config_names', ['default'])
    needle = (search_text or "").lower()  # lower-cased once, not once per row
    all_rows = []
    available_splits = set()

    for config in configs:
        try:
            splits_response = requests.get(
                "https://datasets-server.huggingface.co/splits",
                params={"dataset": dataset_id, "config": config},
                timeout=30,
            )
            if splits_response.status_code != 200:
                continue

            splits = [entry['split'] for entry in splits_response.json().get('splits', [])]
            if not splits:
                splits = ['train']  # fallback to train if no splits found

            if include_splits:
                splits = [s for s in splits if s in include_splits]

            available_splits.update(splits)

            for split in splits:
                for row in fetch_dataset_rows(dataset_id, config, split):
                    # Only scalar fields take part in the text match.
                    text_content = ' '.join(
                        str(v) for v in row.values() if isinstance(v, (str, int, float))
                    )
                    haystack = text_content.lower()
                    if needle in haystack:
                        row['_matched_text'] = text_content
                        # str.count("") is len+1, which is not a relevance; use 0.
                        row['_relevance_score'] = haystack.count(needle) if needle else 0
                        all_rows.append(row)

        except Exception as e:
            st.warning(f"Error processing config {config}: {e}")
            continue

    if all_rows:
        df = pd.DataFrame(all_rows)
        # mergesort is stable, so equally scored rows stay in fetch order.
        df = df.sort_values('_relevance_score', ascending=False, kind='mergesort')
        return df, configs, list(available_splits)

    return pd.DataFrame(), configs, list(available_splits)
141
 
142
class VideoSearch:
    """Embedding-based search over rows fetched from a Hugging Face dataset.

    On construction the dataset rows are fetched through ``search_dataset``
    (falling back to a one-row example), embedding matrices are prepared, and
    queries are then scored with cosine similarity against those matrices.
    """

    # Raw vector columns: never offered as a search field nor copied to results.
    _EMBED_COLUMNS = ('video_embed', 'description_embed', 'audio_embed')
    # Substrings that mark a column as holding embeddings.
    _EMBED_TERMS = ('embed', 'vector', 'encoding')
    # Width of the placeholder matrices used when no embedding column exists.
    _FALLBACK_DIM = 384

    def __init__(self):
        """Load the sentence encoder and the dataset."""
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.dataset_id = "omegalabsinc/omega-multimodal"
        self.load_dataset()

    def fetch_dataset_rows(self):
        """Return the dataset rows as a DataFrame.

        Uses ``search_dataset`` with an empty query across all configs and
        splits. When rows come back, the non-embedding, non-internal column
        names are published in ``st.session_state['search_columns']``. On an
        empty result or any exception the example frame is returned instead.
        """
        try:
            df, _, _ = search_dataset(
                self.dataset_id,
                "",  # empty search text to get all data
                include_configs=None,  # all configs
                include_splits=None  # all splits
            )

            if not df.empty:
                st.session_state['search_columns'] = [
                    col for col in df.columns
                    if col not in self._EMBED_COLUMNS and not str(col).startswith('_')
                ]
                return df

            return self.load_example_data()

        except Exception as e:
            st.warning(f"Error loading dataset: {e}")
            return self.load_example_data()

    def load_example_data(self):
        """Load example data as fallback"""
        example_data = [
            {
                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
                "youtube_id": "IO-vwtyicn4",
                "description": "This video shows a close-up of an ancient text carved into a surface.",
                "views": 45489,
                "start_time": 1452,
                "end_time": 1458,
                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
            }
        ]
        return pd.DataFrame(example_data)

    def _parse_embedding(self, value):
        """Return ``value`` as a list of floats, or None if it is not a vector."""
        try:
            if isinstance(value, str):
                return [float(x) for x in value.strip('[]').split(',') if x.strip()]
            if isinstance(value, (list, tuple, np.ndarray)):
                return [float(x) for x in value]
        except (TypeError, ValueError):
            return None
        return None

    def _stack_embeddings(self, values):
        """Build an (n_rows, dim) matrix from a column of embedding cells.

        Cells that are missing, unparsable or of a different length become
        zero rows, so row ``i`` of the matrix always belongs to dataset row
        ``i``. Returns None when no cell holds a usable vector.
        """
        parsed = [self._parse_embedding(value) for value in values]
        dim = next((len(vec) for vec in parsed if vec), 0)
        if not dim:
            return None
        matrix = np.zeros((len(parsed), dim))
        for position, vec in enumerate(parsed):
            if vec and len(vec) == dim:
                matrix[position] = vec
        return matrix

    def _encode_descriptions(self, force):
        """Encode the ``description`` column with the query encoder.

        Returns None when there is no encoder, no ``description`` column, no
        rows, or (unless ``force``) when ``self.text_embeds`` already has the
        encoder's output width.
        """
        model = getattr(self, 'text_model', None)
        if model is None or 'description' not in self.dataset.columns or len(self.dataset) == 0:
            return None
        get_dim = getattr(model, 'get_sentence_embedding_dimension', None)
        if not force and get_dim is not None and self.text_embeds.shape[1] == get_dim():
            return None
        try:
            texts = self.dataset['description'].fillna('').astype(str).tolist()
            return np.asarray(model.encode(texts))
        except Exception as e:
            st.warning(f"Error encoding descriptions: {e}")
            return None

    def prepare_features(self):
        """Set ``self.video_embeds`` and ``self.text_embeds`` for searching.

        Every column whose name contains 'embed', 'vector' or 'encoding' is
        stacked into a row-aligned matrix. ``video_embed`` (else the first such
        column) becomes the video matrix and ``description_embed`` (else the
        video matrix) the text matrix. With no usable column at all, random
        placeholder matrices of width 384 are stored. Finally, when the text
        matrix cannot be compared with query vectors (different width, or only
        placeholders exist) the descriptions are encoded with the query
        encoder so that ``search`` produces meaningful scores.
        """
        embeddings = {}
        for col in self.dataset.columns:
            if any(term in str(col).lower() for term in self._EMBED_TERMS):
                matrix = self._stack_embeddings(self.dataset[col])
                if matrix is not None:
                    embeddings[col] = matrix

        if embeddings:
            self.video_embeds = embeddings.get('video_embed', next(iter(embeddings.values())))
            self.text_embeds = embeddings.get('description_embed', self.video_embeds)
        else:
            # Fallback to random embeddings
            num_rows = len(self.dataset)
            self.video_embeds = np.random.randn(num_rows, self._FALLBACK_DIM)
            self.text_embeds = np.random.randn(num_rows, self._FALLBACK_DIM)

        encoded = self._encode_descriptions(force=not embeddings)
        if encoded is not None:
            self.text_embeds = encoded
            if not embeddings:
                # Replace the random placeholder too, so it adds no noise.
                self.video_embeds = encoded

    def load_dataset(self):
        """Fetch the rows into ``self.dataset`` and prepare the embeddings."""
        self.dataset = self.fetch_dataset_rows()
        self.prepare_features()

    def _similarities(self, query_embedding, matrix):
        """Cosine similarity of the query against each row of ``matrix``.

        Returns zeros when the matrix cannot be compared with the query
        (wrong rank, row count or width) instead of raising.
        """
        n_rows = len(self.dataset)
        matrix = np.asarray(matrix)
        query_embedding = np.asarray(query_embedding)
        comparable = (
            n_rows > 0
            and matrix.ndim == 2
            and matrix.shape[0] == n_rows
            and query_embedding.ndim == 1
            and matrix.shape[1] == query_embedding.shape[0]
        )
        if not comparable:
            return np.zeros(n_rows)
        return cosine_similarity([query_embedding], matrix)[0]

    def search(self, query, column=None, top_k=20):
        """Rank dataset rows against ``query``.

        The score is the mean of the cosine similarities against the video
        and text matrices.

        Args:
            query: Free-text query.
            column: Optional column name; rows whose value in that column does
                not contain ``query`` (case-insensitive, literal match) have
                their score halved. ``None`` or ``"All Fields"`` disables it.
            top_k: Maximum number of results, capped at 100; values below 1
                yield no results.

        Returns:
            A list of dicts, best match first, each holding ``relevance_score``
            plus every non-embedding column of the matching row.
        """
        query_embedding = self.text_model.encode([query])[0]
        video_sims = self._similarities(query_embedding, self.video_embeds)
        text_sims = self._similarities(query_embedding, self.text_embeds)
        combined_sims = 0.5 * video_sims + 0.5 * text_sims

        # Column filtering
        if column and column in self.dataset.columns and column != "All Fields":
            # regex=False: the query is user text, not a pattern ("(" must not raise).
            mask = self.dataset[column].astype(str).str.contains(
                query, case=False, regex=False
            ).to_numpy()
            combined_sims[~mask] *= 0.5

        # Clamp so that 0/negative top_k cannot turn the slice into "everything".
        top_k = max(0, min(top_k, 100, len(combined_sims)))
        top_indices = np.argsort(combined_sims)[::-1][:top_k]

        visible_cols = [col for col in self.dataset.columns if col not in self._EMBED_COLUMNS]
        results = []
        for idx in top_indices:
            row = self.dataset.iloc[idx]
            result = {'relevance_score': float(combined_sims[idx])}
            for col in visible_cols:
                result[col] = row[col]
            results.append(result)

        return results
255
 
256
@st.cache_resource
def get_speech_model():
    """Return the ``edge_tts.Communicate`` class itself (not an instance).

    Callers instantiate it with the text and voice to synthesise.
    """
    communicate_cls = edge_tts.Communicate
    return communicate_cls
259
+
260
async def generate_speech(text, voice=None):
    """Synthesise ``text`` to an MP3 file in the working directory.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text produces
            no audio.
        voice: Voice name; defaults to ``st.session_state['tts_voice']``.

    Returns:
        The path of the written ``speech_<timestamp>.mp3`` file, or None when
        there was nothing to say or synthesis failed (the failure is shown
        through ``st.error``).
    """
    if not text or not text.strip():
        return None
    if not voice:
        voice = st.session_state['tts_voice']
    try:
        communicate = get_speech_model()(text, voice)
        # Microseconds in the name keep two clips generated within the same
        # second (e.g. short summary + full audio) from overwriting each other.
        audio_file = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}.mp3"
        await communicate.save(audio_file)
        return audio_file
    except Exception as e:
        st.error(f"Error generating speech: {e}")
        return None
273
+
274
def transcribe_audio(audio_path):
    """Stand-in for speech recognition.

    No transcription is performed: ``audio_path`` is ignored and a fixed
    notice string is returned.
    """
    notice = "ASR not implemented. Integrate a local model or another service here."
    return notice
277
+
278
def show_file_manager():
    """Render the file manager: upload, list, preview and delete files.

    Manages ``*.txt``, ``*.md`` and ``*.mp3`` files in the working directory.
    Project files listed in ``protected`` are never listed, overwritten or
    deleted, because removing them would break the deployed app.
    """
    managed_patterns = ("*.txt", "*.md", "*.mp3")
    protected = {"requirements.txt", "packages.txt", "readme.md"}
    # st.experimental_rerun was replaced by st.rerun; use whichever exists.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun

    def managed_files():
        """Return the manageable files currently on disk."""
        found = []
        for pattern in managed_patterns:
            found.extend(glob.glob(pattern))
        return [f for f in found if os.path.basename(f).lower() not in protected]

    def remove_quietly(path):
        """Delete ``path``; report (rather than raise) if that fails."""
        try:
            os.remove(path)
        except OSError as e:
            st.warning(f"Could not delete {os.path.basename(path)}: {e}")

    st.subheader("📂 File Manager")
    col1, col2 = st.columns(2)

    with col1:
        uploaded_file = st.file_uploader("Upload File", type=['txt', 'md', 'mp3'])
        if uploaded_file is None:
            # Uploader was cleared: allow the same file to be uploaded again.
            st.session_state.pop('_last_saved_upload', None)
        else:
            # basename() drops any directory part a client may have supplied.
            safe_name = os.path.basename(uploaded_file.name)
            payload = uploaded_file.getvalue()
            upload_id = (safe_name, len(payload))
            if not safe_name or safe_name.lower() in protected:
                st.error(f"Cannot upload: {uploaded_file.name}")
            elif st.session_state.get('_last_saved_upload') != upload_id:
                # The uploader keeps returning the file on every rerun, so it
                # is saved once per upload instead of saving and rerunning on
                # each pass, which never terminated.
                with open(safe_name, "wb") as f:
                    f.write(payload)
                st.session_state['_last_saved_upload'] = upload_id
                st.success(f"Uploaded: {safe_name}")

    with col2:
        if st.button("🗑 Clear All Files"):
            for path in managed_files():
                remove_quietly(path)
            st.success("All files cleared!")
            rerun()

    files = managed_files()
    if files:
        st.write("### Existing Files")
        for path in files:
            name = os.path.basename(path)
            with st.expander(f"📄 {name}"):
                if path.endswith('.mp3'):
                    st.audio(path)
                else:
                    # errors='replace' keeps a non-UTF-8 file from crashing the page.
                    with open(path, 'r', encoding='utf-8', errors='replace') as file:
                        # A per-file key avoids duplicate-widget errors when
                        # two files have identical content.
                        st.text_area("Content", file.read(), height=100, key=f"content_{path}")
                if st.button(f"Delete {name}", key=f"del_{path}"):
                    remove_quietly(path)
                    rerun()
310
+
311
def arxiv_search(query, max_results=5):
    """Query the arXiv Atom API and return the top results.

    Args:
        query: Search expression; it is URL-quoted before being sent.
        max_results: Value sent as the ``max_results`` query parameter.

    Returns:
        A list of ``(title, summary, link)`` tuples, where ``link`` is the
        entry's ``text/html`` link or None. The list is empty on a non-200
        response, a network error or an unparsable feed (the last two are
        reported through ``st.warning``).
    """
    base_url = "http://export.arxiv.org/api/query?"
    search_url = base_url + f"search_query={quote(query)}&start=0&max_results={max_results}"
    try:
        # Timeout so a stalled request cannot hang the app.
        r = requests.get(search_url, timeout=30)
    except requests.RequestException as e:
        st.warning(f"Error contacting Arxiv: {e}")
        return []
    if r.status_code != 200:
        return []

    try:
        # Bytes let the XML parser apply the encoding declared in the feed.
        root = ET.fromstring(r.content)
    except ET.ParseError as e:
        st.warning(f"Error parsing Arxiv response: {e}")
        return []

    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    results = []
    for entry in root.findall('atom:entry', ns):
        # findtext() yields '' for a missing element instead of raising on .text.
        # Titles are wrapped across lines in the feed; collapse the whitespace.
        title = ' '.join(entry.findtext('atom:title', default='', namespaces=ns).split())
        summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
        link = next(
            (l.get('href') for l in entry.findall('atom:link', ns) if l.get('type') == 'text/html'),
            None,
        )
        results.append((title, summary, link))
    return results
332
+
333
def perform_arxiv_lookup(q, vocal_summary=True, titles_summary=True, full_audio=False):
    """Search arXiv for ``q``, render the hits and optionally speak them.

    Args:
        q: Query passed to ``arxiv_search`` (five results are requested).
        vocal_summary: When true, play a short spoken introduction.
        titles_summary: Chooses the short clip's content: all titles when
            true, otherwise the first 200 characters of the first summary.
        full_audio: When true, also play a clip reading every title and
            summary.
    """
    papers = arxiv_search(q, max_results=5)
    if not papers:
        st.write("No Arxiv results found.")
        return

    st.markdown(f"**Arxiv Search Results for '{q}':**")
    for rank, paper in enumerate(papers, start=1):
        paper_title, paper_summary, paper_link = paper
        st.markdown(f"**{rank}. {paper_title}**")
        st.write(paper_summary)
        if paper_link:
            st.markdown(f"[View Paper]({paper_link})")

    if vocal_summary:
        intro = f"Here are some Arxiv results for {q}. "
        if titles_summary:
            detail = " Titles: " + ", ".join(paper[0] for paper in papers)
        else:
            # Without titles, read the opening of the first summary only.
            detail = " " + papers[0][1][:200]

        short_clip = asyncio.run(generate_speech(intro + detail))
        if short_clip:
            st.audio(short_clip)

    if full_audio:
        narration = "".join(
            f"Result {rank}: {paper_title}. {paper_summary} "
            for rank, (paper_title, paper_summary, _) in enumerate(papers, start=1)
        )
        full_clip = asyncio.run(generate_speech(narration))
        if full_clip:
            st.write("### Full Audio")
            st.audio(full_clip)
366
+
367
+ def main():
368
+ st.title("πŸŽ₯ Video & Arxiv Search with Voice (No OpenAI/Anthropic)")
369
 
370
  # Initialize search class
371
+ search = VideoSearch()
372
 
373
  # Create tabs
374
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["πŸ” Search", "πŸŽ™οΈ Voice Input", "πŸ“š Arxiv", "πŸ“‚ Files", "πŸ” Advanced Search"])
375
 
376
  # ---- Tab 1: Video Search ----
377
  with tab1:
378
  st.subheader("Search Videos")
379
  col1, col2 = st.columns([3, 1])
 
380
  with col1:
381
  query = st.text_input("Enter your search query:",
382
+ value="ancient" if not st.session_state['initial_search_done'] else "")
383
  with col2:
384
  search_column = st.selectbox("Search in field:",
385
  ["All Fields"] + st.session_state['search_columns'])
 
390
  with col4:
391
  search_button = st.button("πŸ” Search")
392
 
393
+ if (search_button or not st.session_state['initial_search_done']) and query:
394
  st.session_state['initial_search_done'] = True
395
  selected_column = None if search_column == "All Fields" else search_column
 
396
  with st.spinner("Searching..."):
397
  results = search.search(query, selected_column, num_results)
398
 
 
403
  })
404
 
405
  for i, result in enumerate(results, 1):
406
+ with st.expander(f"Result {i}: {result['description'][:100]}...", expanded=(i==1)):
407
+ cols = st.columns([2, 1])
408
+ with cols[0]:
409
+ st.markdown("**Description:**")
410
+ st.write(result['description'])
411
+ st.markdown(f"**Time Range:** {result['start_time']}s - {result['end_time']}s")
412
+ st.markdown(f"**Views:** {result['views']:,}")
413
+
414
+ with cols[1]:
415
+ st.markdown(f"**Relevance Score:** {result['relevance_score']:.2%}")
416
+ if result.get('youtube_id'):
417
+ st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result['start_time']}")
418
+
419
+ if st.button(f"πŸ”Š Audio Summary", key=f"audio_{i}"):
420
+ summary = f"Video summary: {result['description'][:200]}"
421
+ audio_file = asyncio.run(generate_speech(summary))
422
+ if audio_file:
423
+ st.audio(audio_file)
424
+
425
+ # ---- Tab 2: Voice Input ----
426
  with tab2:
427
+ st.subheader("Voice Input")
428
+ st.write("πŸŽ™οΈ Record your voice:")
429
+ audio_bytes = audio_recorder()
430
+ if audio_bytes:
431
+ audio_path = f"temp_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
432
+ with open(audio_path, "wb") as f:
433
+ f.write(audio_bytes)
434
+ st.success("Audio recorded successfully!")
435
+
436
+ voice_query = transcribe_audio(audio_path)
437
+ st.markdown("**Transcribed Text:**")
438
+ st.write(voice_query)
439
+ st.session_state['last_voice_input'] = voice_query
440
+
441
+ if st.button("πŸ” Search from Voice"):
442
+ results = search.search(voice_query, None, 20)
443
+ for i, result in enumerate(results, 1):
444
+ with st.expander(f"Result {i}", expanded=(i==1)):
445
+ st.write(result['description'])
446
+ if result.get('youtube_id'):
447
+ st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}")
448
+
449
+ if os.path.exists(audio_path):
450
+ os.remove(audio_path)
451
+
452
+ # ---- Tab 3: Arxiv Search ----
453
+ with tab3:
454
+ st.subheader("Arxiv Search")
455
+ q = st.text_input("Enter your Arxiv search query:", value=st.session_state['arxiv_last_query'])
456
+ vocal_summary = st.checkbox("πŸŽ™ Short Audio Summary", value=True)
457
+ titles_summary = st.checkbox("πŸ”– Titles Only", value=True)
458
+ full_audio = st.checkbox("πŸ“š Full Audio Results", value=False)
459
 
460
+ if st.button("πŸ” Arxiv Search"):
461
+ st.session_state['arxiv_last_query'] = q
462
+ perform_arxiv_lookup(q, vocal_summary=vocal_summary, titles_summary=titles_summary, full_audio=full_audio)
463
+
464
+ # ---- Tab 4: File Manager ----
465
+ with tab4:
466
+ show_file_manager()
467
+
468
+ # ---- Tab 5: Advanced Dataset Search ----
469
+ with tab5:
470
+ st.subheader("Advanced Dataset Search")
471
 
472
+ # Dataset input
473
+ dataset_id = st.text_input("Dataset ID:", value="omegalabsinc/omega-multimodal")
 
 
474
 
475
+ # Search configuration
476
+ col1, col2 = st.columns([2, 1])
477
+ with col1:
478
+ search_text = st.text_input("Search text:",
479
+ placeholder="Enter text to search across all fields")
480
+
481
+ # Get available configs and splits
482
+ if dataset_id:
483
+ dataset_info = fetch_dataset_info(dataset_id)
484
+ if dataset_info:
485
+ configs = dataset_info.get('config_names', ['default'])
486
+ with col2:
487
+ selected_configs = st.multiselect(
488
+ "Configurations:",
489
+ options=configs,
490
+ default=['default'] if 'default' in configs else None
491
+ )
492
+
493
+ # Fetch available splits
494
+ if selected_configs:
495
+ all_splits = set()
496
+ for config in selected_configs:
497
+ splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}&config={config}"
498
+ try:
499
+ response = requests.get(splits_url, timeout=30)
500
+ if response.status_code == 200:
501
+ splits_data = response.json()
502
+ splits = [split['split'] for split in splits_data.get('splits', [])]
503
+ all_splits.update(splits)
504
+ except Exception as e:
505
+ st.warning(f"Error fetching splits for {config}: {e}")
506
+
507
+ selected_splits = st.multiselect(
508
+ "Splits:",
509
+ options=list(all_splits),
510
+ default=['train'] if 'train' in all_splits else None
511
+ )
512
+
513
+ if st.button("πŸ” Search Dataset"):
514
+ with st.spinner("Searching dataset..."):
515
+ results_df, _, _ = search_dataset(
516
+ dataset_id,
517
+ search_text,
518
+ include_configs=selected_configs,
519
+ include_splits=selected_splits
520
+ )
521
+
522
+ if not results_df.empty:
523
+ st.write(f"Found {len(results_df)} results")
524
+
525
+ # Display results in expandable sections
526
+ for idx, row in results_df.iterrows():
527
+ with st.expander(
528
+ f"Result {idx+1} (Config: {row['_config']}, Split: {row['_split']}, Score: {row['_relevance_score']})"
529
+ ):
530
+ # Display all fields except internal ones
531
+ for col in row.index:
532
+ if not col.startswith('_') and not any(
533
+ term in col.lower()
534
+ for term in ['embed', 'vector', 'encoding']
535
+ ):
536
+ st.write(f"**{col}:** {row[col]}")
537
+
538
+ # Add buttons for audio/video if available
539
+ if 'youtube_id' in row:
540
+ st.video(
541
+ f"https://youtube.com/watch?v={row['youtube_id']}&t={row.get('start_time', 0)}"
542
+ )
543
+ else:
544
+ st.warning("No results found.")
545
+ else:
546
+ st.error("Unable to fetch dataset information.")
547
 
548
  # Sidebar
549
  with st.sidebar:
550
+ st.subheader("βš™οΈ Settings & History")
551
  if st.button("πŸ—‘οΈ Clear History"):
552
  st.session_state['search_history'] = []
553
  st.experimental_rerun()
 
556
  for entry in reversed(st.session_state['search_history'][-5:]):
557
  with st.expander(f"{entry['timestamp']}: {entry['query']}"):
558
  for i, result in enumerate(entry['results'], 1):
559
+ st.write(f"{i}. {result['description'][:100]}...")
560
+
561
+ st.markdown("### Voice Settings")
562
+ st.selectbox("TTS Voice:",
563
+ ["en-US-AriaNeural", "en-US-GuyNeural", "en-GB-SoniaNeural"],
564
+ key="tts_voice")
565
 
566
  if __name__ == "__main__":
567
  main()