awacke1 committed on
Commit
e9907ed
·
verified ·
1 Parent(s): 16ef1bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -318
app.py CHANGED
@@ -22,20 +22,24 @@ import base64
22
  import re
23
 
24
  # -------------------- Configuration & Constants --------------------
25
- # User name assignment
26
  USER_NAMES = [
27
  "Alex", "Jordan", "Taylor", "Morgan", "Rowan", "Avery", "Riley", "Quinn",
28
  "Casey", "Jesse", "Reese", "Skyler", "Ellis", "Devon", "Aubrey", "Kendall",
29
  "Parker", "Dakota", "Sage", "Finley"
30
  ]
31
 
 
 
 
 
 
 
32
  ROWS_PER_PAGE = 100
33
  MIN_SEARCH_SCORE = 0.3
34
  EXACT_MATCH_BOOST = 2.0
35
  SAVED_INPUTS_DIR = "saved_inputs"
36
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
37
 
38
- # -------------------- Session State Initialization --------------------
39
  SESSION_VARS = {
40
  'search_history': [],
41
  'last_voice_input': "",
@@ -53,21 +57,20 @@ SESSION_VARS = {
53
  'nps_last_shown': None,
54
  'old_val': None,
55
  'voice_text': None,
56
- 'user_name': None, # Track user name
57
- 'max_items': 100 # Default max items
 
58
  }
59
 
60
  for var, default in SESSION_VARS.items():
61
  if var not in st.session_state:
62
  st.session_state[var] = default
63
 
64
- # Assign user name if not assigned
65
- if st.session_state['user_name'] is None:
66
- st.session_state['user_name'] = random.choice(USER_NAMES)
67
 
68
- # -------------------- Utility Functions --------------------
69
  def create_voice_component():
70
- """Create the voice input component"""
71
  mycomponent = components.declare_component(
72
  "mycomponent",
73
  path="mycomponent"
@@ -83,7 +86,6 @@ def clean_for_speech(text: str) -> str:
83
  return text
84
 
85
  async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
86
- """Generate audio using Edge TTS"""
87
  text = clean_for_speech(text)
88
  if not text.strip():
89
  return None
@@ -94,68 +96,39 @@ async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=
94
  await communicate.save(out_fn)
95
  return out_fn
96
 
97
- def speak_with_edge_tts(text, voice="en-US-AriaNeural", rate=0, pitch=0):
98
- return asyncio.run(edge_tts_generate_audio(text, voice, rate, pitch))
99
 
100
  def play_and_download_audio(file_path):
101
- """Play and provide download link for audio"""
102
  if file_path and os.path.exists(file_path):
103
  st.audio(file_path)
104
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
105
  st.markdown(dl_link, unsafe_allow_html=True)
106
 
107
- @st.cache_resource
108
- def get_model():
109
- return SentenceTransformer('all-MiniLM-L6-v2')
110
-
111
- @st.cache_data
112
- def load_dataset_page(dataset_id, token, page, rows_per_page):
113
- try:
114
- start_idx = page * rows_per_page
115
- end_idx = start_idx + rows_per_page
116
- dataset = load_dataset(
117
- dataset_id,
118
- token=token,
119
- streaming=False,
120
- split=f'train[{start_idx}:{end_idx}]'
121
- )
122
- return pd.DataFrame(dataset)
123
- except Exception as e:
124
- st.error(f"Error loading page {page}: {str(e)}")
125
- return pd.DataFrame()
126
-
127
- @st.cache_data
128
- def get_dataset_info(dataset_id, token):
129
- try:
130
- dataset = load_dataset(dataset_id, token=token, streaming=True)
131
- return dataset['train'].info
132
- except Exception as e:
133
- st.error(f"Error loading dataset info: {str(e)}")
134
- return None
135
-
136
- def fetch_dataset_info(dataset_id):
137
- info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
138
- try:
139
- response = requests.get(info_url, timeout=30)
140
- if response.status_code == 200:
141
- return response.json()
142
- except Exception as e:
143
- st.warning(f"Error fetching dataset info: {e}")
144
- return None
145
-
146
- def generate_filename(text):
147
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
148
  safe_text = re.sub(r'[^\w\s-]', '', text[:50]).strip().lower()
149
  safe_text = re.sub(r'[-\s]+', '-', safe_text)
150
- return f"{timestamp}_{safe_text}.md"
151
 
152
- def save_input_as_md(text):
153
  if not text.strip():
154
  return
155
- fn = generate_filename(text)
156
  full_path = os.path.join(SAVED_INPUTS_DIR, fn)
157
  with open(full_path, 'w', encoding='utf-8') as f:
158
- f.write(f"# User: {st.session_state['user_name']}\n")
 
 
 
 
 
 
 
 
 
 
 
159
  f.write(f"**Timestamp:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
160
  f.write(text)
161
  return full_path
@@ -164,60 +137,61 @@ def list_saved_inputs():
164
  files = sorted(glob.glob(os.path.join(SAVED_INPUTS_DIR, "*.md")))
165
  return files
166
 
167
- def render_result(result, index=None):
168
- score = result.get('relevance_score', 0)
169
- result_filtered = {k: v for k, v in result.items()
170
- if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
171
-
172
- if 'youtube_id' in result:
173
- st.video(f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}")
174
-
175
- cols = st.columns([2, 1])
176
- with cols[0]:
177
- text_content = []
178
- for key, value in result_filtered.items():
179
- if isinstance(value, (str, int, float)):
180
- st.write(f"**{key}:** {value}")
181
- if isinstance(value, str) and len(value.strip()) > 0:
182
- text_content.append(f"{key}: {value}")
183
-
184
- with cols[1]:
185
- st.metric("Relevance", f"{score:.2%}")
186
-
187
- voices = {
188
- "Aria (US Female)": "en-US-AriaNeural",
189
- "Guy (US Male)": "en-US-GuyNeural",
190
- "Sonia (UK Female)": "en-GB-SoniaNeural",
191
- "Tony (UK Male)": "en-GB-TonyNeural"
192
- }
193
-
194
- # Ensure unique keys by using the index
195
- voice_key = f"voice_{index}" if index is not None else f"voice_{id(result)}"
196
-
197
- selected_voice = st.selectbox(
198
- "Voice:",
199
- list(voices.keys()),
200
- key=voice_key
 
 
 
 
 
 
 
 
 
 
 
201
  )
202
-
203
- read_key = f"read_{voice_key}"
204
- if st.button("πŸ”Š Read", key=read_key):
205
- text_to_read = ". ".join(text_content)
206
- audio_file = speak_with_edge_tts(text_to_read, voices[selected_voice])
207
- if audio_file:
208
- play_and_download_audio(audio_file)
209
 
210
  class FastDatasetSearcher:
211
  def __init__(self, dataset_id="tomg-group-umd/cinepile"):
212
  self.dataset_id = dataset_id
213
  self.text_model = get_model()
214
  self.token = os.environ.get('DATASET_KEY')
215
- if not self.token:
216
- st.error("Please set the DATASET_KEY environment variable")
217
- st.stop()
218
-
219
- if st.session_state['dataset_info'] is None:
220
- st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
221
 
222
  def load_page(self, page=0):
223
  return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
@@ -245,7 +219,6 @@ class FastDatasetSearcher:
245
  text_parts = []
246
  row_matched = False
247
  exact_match = False
248
-
249
  priority_fields = ['description', 'matched_text']
250
  other_fields = [col for col in searchable_cols if col not in priority_fields]
251
 
@@ -271,7 +244,6 @@ class FastDatasetSearcher:
271
  text_parts.append(str(val))
272
 
273
  text = ' '.join(text_parts)
274
-
275
  if text.strip():
276
  text_tokens = set(text.lower().split())
277
  matching_terms = query_terms.intersection(text_tokens)
@@ -303,241 +275,160 @@ class FastDatasetSearcher:
303
  ]
304
 
305
  return filtered_df.sort_values('score', ascending=False)
306
-
307
- except Exception as e:
308
- st.error(f"Search error: {str(e)}")
309
  return df
310
 
311
- # -------------------- Main App --------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  def main():
313
- st.title("πŸŽ₯ Smart Video & Voice Search")
314
-
315
- # Load saved inputs (conversation history)
316
- saved_files = list_saved_inputs()
317
-
318
- # Initialize components
319
- voice_component = create_voice_component()
320
- search = FastDatasetSearcher()
321
-
322
- # Voice input at top level
323
- voice_val = voice_component(my_input_value="Start speaking...")
324
-
325
- # User can override max items
326
  with st.sidebar:
327
- st.write(f"**Current User:** {st.session_state['user_name']}")
328
- st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
329
- st.subheader("πŸ“ Saved Inputs:")
330
- # Show saved md files in order
331
- for fpath in saved_files:
332
- fname = os.path.basename(fpath)
333
- st.write(f"- [{fname}]({fpath})")
334
-
335
- if voice_val:
336
- voice_text = str(voice_val).strip()
337
- edited_input = st.text_area("✏️ Edit Voice Input:", value=voice_text, height=100)
338
 
339
- # Auto-run default True now
340
- run_option = st.selectbox("Select Search Type:",
341
- ["Quick Search", "Deep Search", "Voice Summary"])
342
 
343
- col1, col2 = st.columns(2)
344
- with col1:
345
- autorun = st.checkbox("⚑ Auto-Run", value=True)
346
- with col2:
347
- full_audio = st.checkbox("πŸ”Š Full Audio", value=False)
348
-
349
- input_changed = (voice_text != st.session_state.get('old_val'))
350
 
351
- if autorun and input_changed:
352
- # Save input as md file immediately
353
- saved_path = save_input_as_md(edited_input)
354
- st.session_state['old_val'] = voice_text
355
- with st.spinner("Processing voice input..."):
356
- # Instead of just top 20, show up to max_items in order
357
- if run_option == "Quick Search":
358
- df = search.load_page()
359
- results = search.quick_search(edited_input, df)
360
- # Show results in order, stopping at max_items
361
- shown = 0
362
- for i, result in enumerate(results.iterrows(), 1):
363
- if shown >= st.session_state['max_items']:
364
- break
365
- with st.expander(f"Result {i}", expanded=(i==1)):
366
- render_result(result[1], index=i)
367
- shown += 1
368
-
369
- elif run_option == "Deep Search":
370
- # For deep search, iterate through pages until we hit max_items
371
- results_all = []
372
- page = 0
373
- while len(results_all) < st.session_state['max_items']:
374
- df = search.load_page(page)
375
- if df.empty:
376
- break
377
- these_results = search.quick_search(edited_input, df)
378
- if these_results.empty:
379
- break
380
- results_all.extend(these_results.iterrows())
381
- page += 1
382
-
383
- shown = 0
384
- for i, result in enumerate(results_all, 1):
385
- if shown >= st.session_state['max_items']:
386
- break
387
- with st.expander(f"Result {i}", expanded=(i==1)):
388
- render_result(result[1], index=i)
389
- shown += 1
390
-
391
- elif run_option == "Voice Summary":
392
- audio_file = speak_with_edge_tts(edited_input)
393
- if audio_file:
394
- play_and_download_audio(audio_file)
395
-
396
- elif st.button("πŸ” Search", key="voice_input_search"):
397
- # Manual search trigger
398
- # Save input as md file
399
- saved_path = save_input_as_md(edited_input)
400
- st.session_state['old_val'] = voice_text
401
- with st.spinner("Processing..."):
402
- df = search.load_page()
403
- results = search.quick_search(edited_input, df)
404
- shown = 0
405
- for i, result in enumerate(results.iterrows(), 1):
406
- if shown >= st.session_state['max_items']:
407
- break
408
- with st.expander(f"Result {i}", expanded=(i==1)):
409
- render_result(result[1], index=i)
410
- shown += 1
411
-
412
- # Tabs
413
- tab1, tab2, tab3, tab4 = st.tabs([
414
- "πŸ” Search", "πŸŽ™οΈ Voice", "πŸ’Ύ History", "βš™οΈ Settings"
415
- ])
416
-
417
  with tab1:
418
- st.subheader("πŸ” Search")
419
- col1, col2 = st.columns([3, 1])
420
- with col1:
421
- query = st.text_input("Enter search query:",
422
- value="" if st.session_state['initial_search_done'] == False else "")
423
- with col2:
424
- # Not strictly filtering by column now; user requested just show in order
425
- search_column = st.selectbox("Search in:", ["All Fields"] + st.session_state['search_columns'])
426
-
427
- col3, col4 = st.columns(2)
428
- with col3:
429
- num_results = st.slider("Max results:", 1, 100, 20)
430
- with col4:
431
- search_button = st.button("πŸ” Search", key="main_search_button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
 
433
- if (search_button or not st.session_state['initial_search_done']) and query:
434
- st.session_state['initial_search_done'] = True
435
- selected_column = None if search_column == "All Fields" else search_column
436
-
437
- with st.spinner("Searching..."):
438
  df = search.load_page()
439
  results = search.quick_search(query, df)
440
-
441
  if len(results) > 0:
442
- st.session_state['search_history'].append({
443
- 'query': query,
444
- 'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
445
- 'results': results[:5]
446
- })
447
-
448
  st.write(f"Found {len(results)} results:")
449
  shown = 0
450
  for i, (_, result) in enumerate(results.iterrows(), 1):
451
  if shown >= num_results:
452
  break
453
  with st.expander(f"Result {i}", expanded=(i==1)):
454
- render_result(result, index=i)
 
 
 
455
  shown += 1
456
  else:
457
  st.warning("No matching results found.")
458
-
459
- with tab2:
460
- st.subheader("πŸŽ™οΈ Voice Input")
461
- st.write("Use the voice input above to start speaking, or record a new message:")
462
-
463
- col1, col2 = st.columns(2)
464
- with col1:
465
- if st.button("πŸŽ™οΈ Start New Recording", key="start_recording_button"):
466
- st.session_state['recording'] = True
467
- st.experimental_rerun()
468
- with col2:
469
- if st.button("πŸ›‘ Stop Recording", key="stop_recording_button"):
470
- st.session_state['recording'] = False
471
- st.experimental_rerun()
472
-
473
- if st.session_state.get('recording', False):
474
- voice_component = create_voice_component()
475
- new_val = voice_component(my_input_value="Recording...")
476
- if new_val:
477
- st.text_area("Recorded Text:", value=new_val, height=100)
478
- if st.button("πŸ” Search with Recording", key="recording_search_button"):
479
- # Save this input right away
480
- saved_path = save_input_as_md(new_val)
481
- with st.spinner("Processing recording..."):
482
- df = search.load_page()
483
- results = search.quick_search(new_val, df)
484
- shown = 0
485
- for i, (_, result) in enumerate(results.iterrows(), 1):
486
- if shown >= st.session_state['max_items']:
487
- break
488
- with st.expander(f"Result {i}", expanded=(i==1)):
489
- render_result(result, index=i)
490
- shown += 1
491
-
492
- with tab3:
493
- st.subheader("πŸ’Ύ Search History")
494
- if not st.session_state['search_history']:
495
- st.info("No search history yet. Try searching for something!")
496
- else:
497
- for entry in reversed(st.session_state['search_history']):
498
- with st.expander(f"πŸ•’ {entry['timestamp']} - {entry['query']}", expanded=False):
499
- for i, result in enumerate(entry['results'], 1):
500
- st.write(f"**Result {i}:**")
501
- if isinstance(result, pd.Series):
502
- render_result(result, index=i)
503
- else:
504
- st.write(result)
505
-
506
  with tab4:
507
- st.subheader("βš™οΈ Settings")
508
- st.write("Voice Settings:")
509
- default_voice = st.selectbox(
510
- "Default Voice:",
511
- [
512
- "en-US-AriaNeural",
513
- "en-US-GuyNeural",
514
- "en-GB-SoniaNeural",
515
- "en-GB-TonyNeural"
516
- ],
517
- index=0,
518
- key="default_voice_setting"
519
- )
520
-
521
- st.write("Search Settings:")
522
- st.slider("Minimum Search Score:", 0.0, 1.0, MIN_SEARCH_SCORE, 0.1, key="min_search_score")
523
- st.slider("Exact Match Boost:", 1.0, 3.0, EXACT_MATCH_BOOST, 0.1, key="exact_match_boost")
524
-
525
- if st.button("πŸ—‘οΈ Clear Search History", key="clear_history_button"):
526
  st.session_state['search_history'] = []
 
 
 
527
  st.success("Search history cleared!")
528
- st.experimental_rerun()
529
-
530
- # Sidebar metrics
531
- with st.sidebar:
532
- st.subheader("πŸ“Š Search Metrics")
533
- total_searches = len(st.session_state['search_history'])
534
- st.metric("Total Searches", total_searches)
535
-
536
- if total_searches > 0:
537
- recent_searches = st.session_state['search_history'][-5:]
538
- st.write("Recent Searches:")
539
- for entry in reversed(recent_searches):
540
- st.write(f"πŸ” {entry['query']}")
541
-
542
  if __name__ == "__main__":
543
  main()
 
22
  import re
23
 
24
  # -------------------- Configuration & Constants --------------------
 
25
  USER_NAMES = [
26
  "Alex", "Jordan", "Taylor", "Morgan", "Rowan", "Avery", "Riley", "Quinn",
27
  "Casey", "Jesse", "Reese", "Skyler", "Ellis", "Devon", "Aubrey", "Kendall",
28
  "Parker", "Dakota", "Sage", "Finley"
29
  ]
30
 
31
+ ENGLISH_VOICES = [
32
+ "en-US-AriaNeural", "en-US-GuyNeural", "en-GB-SoniaNeural", "en-GB-TonyNeural",
33
+ "en-US-JennyNeural", "en-US-DavisNeural", "en-GB-LibbyNeural", "en-CA-ClaraNeural",
34
+ "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
35
+ ]
36
+
37
  ROWS_PER_PAGE = 100
38
  MIN_SEARCH_SCORE = 0.3
39
  EXACT_MATCH_BOOST = 2.0
40
  SAVED_INPUTS_DIR = "saved_inputs"
41
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
42
 
 
43
  SESSION_VARS = {
44
  'search_history': [],
45
  'last_voice_input': "",
 
57
  'nps_last_shown': None,
58
  'old_val': None,
59
  'voice_text': None,
60
+ 'user_name': random.choice(USER_NAMES),
61
+ 'max_items': 100,
62
+ 'global_voice': "en-US-AriaNeural" # Default global voice
63
  }
64
 
65
  for var, default in SESSION_VARS.items():
66
  if var not in st.session_state:
67
  st.session_state[var] = default
68
 
69
@st.cache_resource
def get_model():
    """Return the 'all-MiniLM-L6-v2' sentence-transformer model.

    Wrapped in ``st.cache_resource`` so the model object is created through
    Streamlit's resource cache rather than on every call.
    """
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)
72
 
 
73
  def create_voice_component():
 
74
  mycomponent = components.declare_component(
75
  "mycomponent",
76
  path="mycomponent"
 
86
  return text
87
 
88
  async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
 
89
  text = clean_for_speech(text)
90
  if not text.strip():
91
  return None
 
96
  await communicate.save(out_fn)
97
  return out_fn
98
 
99
def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
    """Run the async Edge TTS generator to completion and return its result.

    Rate and pitch are always passed as 0.
    """
    tts_job = edge_tts_generate_audio(text, voice, 0, 0)
    return asyncio.run(tts_job)
101
 
102
  def play_and_download_audio(file_path):
 
103
  if file_path and os.path.exists(file_path):
104
  st.audio(file_path)
105
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
106
  st.markdown(dl_link, unsafe_allow_html=True)
107
 
108
def generate_filename(prefix, text):
    """Build a ``<prefix>_<YYYYmmdd_HHMMSS>_<slug>.md`` file name.

    The slug is derived from the first 50 characters of ``text``: characters
    other than word characters, whitespace and dashes are dropped, the result
    is trimmed and lower-cased, and runs of dashes/whitespace become one dash.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    head = text[:50]
    cleaned = re.sub(r'[^\w\s-]', '', head).strip().lower()
    slug = re.sub(r'[-\s]+', '-', cleaned)
    return f"{prefix}_{stamp}_{slug}.md"
113
 
114
def save_input_as_md(user_name, text, prefix="input"):
    """Write ``text`` to a new markdown file under ``SAVED_INPUTS_DIR``.

    The file starts with a ``# User:`` line and a ``**Timestamp:**`` line,
    followed by a blank line and the text itself.

    Args:
        user_name: Name recorded on the ``# User:`` header line.
        text: Body to store. ``None``, empty, or whitespace-only text is skipped.
        prefix: Leading component of the generated file name.

    Returns:
        The path of the written file, or ``None`` when nothing was written.
    """
    # Guard against None as well as blank strings so callers passing an
    # empty component value do not crash on ``.strip()``.
    if not text or not text.strip():
        return None
    fn = generate_filename(prefix, text)
    full_path = os.path.join(SAVED_INPUTS_DIR, fn)
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(full_path, 'w', encoding='utf-8') as f:
        f.write(f"# User: {user_name}\n")
        f.write(f"**Timestamp:** {stamp}\n\n")
        f.write(text)
    return full_path
124
+
125
def save_response_as_md(user_name, text, prefix="response"):
    """Write a response to a markdown file, using the same layout as inputs.

    Delegates to ``save_input_as_md`` so the two writers cannot drift apart;
    only the default file-name ``prefix`` differs.

    Args:
        user_name: Name recorded on the ``# User:`` header line.
        text: Body to store. ``None``, empty, or whitespace-only text is skipped.
        prefix: Leading component of the generated file name.

    Returns:
        The path of the written file, or ``None`` when nothing was written.
    """
    if not text or not text.strip():
        return None
    return save_input_as_md(user_name, text, prefix=prefix)
 
137
  files = sorted(glob.glob(os.path.join(SAVED_INPUTS_DIR, "*.md")))
138
  return files
139
 
140
def parse_md_file(fpath):
    """Split a saved markdown file into ``(user, timestamp, content)``.

    The ``# User:`` and ``**Timestamp:**`` lines are only recognised in the
    header, i.e. before the first non-blank content line, and only once each.
    Later lines that merely start with the same markers are kept as content
    instead of overwriting the metadata.

    Args:
        fpath: Path of the markdown file to read (UTF-8).

    Returns:
        A ``(user, timestamp, content)`` tuple of strings. Missing header
        fields are returned as empty strings. Each content line is stripped
        and the joined content is stripped as a whole.
    """
    user_marker = "# User:"
    ts_marker = "**Timestamp:**"
    user_line = ""
    ts_line = ""
    content_lines = []
    seen_user = False
    seen_ts = False
    in_header = True
    with open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            if in_header and not seen_user and line.startswith(user_marker):
                user_line = line[len(user_marker):].strip()
                seen_user = True
            elif in_header and not seen_ts and line.startswith(ts_marker):
                ts_line = line[len(ts_marker):].strip()
                seen_ts = True
            else:
                stripped = line.strip()
                if stripped:
                    # First real content line ends the header section.
                    in_header = False
                content_lines.append(stripped)
    content = "\n".join(content_lines).strip()
    return user_line, ts_line, content
156
+
157
def fetch_dataset_info(dataset_id, token):
    """Fetch dataset metadata from the Hugging Face datasets API.

    Args:
        dataset_id: Dataset identifier appended to the API URL.
        token: Accepted for signature compatibility; the request is sent
            without it.

    Returns:
        The decoded JSON body on an HTTP 200 response, otherwise ``None``
        (non-200 status, network failure, or a body that is not valid JSON).
    """
    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
    try:
        response = requests.get(info_url, timeout=30)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: a transport error or undecodable body just
        # means "no info available" to the caller.
        return None
    return None
166
+
167
@st.cache_data
def get_dataset_info(dataset_id, token):
    """Return the ``info`` object of the dataset's ``train`` split.

    The dataset is opened in streaming mode. Any ordinary error while loading
    yields ``None``.

    Args:
        dataset_id: Identifier passed to ``load_dataset``.
        token: Access token passed to ``load_dataset``.
    """
    try:
        dataset = load_dataset(dataset_id, token=token, streaming=True)
        return dataset['train'].info
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt,
        # SystemExit and other BaseException-derived control-flow signals
        # are not swallowed.
        return None
174
+
175
@st.cache_data
def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load one page of the ``train`` split as a DataFrame.

    Args:
        dataset_id: Identifier passed to ``load_dataset``.
        token: Access token passed to ``load_dataset``.
        page: Zero-based page number.
        rows_per_page: Number of rows per page.

    Returns:
        A DataFrame built from rows ``[page * rows_per_page,
        (page + 1) * rows_per_page)`` of the split, or an empty DataFrame
        when loading fails.
    """
    start_idx = page * rows_per_page
    end_idx = start_idx + rows_per_page
    try:
        dataset = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{start_idx}:{end_idx}]'
        )
        return pd.DataFrame(dataset)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt,
        # SystemExit and other BaseException-derived control-flow signals
        # are not swallowed.
        return pd.DataFrame()
 
 
 
 
189
 
190
  class FastDatasetSearcher:
191
  def __init__(self, dataset_id="tomg-group-umd/cinepile"):
192
  self.dataset_id = dataset_id
193
  self.text_model = get_model()
194
  self.token = os.environ.get('DATASET_KEY')
 
 
 
 
 
 
195
 
196
  def load_page(self, page=0):
197
  return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
 
219
  text_parts = []
220
  row_matched = False
221
  exact_match = False
 
222
  priority_fields = ['description', 'matched_text']
223
  other_fields = [col for col in searchable_cols if col not in priority_fields]
224
 
 
244
  text_parts.append(str(val))
245
 
246
  text = ' '.join(text_parts)
 
247
  if text.strip():
248
  text_tokens = set(text.lower().split())
249
  matching_terms = query_terms.intersection(text_tokens)
 
275
  ]
276
 
277
  return filtered_df.sort_values('score', ascending=False)
278
+ except:
 
 
279
  return df
280
 
281
def play_text(text):
    """Synthesize ``text`` with the session's global voice and render it.

    Falls back to "en-US-AriaNeural" when no global voice is stored in the
    session state. Nothing is rendered when no audio file is produced.
    """
    fallback_voice = "en-US-AriaNeural"
    chosen_voice = st.session_state.get('global_voice', fallback_voice)
    audio_path = speak_with_edge_tts(text, voice=chosen_voice)
    if not audio_path:
        return
    play_and_download_audio(audio_path)
286
+
287
def arxiv_search(query, max_results=3):
    """Query the arXiv Atom API and return ``(title, short_summary)`` tuples.

    Args:
        query: Free-text search string sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request.

    Returns:
        A list of ``(title, summary)`` tuples, each summary cut to 300
        characters (with a trailing ``...`` only when it was actually cut).
        An empty list is returned when the request fails, the status is not
        200, or the response body is not parseable XML.
    """
    # Imported locally so this function does not depend on a module-level
    # ``ET`` alias being present.
    import xml.etree.ElementTree as ET

    base_url = "http://export.arxiv.org/api/query"
    params = {
        # requests URL-encodes parameter values itself; replacing spaces with
        # '+' beforehand would be sent as a literal '%2B'.
        'search_query': query,
        'start': 0,
        'max_results': max_results,
    }
    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError:
        return []

    ns = {"a": "http://www.w3.org/2005/Atom"}
    results = []
    for entry in root.findall('a:entry', ns):
        # findtext tolerates a missing element, unlike ``find(...).text``.
        title = (entry.findtext('a:title', default="", namespaces=ns) or "").strip()
        summary = (entry.findtext('a:summary', default="", namespaces=ns) or "").strip()
        summary_short = summary[:300] + "..." if len(summary) > 300 else summary
        results.append((title, summary_short))
    return results
310
+
311
def summarize_arxiv_results(results):
    """Format ``(title, summary)`` pairs as numbered text blocks.

    Each block reads ``Result <n>: <title>``, a newline, the summary and a
    trailing newline; blocks are separated by two further newlines.
    """
    blocks = [
        f"Result {position}: {entry[0]}\n{entry[1]}\n"
        for position, entry in enumerate(results, start=1)
    ]
    separator = "\n\n"
    return separator.join(blocks)
317
+
318
  def main():
319
+ st.title("πŸŽ™οΈ Voice Chat & Search")
320
+
321
+ # Sidebar
 
 
 
 
 
 
 
 
 
 
322
  with st.sidebar:
323
+ # Editable user name
324
+ st.session_state['user_name'] = st.text_input("Current User:", value=st.session_state['user_name'])
 
 
 
 
 
 
 
 
 
325
 
326
+ # Global voice selection
327
+ st.session_state['global_voice'] = st.selectbox("Select Global Voice:", ENGLISH_VOICES, index=0)
 
328
 
329
+ st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
 
 
 
 
 
 
330
 
331
+ st.subheader("πŸ“ Saved Inputs & Responses")
332
+ saved_files = list_saved_inputs()
333
+ for fpath in saved_files:
334
+ user, ts, content = parse_md_file(fpath)
335
+ fname = os.path.basename(fpath)
336
+ st.write(f"- {fname} (User: {user})")
337
+
338
+ # Create voice component for input
339
+ voice_component = create_voice_component()
340
+ voice_val = voice_component(my_input_value="Start speaking...")
341
+
342
+ # Tabs: Voice Chat History, Arxiv Search, Dataset Search, Settings
343
+ tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])
344
+
345
+ # ------------------ Voice Chat History -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  with tab1:
347
+ st.subheader("Voice Chat History")
348
+ # List saved inputs and responses and allow playing them
349
+ files = list_saved_inputs()
350
+ for fpath in reversed(files):
351
+ user, ts, content = parse_md_file(fpath)
352
+ with st.expander(f"{ts} - {user}", expanded=False):
353
+ st.write(content)
354
+ if st.button("πŸ”Š Read Aloud", key=f"read_{fpath}"):
355
+ play_text(content)
356
+
357
+ # ------------------ ArXiv Search -------------------------
358
+ with tab2:
359
+ st.subheader("ArXiv Search")
360
+ # If we have a voice_val and autorun with ArXiv chosen:
361
+ edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
362
+ autorun = st.checkbox("⚑ Auto-Run", value=True)
363
+ run_arxiv = st.button("πŸ” ArXiv Search")
364
+
365
+ input_changed = (edited_input != st.session_state.get('old_val'))
366
+ if autorun and input_changed and edited_input.strip():
367
+ st.session_state['old_val'] = edited_input
368
+ # Save user input
369
+ save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
370
+ with st.spinner("Searching ArXiv..."):
371
+ results = arxiv_search(edited_input)
372
+ if results:
373
+ summary = summarize_arxiv_results(results)
374
+ # Save response
375
+ save_response_as_md(st.session_state['user_name'], summary, prefix="response")
376
+ st.write(summary)
377
+ # Autoplay TTS
378
+ play_text(summary)
379
+ else:
380
+ st.warning("No results found on ArXiv.")
381
+
382
+ if run_arxiv and edited_input.strip():
383
+ # Manual trigger
384
+ save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
385
+ with st.spinner("Searching ArXiv..."):
386
+ results = arxiv_search(edited_input)
387
+ if results:
388
+ summary = summarize_arxiv_results(results)
389
+ save_response_as_md(st.session_state['user_name'], summary, prefix="response")
390
+ st.write(summary)
391
+ play_text(summary)
392
+ else:
393
+ st.warning("No results found on ArXiv.")
394
+
395
+ # ------------------ Dataset Search -------------------------
396
+ with tab3:
397
+ st.subheader("Dataset Search")
398
+ search = FastDatasetSearcher()
399
+ query = st.text_input("Enter dataset search query:")
400
+ run_ds_search = st.button("Search Dataset")
401
+ num_results = st.slider("Max results:", 1, 100, 20)
402
 
403
+ if run_ds_search and query.strip():
404
+ with st.spinner("Searching dataset..."):
 
 
 
405
  df = search.load_page()
406
  results = search.quick_search(query, df)
 
407
  if len(results) > 0:
 
 
 
 
 
 
408
  st.write(f"Found {len(results)} results:")
409
  shown = 0
410
  for i, (_, result) in enumerate(results.iterrows(), 1):
411
  if shown >= num_results:
412
  break
413
  with st.expander(f"Result {i}", expanded=(i==1)):
414
+ # Just print result keys/values here
415
+ for k, v in result.items():
416
+ if k not in ['score', 'matched']:
417
+ st.write(f"**{k}:** {v}")
418
  shown += 1
419
  else:
420
  st.warning("No matching results found.")
421
+
422
+ # ------------------ Settings Tab -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  with tab4:
424
+ st.subheader("Settings")
425
+ st.write("Adjust voice and search parameters in the sidebar.")
426
+ if st.button("πŸ—‘οΈ Clear Search History"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  st.session_state['search_history'] = []
428
+ # Optionally delete files:
429
+ # for fpath in list_saved_inputs():
430
+ # os.remove(fpath)
431
  st.success("Search history cleared!")
432
+
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  if __name__ == "__main__":
434
  main()