awacke1 commited on
Commit
24ca3ed
·
verified ·
1 Parent(s): 88675e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -53
app.py CHANGED
@@ -1,9 +1,6 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- from sentence_transformers import SentenceTransformer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
- import torch
7
  import json
8
  import os
9
  import glob
@@ -20,7 +17,6 @@ from xml.etree import ElementTree as ET
20
  from datasets import load_dataset
21
 
22
  # -------------------- Configuration & Constants --------------------
23
- # Exactly 11 user names and 11 voices
24
  USER_NAMES = [
25
  "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
26
  ]
@@ -35,8 +31,6 @@ ENGLISH_VOICES = [
35
  USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
36
 
37
  ROWS_PER_PAGE = 100
38
- MIN_SEARCH_SCORE = 0.3
39
- EXACT_MATCH_BOOST = 2.0
40
  SAVED_INPUTS_DIR = "saved_inputs"
41
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
42
 
@@ -59,17 +53,13 @@ SESSION_VARS = {
59
  'user_name': random.choice(USER_NAMES),
60
  'max_items': 100,
61
  'global_voice': "en-US-AriaNeural",
62
- 'last_arxiv_input': None # To avoid double-running ArXiv search
63
  }
64
 
65
  for var, default in SESSION_VARS.items():
66
  if var not in st.session_state:
67
  st.session_state[var] = default
68
 
69
- @st.cache_resource
70
- def get_model():
71
- return SentenceTransformer('all-MiniLM-L6-v2')
72
-
73
  def create_voice_component():
74
  mycomponent = components.declare_component(
75
  "mycomponent",
@@ -178,14 +168,12 @@ def summarize_arxiv_results(results):
178
  lines.append(f"Result {i}: {title}\n{summary}\n")
179
  return "\n\n".join(lines)
180
 
181
- # Simple dataset search: text-based substring search
182
  def simple_dataset_search(query, df):
183
  if df.empty or not query.strip():
184
  return pd.DataFrame()
185
  query_terms = query.lower().split()
186
  matches = []
187
  for idx, row in df.iterrows():
188
- # Combine all text fields into one string
189
  text_parts = []
190
  for col in df.columns:
191
  val = row[col]
@@ -194,13 +182,14 @@ def simple_dataset_search(query, df):
194
  elif isinstance(val, (int, float)):
195
  text_parts.append(str(val))
196
  full_text = " ".join(text_parts)
197
- # Check if any query term is in full_text
198
  if any(qt in full_text for qt in query_terms):
199
  matches.append(row)
200
  if matches:
201
  return pd.DataFrame(matches)
202
  return pd.DataFrame()
203
 
 
 
204
  @st.cache_data
205
  def load_dataset_page(dataset_id, token, page, rows_per_page):
206
  try:
@@ -261,34 +250,33 @@ def main():
261
  conversation = []
262
  for fpath in files:
263
  user, ts, content = parse_md_file(fpath)
264
- conversation.append((user, ts, content))
265
- for user, ts, content in reversed(conversation):
 
 
266
  with st.expander(f"{ts} - {user}", expanded=False):
267
  st.write(content)
268
- if st.button(f"🔊 Read Aloud {ts}-{user}", key=f"read_{fpath}"):
 
269
  voice = USER_VOICES.get(user, "en-US-AriaNeural")
270
  audio_file = speak_with_edge_tts(content, voice=voice)
271
  if audio_file:
272
  play_and_download_audio(audio_file)
273
 
274
  # Read entire conversation
275
- if st.button("📜 Read Conversation"):
276
- # Sort by timestamp to ensure chronological order
277
- # Already in order because files is sorted, but let's rely on chronological order:
278
- # They are sorted ascending, so conversation is appended ascending.
279
- # It's safe to assume files list is chronological by filename.
280
  mp3_files = []
281
- for user, ts, content in conversation:
282
  voice = USER_VOICES.get(user, "en-US-AriaNeural")
283
  audio_file = speak_with_edge_tts(content, voice=voice)
284
  if audio_file:
285
  mp3_files.append(audio_file)
286
- # Show each line's MP3
287
  st.write(f"**{user} ({ts}):**")
288
  play_and_download_audio(audio_file)
289
 
290
  if mp3_files:
291
- # Concatenate all mp3 files into one
292
  combined_file = f"full_conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
293
  concatenate_mp3(mp3_files, combined_file)
294
  st.write("**Full Conversation Audio:**")
@@ -299,50 +287,43 @@ def main():
299
  st.subheader("ArXiv Search")
300
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
301
  autorun = st.checkbox("⚡ Auto-Run", value=True)
302
- run_arxiv = st.button("πŸ” ArXiv Search")
303
 
304
  input_changed = (edited_input != st.session_state.get('old_val'))
305
- # Only run once:
306
- # Conditions to run ArXiv search:
307
- # - If autorun and input_changed and edited_input non-empty
308
- # - Or if run_arxiv button is pressed and edited_input non-empty
309
  should_run_arxiv = False
310
  if autorun and input_changed and edited_input.strip():
311
  should_run_arxiv = True
312
  if run_arxiv and edited_input.strip():
313
  should_run_arxiv = True
314
 
315
- if should_run_arxiv:
316
  st.session_state['old_val'] = edited_input
317
- # Avoid double-running by checking if last_arxiv_input is same
318
- if st.session_state['last_arxiv_input'] != edited_input:
319
- st.session_state['last_arxiv_input'] = edited_input
320
- save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
321
- with st.spinner("Searching ArXiv..."):
322
- results = arxiv_search(edited_input)
323
- if results:
324
- summary = summarize_arxiv_results(results)
325
- save_response_as_md(st.session_state['user_name'], summary, prefix="response")
326
- st.write(summary)
327
- # Play summary aloud
328
- voice = USER_VOICES.get(st.session_state['user_name'], "en-US-AriaNeural")
329
- audio_file = speak_with_edge_tts(summary, voice=voice)
330
- if audio_file:
331
- play_and_download_audio(audio_file)
332
- else:
333
- st.warning("No results found on ArXiv.")
334
 
335
  # ------------------ Dataset Search -------------------------
336
  with tab3:
337
  st.subheader("Dataset Search")
338
  ds_searcher = SimpleDatasetSearcher()
339
  query = st.text_input("Enter dataset search query:")
340
- run_ds_search = st.button("Search Dataset")
341
- num_results = st.slider("Max results:", 1, 100, 20)
342
 
343
  if run_ds_search and query.strip():
344
  with st.spinner("Searching dataset..."):
345
- # For simplicity, just load first page
346
  df = ds_searcher.load_page(0)
347
  results = simple_dataset_search(query, df)
348
  if not results.empty:
@@ -361,8 +342,7 @@ def main():
361
  # ------------------ Settings Tab -------------------------
362
  with tab4:
363
  st.subheader("Settings")
364
- # Clear search history: deletes all md files and clears session
365
- if st.button("πŸ—‘οΈ Clear Search History"):
366
  # Delete all files
367
  for fpath in list_saved_inputs():
368
  os.remove(fpath)
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  import json
5
  import os
6
  import glob
 
17
  from datasets import load_dataset
18
 
19
  # -------------------- Configuration & Constants --------------------
 
20
  USER_NAMES = [
21
  "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
22
  ]
 
31
  USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
32
 
33
  ROWS_PER_PAGE = 100
 
 
34
  SAVED_INPUTS_DIR = "saved_inputs"
35
  os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
36
 
 
53
  'user_name': random.choice(USER_NAMES),
54
  'max_items': 100,
55
  'global_voice': "en-US-AriaNeural",
56
+ 'last_arxiv_input': None
57
  }
58
 
59
  for var, default in SESSION_VARS.items():
60
  if var not in st.session_state:
61
  st.session_state[var] = default
62
 
 
 
 
 
63
  def create_voice_component():
64
  mycomponent = components.declare_component(
65
  "mycomponent",
 
168
  lines.append(f"Result {i}: {title}\n{summary}\n")
169
  return "\n\n".join(lines)
170
 
 
171
  def simple_dataset_search(query, df):
172
  if df.empty or not query.strip():
173
  return pd.DataFrame()
174
  query_terms = query.lower().split()
175
  matches = []
176
  for idx, row in df.iterrows():
 
177
  text_parts = []
178
  for col in df.columns:
179
  val = row[col]
 
182
  elif isinstance(val, (int, float)):
183
  text_parts.append(str(val))
184
  full_text = " ".join(text_parts)
 
185
  if any(qt in full_text for qt in query_terms):
186
  matches.append(row)
187
  if matches:
188
  return pd.DataFrame(matches)
189
  return pd.DataFrame()
190
 
191
+ from datasets import load_dataset
192
+
193
  @st.cache_data
194
  def load_dataset_page(dataset_id, token, page, rows_per_page):
195
  try:
 
250
  conversation = []
251
  for fpath in files:
252
  user, ts, content = parse_md_file(fpath)
253
+ conversation.append((user, ts, content, fpath))
254
+
255
+ # Enumerate to ensure unique keys
256
+ for i, (user, ts, content, fpath) in enumerate(reversed(conversation), start=1):
257
  with st.expander(f"{ts} - {user}", expanded=False):
258
  st.write(content)
259
+ # Make button key unique by including i
260
+ if st.button(f"🔊 Read Aloud {ts}-{user}", key=f"read_{i}_{fpath}"):
261
  voice = USER_VOICES.get(user, "en-US-AriaNeural")
262
  audio_file = speak_with_edge_tts(content, voice=voice)
263
  if audio_file:
264
  play_and_download_audio(audio_file)
265
 
266
  # Read entire conversation
267
+ if st.button("📜 Read Conversation", key="read_conversation_all"):
268
+ # conversation is currently reversed, re-reverse to get chronological
269
+ conversation_chrono = list(reversed(conversation))
 
 
270
  mp3_files = []
271
+ for user, ts, content, fpath in conversation_chrono:
272
  voice = USER_VOICES.get(user, "en-US-AriaNeural")
273
  audio_file = speak_with_edge_tts(content, voice=voice)
274
  if audio_file:
275
  mp3_files.append(audio_file)
 
276
  st.write(f"**{user} ({ts}):**")
277
  play_and_download_audio(audio_file)
278
 
279
  if mp3_files:
 
280
  combined_file = f"full_conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
281
  concatenate_mp3(mp3_files, combined_file)
282
  st.write("**Full Conversation Audio:**")
 
287
  st.subheader("ArXiv Search")
288
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
289
  autorun = st.checkbox("⚡ Auto-Run", value=True)
290
+ run_arxiv = st.button("πŸ” ArXiv Search", key="run_arxiv_button")
291
 
292
  input_changed = (edited_input != st.session_state.get('old_val'))
 
 
 
 
293
  should_run_arxiv = False
294
  if autorun and input_changed and edited_input.strip():
295
  should_run_arxiv = True
296
  if run_arxiv and edited_input.strip():
297
  should_run_arxiv = True
298
 
299
+ if should_run_arxiv and st.session_state['last_arxiv_input'] != edited_input:
300
  st.session_state['old_val'] = edited_input
301
+ st.session_state['last_arxiv_input'] = edited_input
302
+ save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
303
+ with st.spinner("Searching ArXiv..."):
304
+ results = arxiv_search(edited_input)
305
+ if results:
306
+ summary = summarize_arxiv_results(results)
307
+ save_response_as_md(st.session_state['user_name'], summary, prefix="response")
308
+ st.write(summary)
309
+ # Play summary aloud
310
+ voice = USER_VOICES.get(st.session_state['user_name'], "en-US-AriaNeural")
311
+ audio_file = speak_with_edge_tts(summary, voice=voice)
312
+ if audio_file:
313
+ play_and_download_audio(audio_file)
314
+ else:
315
+ st.warning("No results found on ArXiv.")
 
 
316
 
317
  # ------------------ Dataset Search -------------------------
318
  with tab3:
319
  st.subheader("Dataset Search")
320
  ds_searcher = SimpleDatasetSearcher()
321
  query = st.text_input("Enter dataset search query:")
322
+ run_ds_search = st.button("Search Dataset", key="ds_search_button")
323
+ num_results = st.slider("Max results:", 1, 100, 20, key="ds_max_results")
324
 
325
  if run_ds_search and query.strip():
326
  with st.spinner("Searching dataset..."):
 
327
  df = ds_searcher.load_page(0)
328
  results = simple_dataset_search(query, df)
329
  if not results.empty:
 
342
  # ------------------ Settings Tab -------------------------
343
  with tab4:
344
  st.subheader("Settings")
345
+ if st.button("πŸ—‘οΈ Clear Search History", key="clear_history"):
 
346
  # Delete all files
347
  for fpath in list_saved_inputs():
348
  os.remove(fpath)