Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -165,7 +165,6 @@ def generate_filename(prompt, response, file_type="md"):
|
|
| 165 |
snippet_cleaned = clean_text_for_filename(snippet)
|
| 166 |
|
| 167 |
# Combine info terms and snippet
|
| 168 |
-
# Prioritize info terms in front
|
| 169 |
name_parts = info_terms + [snippet_cleaned]
|
| 170 |
full_name = '_'.join(name_parts)
|
| 171 |
|
|
@@ -271,7 +270,8 @@ def process_video(video_path, seconds_per_frame=1):
|
|
| 271 |
for i in range(0, total, skip):
|
| 272 |
vid.set(cv2.CAP_PROP_POS_FRAMES, i)
|
| 273 |
ret, frame = vid.read()
|
| 274 |
-
if not ret:
|
|
|
|
| 275 |
_, buf = cv2.imencode(".jpg", frame)
|
| 276 |
frames_b64.append(base64.b64encode(buf).decode("utf-8"))
|
| 277 |
vid.release()
|
|
@@ -298,18 +298,72 @@ def save_full_transcript(query, text):
|
|
| 298 |
"""Save full transcript of Arxiv results as a file."""
|
| 299 |
create_file(query, text, "md")
|
| 300 |
|
| 301 |
-
|
| 302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
start = time.time()
|
|
|
|
|
|
|
| 304 |
client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
|
| 305 |
refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
|
| 306 |
r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
|
| 307 |
|
|
|
|
| 308 |
result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
|
| 309 |
-
|
| 310 |
st.markdown(result)
|
| 311 |
|
| 312 |
-
# Generate
|
| 313 |
if full_audio:
|
| 314 |
complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
|
| 315 |
audio_file_full = speak_with_edge_tts(complete_text)
|
|
@@ -329,7 +383,41 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
|
|
| 329 |
st.write("### 📜 Long Refs")
|
| 330 |
play_and_download_audio(audio_file_refs)
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
if titles_summary:
|
|
|
|
| 333 |
titles = []
|
| 334 |
for line in refs.split('\n'):
|
| 335 |
m = re.search(r"\[([^\]]+)\]", line)
|
|
@@ -339,7 +427,7 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
|
|
| 339 |
titles_text = "Titles: " + ", ".join(titles)
|
| 340 |
titles_text = clean_for_speech(titles_text)
|
| 341 |
audio_file_titles = speak_with_edge_tts(titles_text)
|
| 342 |
-
st.write("### 🔖 Titles")
|
| 343 |
play_and_download_audio(audio_file_titles)
|
| 344 |
|
| 345 |
elapsed = time.time()-start
|
|
@@ -352,7 +440,8 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
|
|
| 352 |
|
| 353 |
def process_with_gpt(text):
|
| 354 |
"""Process text with GPT-4"""
|
| 355 |
-
if not text:
|
|
|
|
| 356 |
st.session_state.messages.append({"role":"user","content":text})
|
| 357 |
with st.chat_message("user"):
|
| 358 |
st.markdown(text)
|
|
@@ -370,7 +459,8 @@ def process_with_gpt(text):
|
|
| 370 |
|
| 371 |
def process_with_claude(text):
|
| 372 |
"""Process text with Claude"""
|
| 373 |
-
if not text:
|
|
|
|
| 374 |
with st.chat_message("user"):
|
| 375 |
st.markdown(text)
|
| 376 |
with st.chat_message("assistant"):
|
|
@@ -568,7 +658,6 @@ def main():
|
|
| 568 |
if full_transcript:
|
| 569 |
save_full_transcript(q_new, result)
|
| 570 |
|
| 571 |
-
|
| 572 |
elif tab_main == "🎤 Voice":
|
| 573 |
st.subheader("🎤 Voice Input")
|
| 574 |
user_text = st.text_area("💬 Message:", height=100)
|
|
|
|
| 165 |
snippet_cleaned = clean_text_for_filename(snippet)
|
| 166 |
|
| 167 |
# Combine info terms and snippet
|
|
|
|
| 168 |
name_parts = info_terms + [snippet_cleaned]
|
| 169 |
full_name = '_'.join(name_parts)
|
| 170 |
|
|
|
|
| 270 |
for i in range(0, total, skip):
|
| 271 |
vid.set(cv2.CAP_PROP_POS_FRAMES, i)
|
| 272 |
ret, frame = vid.read()
|
| 273 |
+
if not ret:
|
| 274 |
+
break
|
| 275 |
_, buf = cv2.imencode(".jpg", frame)
|
| 276 |
frames_b64.append(base64.b64encode(buf).decode("utf-8"))
|
| 277 |
vid.release()
|
|
|
|
| 298 |
"""Save full transcript of Arxiv results as a file."""
|
| 299 |
create_file(query, text, "md")
|
| 300 |
|
| 301 |
+
# ------------------------------
# NEW: Helper to parse references
# ------------------------------
def parse_arxiv_refs(ref_text: str):
    """
    Parse the multi-line references returned by the RAG pipeline.

    Typical input lines look like:
        1) [Paper Title 2023] This is the summary ...
        2) [Another Title (2024)] Another summary text ...

    For each non-empty line we extract the bracketed title (falling back to
    "No Title" when no brackets are present), treat the rest of the line as
    the summary, and guess a publication year by searching for a 4-digit
    "20xx" pattern in the title first, then in the summary.

    Args:
        ref_text: Raw multi-line reference text from the RAG pipeline.

    Returns:
        list[dict]: one dict per reference line, with keys
            'title' (str), 'summary' (str), 'year' (int or None).
    """
    results = []
    for line in ref_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        # Attempt to find [Title ...]
        title_match = re.search(r"\[([^\]]+)\]", line)
        if title_match:
            raw_title = title_match.group(1).strip()
            # Excise ONLY the matched bracketed span. A plain
            # line.replace(title_match.group(0), "") would also delete any
            # later occurrence of the same bracketed text, corrupting the
            # summary.
            remainder = (line[:title_match.start()] + line[title_match.end():]).strip()
        else:
            # No bracket found: treat the entire line as the summary.
            raw_title = "No Title"
            remainder = line
        summary = remainder

        # Guess a year: look in the raw title first, then fall back to the
        # summary ("or" short-circuits, matching the original fallback).
        year_match = re.search(r'(20\d{2})', raw_title) or re.search(r'(20\d{2})', summary)
        year = int(year_match.group(1)) if year_match else None

        results.append({
            'title': raw_title,
            'summary': summary,
            'year': year,
        })
    return results
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
|
| 353 |
+
titles_summary=True, full_audio=False):
|
| 354 |
+
"""Perform Arxiv search and generate audio summaries."""
|
| 355 |
start = time.time()
|
| 356 |
+
|
| 357 |
+
# 🎯 1) Query the HF RAG pipeline
|
| 358 |
client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
|
| 359 |
refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
|
| 360 |
r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
|
| 361 |
|
| 362 |
+
# 🎯 2) Combine for final text output
|
| 363 |
result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
|
|
|
|
| 364 |
st.markdown(result)
|
| 365 |
|
| 366 |
+
# 🎯 3) Generate "all at once" audio if requested
|
| 367 |
if full_audio:
|
| 368 |
complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
|
| 369 |
audio_file_full = speak_with_edge_tts(complete_text)
|
|
|
|
| 383 |
st.write("### 📜 Long Refs")
|
| 384 |
play_and_download_audio(audio_file_refs)
|
| 385 |
|
| 386 |
+
# --------------------------------------
|
| 387 |
+
# NEW: Parse references, show sorted list
|
| 388 |
+
# --------------------------------------
|
| 389 |
+
parsed_refs = parse_arxiv_refs(refs)
|
| 390 |
+
|
| 391 |
+
# Sort by year descending (put None at bottom)
|
| 392 |
+
# If you want to skip older than 2022, you can filter them:
|
| 393 |
+
# parsed_refs = [r for r in parsed_refs if (r["year"] is not None and r["year"] >= 2022)]
|
| 394 |
+
parsed_refs.sort(key=lambda x: x["year"] if x["year"] else 0, reverse=True)
|
| 395 |
+
|
| 396 |
+
st.write("## Individual Papers (Most Recent First)")
|
| 397 |
+
for idx, paper in enumerate(parsed_refs):
|
| 398 |
+
year_str = paper["year"] if paper["year"] else "Unknown Year"
|
| 399 |
+
st.markdown(f"**{idx+1}. {paper['title']}** \n*Year:* {year_str}")
|
| 400 |
+
st.markdown(f"*Summary:* {paper['summary']}")
|
| 401 |
+
|
| 402 |
+
# Two new TTS buttons: Title only or Title+Summary
|
| 403 |
+
colA, colB = st.columns(2)
|
| 404 |
+
with colA:
|
| 405 |
+
if st.button(f"🔊 Title", key=f"title_{idx}"):
|
| 406 |
+
text_tts = clean_for_speech(paper['title'])
|
| 407 |
+
audio_file_title = speak_with_edge_tts(text_tts)
|
| 408 |
+
play_and_download_audio(audio_file_title)
|
| 409 |
+
|
| 410 |
+
with colB:
|
| 411 |
+
if st.button(f"🔊 Title+Summary", key=f"summary_{idx}"):
|
| 412 |
+
text_tts = clean_for_speech(paper['title'] + ". " + paper['summary'])
|
| 413 |
+
audio_file_title_summary = speak_with_edge_tts(text_tts)
|
| 414 |
+
play_and_download_audio(audio_file_title_summary)
|
| 415 |
+
|
| 416 |
+
st.write("---")
|
| 417 |
+
|
| 418 |
+
# Keep your original block for "Titles Only" if you want:
|
| 419 |
if titles_summary:
|
| 420 |
+
# This is your existing code block
|
| 421 |
titles = []
|
| 422 |
for line in refs.split('\n'):
|
| 423 |
m = re.search(r"\[([^\]]+)\]", line)
|
|
|
|
| 427 |
titles_text = "Titles: " + ", ".join(titles)
|
| 428 |
titles_text = clean_for_speech(titles_text)
|
| 429 |
audio_file_titles = speak_with_edge_tts(titles_text)
|
| 430 |
+
st.write("### 🔖 Titles (All-In-One)")
|
| 431 |
play_and_download_audio(audio_file_titles)
|
| 432 |
|
| 433 |
elapsed = time.time()-start
|
|
|
|
| 440 |
|
| 441 |
def process_with_gpt(text):
|
| 442 |
"""Process text with GPT-4"""
|
| 443 |
+
if not text:
|
| 444 |
+
return
|
| 445 |
st.session_state.messages.append({"role":"user","content":text})
|
| 446 |
with st.chat_message("user"):
|
| 447 |
st.markdown(text)
|
|
|
|
| 459 |
|
| 460 |
def process_with_claude(text):
|
| 461 |
"""Process text with Claude"""
|
| 462 |
+
if not text:
|
| 463 |
+
return
|
| 464 |
with st.chat_message("user"):
|
| 465 |
st.markdown(text)
|
| 466 |
with st.chat_message("assistant"):
|
|
|
|
| 658 |
if full_transcript:
|
| 659 |
save_full_transcript(q_new, result)
|
| 660 |
|
|
|
|
| 661 |
elif tab_main == "🎤 Voice":
|
| 662 |
st.subheader("🎤 Voice Input")
|
| 663 |
user_text = st.text_area("💬 Message:", height=100)
|