DeepResearchEvaluator

Sleeping

App Files Files Community

awacke1 committed on Dec 31, 2024

Commit

711281e

verified ·

1 Parent(s): 8e010af

Update backup8.app.py

Browse files

Files changed (1) hide show

backup8.app.py +125 -121

backup8.app.py CHANGED Viewed

@@ -299,146 +299,150 @@ def save_full_transcript(query, text):
     """Save full transcript of Arxiv results as a file."""
     create_file(query, text, "md")
-# ------------------------------
-# NEW: Helper to parse references
-# ------------------------------
 def parse_arxiv_refs(ref_text: str):
     """
-    Parse the multi-line references returned by the RAG pipeline.
-    Typical format lines like:
-       1) [Paper Title 2023] This is the summary ...
-       2) [Another Title (2024)] Another summary text ...
-    We'll attempt to find a year with a small regex or fallback.
-    Return list of dicts: { 'title': str, 'summary': str, 'year': int or None }
     """
-    lines = ref_text.split('\n')
-    results = []
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # Attempt to find [Title ...]
-        title_match = re.search(r"\[([^\]]+)\]", line)
-        if title_match:
-            raw_title = title_match.group(1).strip()
-        else:
-            # If no bracket found, skip or treat entire line as summary
-            raw_title = "No Title"
-        # Attempt to find trailing summary after bracket
-        # Example line: " [Paper Title 2024] Paper summary blah blah"
-        # So remove the bracketed portion from the line
-        remainder = line.replace(title_match.group(0), "").strip() if title_match else line
-        summary = remainder
-        # Attempt to guess year from the raw title
-        # We look for 4-digit patterns in raw_title or summary
-        year_match = re.search(r'(20\d{2})', raw_title)
-        if not year_match:
-            # fallback: try summary
-            year_match = re.search(r'(20\d{2})', summary)
-        if year_match:
-            year = int(year_match.group(1))
-        else:
-            year = None
-        results.append({
-            'title': raw_title,
-            'summary': summary,
-            'year': year
-        })
-    return results
 def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
-                      titles_summary=True, full_audio=False):
-    """Perform Arxiv search and generate audio summaries."""
     start = time.time()
-    # 🎯 1) Query the HF RAG pipeline
     client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
-    refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
-    r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
-    # 🎯 2) Combine for final text output
     result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
     st.markdown(result)
-    # 🎯 3) Generate "all at once" audio if requested
-    if full_audio:
-        complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
-        audio_file_full = speak_with_edge_tts(complete_text)
-        st.write("### 📚 Full Audio")
-        play_and_download_audio(audio_file_full)
-    if vocal_summary:
-        main_text = clean_for_speech(r2)
-        audio_file_main = speak_with_edge_tts(main_text)
-        st.write("### 🎙 Short Audio")
-        play_and_download_audio(audio_file_main)
-    if extended_refs:
-        summaries_text = "Extended references: " + refs.replace('"','')
-        summaries_text = clean_for_speech(summaries_text)
-        audio_file_refs = speak_with_edge_tts(summaries_text)
-        st.write("### 📜 Long Refs")
-        play_and_download_audio(audio_file_refs)
-    # --------------------------------------
-    # NEW: Parse references, show sorted list
-    # --------------------------------------
-    parsed_refs = parse_arxiv_refs(refs)
-    # Sort by year descending (put None at bottom)
-    # If you want to skip older than 2022, you can filter them:
-    # parsed_refs = [r for r in parsed_refs if (r["year"] is not None and r["year"] >= 2022)]
-    parsed_refs.sort(key=lambda x: x["year"] if x["year"] else 0, reverse=True)
-    st.write("## Individual Papers (Most Recent First)")
-    for idx, paper in enumerate(parsed_refs):
-        year_str = paper["year"] if paper["year"] else "Unknown Year"
-        st.markdown(f"**{idx+1}. {paper['title']}**  \n*Year:* {year_str}")
-        st.markdown(f"*Summary:* {paper['summary']}")
-        # Two new TTS buttons: Title only or Title+Summary
-        colA, colB = st.columns(2)
-        with colA:
-            if st.button(f"🔊 Title", key=f"title_{idx}"):
-                text_tts = clean_for_speech(paper['title'])
-                audio_file_title = speak_with_edge_tts(text_tts)
-                play_and_download_audio(audio_file_title)
-        with colB:
-            if st.button(f"🔊 Title+Summary", key=f"summary_{idx}"):
-                text_tts = clean_for_speech(paper['title'] + ". " + paper['summary'])
-                audio_file_title_summary = speak_with_edge_tts(text_tts)
-                play_and_download_audio(audio_file_title_summary)
-        st.write("---")
-    # Keep your original block for "Titles Only" if you want:
-    if titles_summary:
-        # This is your existing code block
-        titles = []
-        for line in refs.split('\n'):
-            m = re.search(r"\[([^\]]+)\]", line)
-            if m:
-                titles.append(m.group(1))
-        if titles:
-            titles_text = "Titles: " + ", ".join(titles)
-            titles_text = clean_for_speech(titles_text)
-            audio_file_titles = speak_with_edge_tts(titles_text)
-            st.write("### 🔖 Titles (All-In-One)")
-            play_and_download_audio(audio_file_titles)
     elapsed = time.time()-start
     st.write(f"**Total Elapsed:** {elapsed:.2f} s")
-    # Always create a file with the result
     create_file(q, result, "md")
     return result
 def process_with_gpt(text):
     """Process text with GPT-4"""
     if not text:
@@ -598,9 +602,9 @@ def main():
     # Show input in a text box for editing if detected
     if val:
-        val_stripped = val.replace('\n', ' ')
         edited_input = st.text_area("✏️ Edit Input:", value=val_stripped, height=100)
-        edited_input = edited_input.replace('\n', ' ')
         run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
         col1, col2 = st.columns(2)

     """Save full transcript of Arxiv results as a file."""
     create_file(query, text, "md")
 def parse_arxiv_refs(ref_text: str, max_papers: int = 20):
     """
     Parse the markdown reference listing into a list of paper dicts.

     A line containing exactly two pipe characters is treated as a paper
     header of the form ``date | title | link``. The first non-blank line
     after a header becomes the authors; every following non-blank line is
     appended to the summary (joined with single spaces) until the next
     header line.

     Args:
         ref_text: Raw multi-line reference text. ``None`` or an empty
             string yields an empty list.
         max_papers: Upper bound on the number of papers returned.

     Returns:
         A list of dicts with the keys 'date', 'title', 'url', 'authors',
         'summary' and 'content_start' (index of the line following the
         header). Audio entries are NOT added here.
     """
     if not ref_text or max_papers <= 0:
         return []

     results = []
     current_paper = None
     for i, line in enumerate(ref_text.split('\n')):
         stripped = line.strip()

         if line.count('|') == 2:
             # A new header closes the paper collected so far.
             if current_paper is not None:
                 results.append(current_paper)
                 current_paper = None  # avoid re-appending after the loop
                 if len(results) >= max_papers:
                     break

             # Exactly two pipes always split into three parts, so the
             # indexing below cannot fail.
             header_parts = stripped.strip('* ').split('|')

             # Stop the URL at whitespace or markdown closers so that
             # "(https://arxiv.org/abs/1234)**" yields a clean link.
             url_match = re.search(r'https://arxiv\.org/[^\s)\]*]+', line)
             current_paper = {
                 'date': header_parts[0].strip(),
                 'title': header_parts[1].strip(),
                 # Fallback identifier uses the paper's position in results.
                 'url': url_match.group(0) if url_match else f"paper_{len(results)}",
                 'authors': '',
                 'summary': '',
                 'content_start': i + 1,  # Track where content begins
             }
         elif current_paper is not None:
             if not stripped:
                 # Blank separator lines carry no content.
                 continue
             if not current_paper['authors']:
                 # First non-blank line after the header is the author list.
                 current_paper['authors'] = stripped.strip('* ')
             elif current_paper['summary']:
                 current_paper['summary'] += ' ' + stripped
             else:
                 current_paper['summary'] = stripped

     # Don't forget the paper still being collected when the text ended.
     if current_paper is not None:
         results.append(current_paper)
     return results[:max_papers]
def create_paper_audio_files(papers):
    """
    Generate speech audio for every paper and store the results on the dict.

    For each paper two entries are written in place:
      * 'title_audio' - audio of the cleaned title
      * 'full_audio'  - audio of "<title> by <authors>. <summary>"

    Each component is generated independently, so a failure while producing
    one of them no longer discards the other. A failed component is set to
    None and a Streamlit warning is shown.

    Args:
        papers: Iterable of paper dicts (as produced by parse_arxiv_refs).
    """
    for paper in papers:
        # .get() keeps a malformed entry from raising inside the handler.
        title = paper.get('title', '')
        components = (
            ('title_audio', title),
            ('full_audio',
             f"{title} by {paper.get('authors', '')}. {paper.get('summary', '')}"),
        )
        for audio_key, raw_text in components:
            try:
                paper[audio_key] = speak_with_edge_tts(clean_for_speech(raw_text))
            except Exception as e:
                # Best-effort: audio is optional, so report and carry on
                # rather than abort the whole lookup.
                st.warning(f"Error generating audio for paper {title}: {str(e)}")
                paper[audio_key] = None
def display_papers(papers):
    """
    Render every paper inside its own (initially expanded) expander.

    Each expander shows the "date | title" header line, the authors in
    italics and the summary, followed by two columns: the left one holds the
    title audio player, the right one the full-paper audio player. A player
    is only rendered when the corresponding entry on the paper is truthy.
    """
    st.write("## Research Papers")

    # (dict key, caption) for the left and right column respectively.
    audio_slots = (
        ('title_audio', "🎙️ Title Audio"),
        ('full_audio', "📚 Full Paper Audio"),
    )

    for paper in papers:
        with st.expander(f"📄 {paper['title']}", expanded=True):
            st.markdown(f"**{paper['date']} | {paper['title']} | ⬇️**")
            st.markdown(f"*{paper['authors']}*")
            st.markdown(paper['summary'])

            # Audio controls in columns
            for column, (audio_key, caption) in zip(st.columns(2), audio_slots):
                with column:
                    audio_path = paper.get(audio_key)
                    if audio_path:
                        st.write(caption)
                        st.audio(audio_path)
 def perform_ai_lookup(q, vocal_summary=True, extended_refs=False,
                      titles_summary=True, full_audio=False):
     """
     Run an Arxiv lookup for *q*, show the answer and per-paper audio.

     Steps: query the remote RAG endpoint for references, ask the LLM
     endpoint for an answer, render both as markdown, parse the references
     into papers, generate and display audio for each paper, report the
     elapsed time and save the combined markdown via create_file.

     NOTE(review): vocal_summary, extended_refs, titles_summary and
     full_audio are accepted but not read anywhere in this body; they remain
     in the signature so existing callers keep working.

     Returns:
         The combined markdown string (query heading, answer, references).
     """
     started_at = time.time()
     llm_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

     # Query the HF RAG pipeline: references first, then the LLM answer.
     rag_client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
     rag_output = rag_client.predict(q, 20, "Semantic Search", llm_name,
                                     api_name="/update_with_rag_md")
     refs = rag_output[0]
     answer = rag_client.predict(q, llm_name, True, api_name="/ask_llm")

     # Combine for final text output
     result = f"### 🔎 {q}\n\n{answer}\n\n{refs}"
     st.markdown(result)

     # Parse and process papers
     papers = parse_arxiv_refs(refs)
     if not papers:
         st.warning("No papers found in the response.")
     else:
         create_paper_audio_files(papers)
         display_papers(papers)

     elapsed = time.time() - started_at
     st.write(f"**Total Elapsed:** {elapsed:.2f} s")

     # Save full transcript
     create_file(q, result, "md")
     return result
 def process_with_gpt(text):
     """Process text with GPT-4"""
     if not text:
     # Show input in a text box for editing if detected
     if val:
+        val_stripped = val.replace('\\n', ' ')
         edited_input = st.text_area("✏️ Edit Input:", value=val_stripped, height=100)
+        #edited_input = edited_input.replace('\n', ' ')
         run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
         col1, col2 = st.columns(2)