Spaces:

wakeupmh
/

ama-autism

Sleeping

App Files Files Community

wakeupmh committed on Feb 15

Commit

62b3157

1 Parent(s): cc41495

fix: search in results

Browse files

Files changed (2) hide show

app.py +36 -30
faiss_index/index.py +12 -6

app.py CHANGED Viewed

@@ -40,29 +40,35 @@ def load_dataset(query):
             search_query = query
         papers = idx.fetch_arxiv_papers(search_query, max_results=25)
-        if not papers:
-            st.warning("No relevant papers found. Please try rephrasing your question.")
-            return pd.DataFrame(columns=['title', 'text'])
-        idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
     # Load and convert to pandas for easier handling
     dataset = load_from_disk(DATASET_PATH)
     df = pd.DataFrame({
         'title': dataset['title'],
-        'text': dataset['text']
     })
     return df
 def generate_answer(question, context, max_length=150):
     tokenizer, model = load_models()
-    # Improve prompt to focus on autism-related information
-    prompt = f"""Based on scientific research about autism, answer the following question.
     If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
     Question: {question}
-    Context: {context}"""
     # Optimize input processing
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
@@ -99,26 +105,26 @@ if query:
     with st.status("Searching for answers..."):
         # Load dataset
         df = load_dataset(query)
-        if df.empty:
-            st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
-        else:
-            # Get relevant context
-            context = "\n".join([
-                f"{text[:1000]}" for text in df['text'].head(3)
-            ])
-            # Generate answer
-            answer = generate_answer(query, context)
-            if answer and not answer.isspace():
-                st.success("Answer found!")
-                st.write(answer)
-                st.write("### Sources used:")
-                for _, row in df.head(3).iterrows():
-                    st.write(f"**Title:** {row['title']}")
-                    st.write(f"**Summary:** {row['text'][:200]}...")
-                    st.write("---")
-            else:
-                st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")

             search_query = query
         papers = idx.fetch_arxiv_papers(search_query, max_results=25)
+    if not papers:
+        st.warning("No relevant papers found. Please try rephrasing your question.")
+        return pd.DataFrame(columns=['title', 'text', 'url', 'published'])
+    idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
     # Load and convert to pandas for easier handling
     dataset = load_from_disk(DATASET_PATH)
     df = pd.DataFrame({
         'title': dataset['title'],
+        'text': dataset['text'],
+        'url': [p['url'] for p in papers],
+        'published': [p['published'] for p in papers]
     })
     return df
 def generate_answer(question, context, max_length=150):
     tokenizer, model = load_models()
+    # Improve prompt to generate concise, summarized answers
+    prompt = f"""Based on scientific research about autism, provide a brief, clear summary answering the following question.
+    Focus only on the most important findings and be concise.
     If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
     Question: {question}
+    Context: {context}
+    Provide a concise summary:"""
     # Optimize input processing
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
     with st.status("Searching for answers..."):
         # Load dataset
         df = load_dataset(query)
+        # Get relevant context
+        context = "\n".join([
+            f"{text[:1000]}" for text in df['text'].head(3)
+        ])
+        # Generate answer
+        answer = generate_answer(query, context)
+        status.update(
+            label="Search complete!", state="complete", expanded=False
+        )
+    if answer and not answer.isspace():
+        st.success("Answer found!")
+        st.write(answer)
+        st.write("### Sources used:")
+        for _, row in df.head(3).iterrows():
+            st.markdown(f"**[{row['title']}]({row['url']})** ({row['published']})")
+            st.write(f"**Summary:** {row['text'][:200]}...")
+            st.write("---")
+    else:
+        st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
+    if df.empty:
+        st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")

faiss_index/index.py CHANGED Viewed

@@ -22,8 +22,12 @@ def fetch_arxiv_papers(query, max_results=10):
     query = query.replace('and', '').strip()  # Remove 'and' as it's treated as AND operator
     terms = [term.strip() for term in query.split() if term.strip()]
-    # Create a more flexible search query
-    search_query = ' OR '.join([f'abs:"{term}" OR ti:"{term}"' for term in terms])
     search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
     logging.info(f"Searching arXiv with query: {search_query}")
@@ -39,18 +43,20 @@ def fetch_arxiv_papers(query, max_results=10):
         papers = []
         for i, result in enumerate(results):
-            # Include paper if it contains any of the search terms
             text = (result.title + " " + result.summary).lower()
-            if any(term.lower() in text for term in terms):
                 papers.append({
                     "id": str(i),
                     "text": result.summary,
-                    "title": result.title
                 })
                 if len(papers) >= max_results:
                     break
-        logging.info(f"Found {len(papers)} relevant papers from arXiv")
         return papers
     except Exception as e:
         logging.error(f"Error fetching papers from arXiv: {str(e)}")

     query = query.replace('and', '').strip()  # Remove 'and' as it's treated as AND operator
     terms = [term.strip() for term in query.split() if term.strip()]
+    # Always include autism in the search
+    if 'autism' not in [t.lower() for t in terms]:
+        terms.insert(0, 'autism')
+    # Create search query with required autism term
+    search_query = f'(abs:"autism" OR ti:"autism") AND ({" OR ".join([f'abs:"{term}" OR ti:"{term}"' for term in terms if term.lower() != "autism"])})'
     search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
     logging.info(f"Searching arXiv with query: {search_query}")
         papers = []
         for i, result in enumerate(results):
+            # Only include papers that mention autism
             text = (result.title + " " + result.summary).lower()
+            if 'autism' in text:
                 papers.append({
                     "id": str(i),
                     "text": result.summary,
+                    "title": result.title,
+                    "url": result.entry_id,  # Add the paper URL
+                    "published": result.published.strftime("%Y-%m-%d")  # Add publication date
                 })
                 if len(papers) >= max_results:
                     break
+        logging.info(f"Found {len(papers)} relevant papers about autism from arXiv")
         return papers
     except Exception as e:
         logging.error(f"Error fetching papers from arXiv: {str(e)}")