wakeupmh commited on
Commit
cc0b0d6
·
1 Parent(s): 92c1c48

fix: search

Browse files
Files changed (2) hide show
  1. app.py +32 -23
  2. faiss_index/index.py +24 -4
app.py CHANGED
@@ -30,12 +30,13 @@ def load_models():
30
 
31
  @st.cache_data(ttl=3600) # Cache for 1 hour
32
  def load_dataset(query):
33
- # Create initial dataset if it doesn't exist
34
- if not os.path.exists(DATASET_PATH):
35
- with st.spinner("Building initial dataset from autism research papers..."):
36
- import faiss_index.index as idx
37
- papers = idx.fetch_arxiv_papers(f"{query} AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)", max_results=25) # Reduced max results
38
- idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
 
39
 
40
  # Load and convert to pandas for easier handling
41
  dataset = load_from_disk(DATASET_PATH)
@@ -45,20 +46,24 @@ def load_dataset(query):
45
  })
46
  return df
47
 
48
- def generate_answer(question, context, max_length=150): # Reduced max length
49
  tokenizer, model = load_models()
50
 
51
- # Add context about medical information
52
- prompt = f"Based on scientific research about autism and health: question: {question} context: {context}"
 
 
 
 
53
 
54
  # Optimize input processing
55
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
56
 
57
- with torch.inference_mode(): # More efficient than no_grad
58
  outputs = model.generate(
59
  **inputs,
60
  max_length=max_length,
61
- num_beams=2, # Reduced beam search
62
  temperature=0.7,
63
  top_p=0.9,
64
  repetition_penalty=1.2,
@@ -71,7 +76,11 @@ def generate_answer(question, context, max_length=150): # Reduced max length
71
  if torch.cuda.is_available():
72
  torch.cuda.empty_cache()
73
 
74
- return answer if answer and not answer.isspace() else "I cannot find a specific answer to this question in the provided context."
 
 
 
 
75
 
76
  # Streamlit App
77
  st.title("🧩 AMA Autism")
@@ -90,14 +99,14 @@ if query:
90
  # Generate answer
91
  answer = generate_answer(query, context)
92
 
93
- if answer and not answer.isspace():
94
- st.success("Answer found!")
95
- st.write(answer)
96
-
97
- st.write("### Sources Used:")
98
- for _, row in df.head(3).iterrows():
99
- st.write(f"**Title:** {row['title']}")
100
- st.write(f"**Summary:** {row['text'][:200]}...")
101
- st.write("---")
102
- else:
103
- st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
 
30
 
31
  @st.cache_data(ttl=3600) # Cache for 1 hour
32
  def load_dataset(query):
33
+ # Always fetch fresh results for the specific query
34
+ with st.spinner("Searching autism research papers..."):
35
+ import faiss_index.index as idx
36
+ # Make the query more specific to autism and b12
37
+ search_query = f"autism {query} AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)"
38
+ papers = idx.fetch_arxiv_papers(search_query, max_results=25)
39
+ idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
40
 
41
  # Load and convert to pandas for easier handling
42
  dataset = load_from_disk(DATASET_PATH)
 
46
  })
47
  return df
48
 
49
+ def generate_answer(question, context, max_length=150):
50
  tokenizer, model = load_models()
51
 
52
+ # Improve prompt to focus on autism-related information
53
+ prompt = f"""Based on scientific research about autism, answer the following question.
54
+ If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
55
+
56
+ Question: {question}
57
+ Context: {context}"""
58
 
59
  # Optimize input processing
60
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
61
 
62
+ with torch.inference_mode():
63
  outputs = model.generate(
64
  **inputs,
65
  max_length=max_length,
66
+ num_beams=2,
67
  temperature=0.7,
68
  top_p=0.9,
69
  repetition_penalty=1.2,
 
76
  if torch.cuda.is_available():
77
  torch.cuda.empty_cache()
78
 
79
+ # Additional validation of the answer
80
+ if not answer or answer.isspace() or "cannot find" in answer.lower():
81
+ return "I cannot find specific information about this topic in the autism research papers."
82
+
83
+ return answer
84
 
85
  # Streamlit App
86
  st.title("🧩 AMA Autism")
 
99
  # Generate answer
100
  answer = generate_answer(query, context)
101
 
102
+ if answer and not answer.isspace():
103
+ st.success("Answer found!")
104
+ st.write(answer)
105
+
106
+ st.write("### Sources Used:")
107
+ for _, row in df.head(3).iterrows():
108
+ st.write(f"**Title:** {row['title']}")
109
+ st.write(f"**Summary:** {row['text'][:200]}...")
110
+ st.write("---")
111
+ else:
112
+ st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
faiss_index/index.py CHANGED
@@ -17,14 +17,34 @@ DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
17
  def fetch_arxiv_papers(query, max_results=10):
18
  """Fetch papers from arXiv and format them for RAG"""
19
  client = arxiv.Client()
 
 
 
 
 
 
 
 
 
20
  search = arxiv.Search(
21
- query=f"{query} AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)", # Focus on biology and medical categories
22
  max_results=max_results,
23
- sort_by=arxiv.SortCriterion.Relevance # Changed to relevance-based sorting
24
  )
 
25
  results = list(client.results(search))
26
- papers = [{"id": str(i), "text": result.summary, "title": result.title} for i, result in enumerate(results)]
27
- logging.info(f"Fetched {len(papers)} papers from arXiv")
 
 
 
 
 
 
 
 
 
 
28
  return papers
29
 
30
  def build_faiss_index(papers, dataset_dir=DATASET_DIR):
 
17
def fetch_arxiv_papers(query, max_results=10):
    """Fetch autism-related papers from arXiv and format them for RAG.

    Args:
        query: Free-text search string; the term 'autism' is prepended to
            the term list if the user did not include it.
        max_results: Maximum number of results to request from arXiv.

    Returns:
        A list of dicts with keys "id" (sequential string index), "text"
        (the paper's summary) and "title". Only results whose title or
        summary mentions 'autism' are kept.
    """
    client = arxiv.Client()

    # Construct a more focused search query: every whitespace-separated
    # term must match (AND), and 'autism' is forced into the term list.
    search_terms = query.lower().split()
    if 'autism' not in search_terms:
        search_terms.insert(0, 'autism')

    # Add specific category filters for medical and biological papers.
    search_query = f"({' AND '.join(search_terms)}) AND (cat:q-bio.NC OR cat:q-bio.QM OR cat:q-bio.GN OR cat:q-bio.CB OR cat:q-bio.MN)"

    search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )

    results = list(client.results(search))

    # Keep only results that explicitly mention autism, then assign ids
    # AFTER filtering so the id range is sequential and gap-free.
    # (Previously the pre-filter enumerate index was reused, so filtered-out
    # papers left holes in the id sequence handed to the index builder.)
    relevant = (
        r for r in results
        if 'autism' in r.title.lower() or 'autism' in r.summary.lower()
    )
    papers = [
        {"id": str(i), "text": result.summary, "title": result.title}
        for i, result in enumerate(relevant)
    ]

    logging.info(f"Fetched {len(papers)} relevant papers from arXiv")
    return papers
49
 
50
  def build_faiss_index(papers, dataset_dir=DATASET_DIR):