wakeupmh committed on
Commit
97889da
·
1 Parent(s): 6782472

fix: response

Browse files
Files changed (1) hide show
  1. app.py +62 -36
app.py CHANGED
@@ -16,7 +16,7 @@ logging.basicConfig(level=logging.INFO)
16
  DATA_DIR = "/data" if os.path.exists("/data") else "."
17
  DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
18
  DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
19
- MODEL_PATH = "google/flan-t5-base" # Lighter model
20
 
21
  @st.cache_resource
22
  def load_local_model():
@@ -24,7 +24,7 @@ def load_local_model():
24
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
25
  model = AutoModelForSeq2SeqLM.from_pretrained(
26
  MODEL_PATH,
27
- torch_dtype=torch.float32, # Using float32 for CPU compatibility
28
  device_map="auto"
29
  )
30
  return model, tokenizer
@@ -33,8 +33,11 @@ def fetch_arxiv_papers(query, max_results=5):
33
  """Fetch papers from arXiv"""
34
  client = arxiv.Client()
35
 
36
- # Clean and prepare the search query
37
- search_query = f"ti:{query} OR abs:{query} AND cat:q-bio"
 
 
 
38
 
39
  # Search arXiv
40
  search = arxiv.Search(
@@ -49,7 +52,7 @@ def fetch_arxiv_papers(query, max_results=5):
49
  'title': result.title,
50
  'abstract': result.summary,
51
  'url': result.pdf_url,
52
- 'published': result.published
53
  })
54
 
55
  return papers
@@ -58,11 +61,17 @@ def fetch_pubmed_papers(query, max_results=5):
58
  """Fetch papers from PubMed"""
59
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
60
 
 
 
 
 
 
 
61
  # Search for papers
62
  search_url = f"{base_url}/esearch.fcgi"
63
  search_params = {
64
  'db': 'pubmed',
65
- 'term': query,
66
  'retmax': max_results,
67
  'sort': 'relevance',
68
  'retmode': 'xml'
@@ -92,13 +101,16 @@ def fetch_pubmed_papers(query, max_results=5):
92
  for article in articles.findall('.//PubmedArticle'):
93
  title = article.find('.//ArticleTitle')
94
  abstract = article.find('.//Abstract/AbstractText')
 
 
95
 
96
- papers.append({
97
- 'title': title.text if title is not None else 'No title available',
98
- 'abstract': abstract.text if abstract is not None else 'No abstract available',
99
- 'url': f"https://pubmed.ncbi.nlm.nih.gov/{article.find('.//PMID').text}/",
100
- 'published': article.find('.//PubDate/Year').text if article.find('.//PubDate/Year') is not None else 'Unknown'
101
- })
 
102
 
103
  except Exception as e:
104
  st.error(f"Error fetching PubMed papers: {str(e)}")
@@ -113,12 +125,13 @@ def search_research_papers(query):
113
  # Combine and format papers
114
  all_papers = []
115
  for paper in arxiv_papers + pubmed_papers:
116
- all_papers.append({
117
- 'title': paper['title'],
118
- 'text': f"Title: {paper['title']}\nAbstract: {paper['abstract']}",
119
- 'url': paper['url'],
120
- 'published': paper['published']
121
- })
 
122
 
123
  return pd.DataFrame(all_papers)
124
 
@@ -127,38 +140,50 @@ def generate_answer(question, context, max_length=512):
127
  model, tokenizer = load_local_model()
128
 
129
  # Format the context as a structured query
130
- prompt = f"""Based on the following research papers about autism, provide a detailed answer:
131
-
132
- Question: {question}
133
 
134
  Research Context:
135
  {context}
136
 
137
- Please analyze:
138
- 1. Main findings
139
- 2. Research methods
 
 
140
  3. Clinical implications
141
- 4. Limitations
142
 
143
- Answer:"""
144
 
145
  # Generate response
146
- inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True)
147
 
148
  with torch.inference_mode():
149
  outputs = model.generate(
150
  **inputs,
151
  max_length=max_length,
152
- num_beams=4,
 
 
153
  temperature=0.7,
154
- top_p=0.9,
155
- repetition_penalty=1.2,
156
  early_stopping=True
157
  )
158
 
159
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
160
 
161
- # Format the response
 
 
 
 
 
 
 
 
 
 
162
  formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")
163
 
164
  return formatted_response
@@ -181,10 +206,6 @@ if query:
181
  st.write("Searching for data in PubMed and arXiv...")
182
  st.write(f"Found {len(df)} relevant papers!")
183
 
184
- # Display paper sources
185
- for _, paper in df.iterrows():
186
- st.markdown(f"- [{paper['title']}]({paper['url']}) ({paper['published']})")
187
-
188
  # Get relevant context
189
  context = "\n".join([
190
  f"{text[:1000]}" for text in df['text'].head(3)
@@ -193,4 +214,9 @@ if query:
193
  # Generate answer
194
  st.write("Generating answer...")
195
  answer = generate_answer(query, context)
196
- st.markdown(answer)
 
 
 
 
 
 
16
  DATA_DIR = "/data" if os.path.exists("/data") else "."
17
  DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
18
  DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
19
+ MODEL_PATH = "facebook/bart-large-cnn" # Changed to BART model which is better for summarization
20
 
21
  @st.cache_resource
22
  def load_local_model():
 
24
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
25
  model = AutoModelForSeq2SeqLM.from_pretrained(
26
  MODEL_PATH,
27
+ torch_dtype=torch.float32,
28
  device_map="auto"
29
  )
30
  return model, tokenizer
 
33
  """Fetch papers from arXiv"""
34
  client = arxiv.Client()
35
 
36
+ # Ensure query includes autism-related terms
37
+ if 'autism' not in query.lower():
38
+ search_query = f"(ti:{query} OR abs:{query}) AND (ti:autism OR abs:autism) AND cat:q-bio"
39
+ else:
40
+ search_query = f"(ti:{query} OR abs:{query}) AND cat:q-bio"
41
 
42
  # Search arXiv
43
  search = arxiv.Search(
 
52
  'title': result.title,
53
  'abstract': result.summary,
54
  'url': result.pdf_url,
55
+ 'published': result.published.strftime("%Y-%m-%d")
56
  })
57
 
58
  return papers
 
61
  """Fetch papers from PubMed"""
62
  base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
63
 
64
+ # Ensure query includes autism-related terms
65
+ if 'autism' not in query.lower():
66
+ search_term = f"({query}) AND (autism[Title/Abstract] OR ASD[Title/Abstract])"
67
+ else:
68
+ search_term = query
69
+
70
  # Search for papers
71
  search_url = f"{base_url}/esearch.fcgi"
72
  search_params = {
73
  'db': 'pubmed',
74
+ 'term': search_term,
75
  'retmax': max_results,
76
  'sort': 'relevance',
77
  'retmode': 'xml'
 
101
  for article in articles.findall('.//PubmedArticle'):
102
  title = article.find('.//ArticleTitle')
103
  abstract = article.find('.//Abstract/AbstractText')
104
+ year = article.find('.//PubDate/Year')
105
+ pmid = article.find('.//PMID')
106
 
107
+ if title is not None and abstract is not None:
108
+ papers.append({
109
+ 'title': title.text,
110
+ 'abstract': abstract.text,
111
+ 'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid.text}/",
112
+ 'published': year.text if year is not None else 'Unknown'
113
+ })
114
 
115
  except Exception as e:
116
  st.error(f"Error fetching PubMed papers: {str(e)}")
 
125
  # Combine and format papers
126
  all_papers = []
127
  for paper in arxiv_papers + pubmed_papers:
128
+ if paper['abstract'] and len(paper['abstract'].strip()) > 0:
129
+ all_papers.append({
130
+ 'title': paper['title'],
131
+ 'text': f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}",
132
+ 'url': paper['url'],
133
+ 'published': paper['published']
134
+ })
135
 
136
  return pd.DataFrame(all_papers)
137
 
 
140
  model, tokenizer = load_local_model()
141
 
142
  # Format the context as a structured query
143
+ prompt = f"""Summarize the following research about autism and answer the question.
 
 
144
 
145
  Research Context:
146
  {context}
147
 
148
+ Question: {question}
149
+
150
+ Provide a detailed answer that includes:
151
+ 1. Main findings from the research
152
+ 2. Research methods used
153
  3. Clinical implications
154
+ 4. Limitations of the studies
155
 
156
+ If the research doesn't address the question directly, explain what information is missing."""
157
 
158
  # Generate response
159
+ inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
160
 
161
  with torch.inference_mode():
162
  outputs = model.generate(
163
  **inputs,
164
  max_length=max_length,
165
+ min_length=200, # Ensure longer responses
166
+ num_beams=5,
167
+ length_penalty=2.0, # Encourage even longer responses
168
  temperature=0.7,
169
+ no_repeat_ngram_size=3,
170
+ repetition_penalty=1.3,
171
  early_stopping=True
172
  )
173
 
174
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
175
 
176
+ # If response is too short or empty, provide a fallback message
177
+ if len(response.strip()) < 100:
178
+ return """I apologize, but I couldn't generate a specific answer from the research papers provided.
179
+ This might be because:
180
+ 1. The research papers don't directly address your question
181
+ 2. The context needs more specific information
182
+ 3. The question might need to be more specific
183
+
184
+ Please try rephrasing your question or ask about a more specific aspect of autism."""
185
+
186
+ # Format the response for better readability
187
  formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")
188
 
189
  return formatted_response
 
206
  st.write("Searching for data in PubMed and arXiv...")
207
  st.write(f"Found {len(df)} relevant papers!")
208
 
 
 
 
 
209
  # Get relevant context
210
  context = "\n".join([
211
  f"{text[:1000]}" for text in df['text'].head(3)
 
214
  # Generate answer
215
  st.write("Generating answer...")
216
  answer = generate_answer(query, context)
217
+ # Display paper sources
218
+ with st.expander("View source papers"):
219
+ for _, paper in df.iterrows():
220
+ st.markdown(f"- [{paper['title']}]({paper['url']}) ({paper['published']})")
221
+ st.success("Answer found!")
222
+ st.markdown(answer)