wakeupmh committed
Commit 4a9703a · 1 Parent(s): 62b3157

fix: faiss

Files changed (3)
  1. app.py +2 -2
  2. faiss_index/index.py +103 -6
  3. requirements.txt +2 -1
app.py CHANGED
@@ -31,7 +31,7 @@ def load_models():
 @st.cache_data(ttl=3600)  # Cache for 1 hour
 def load_dataset(query):
     # Always fetch fresh results for the specific query
-    with st.spinner("Searching autism research papers..."):
+    with st.spinner("Searching research papers from arXiv and PubMed..."):
         import faiss_index.index as idx
         # Ensure both autism and the query terms are included
         if 'autism' not in query.lower():
@@ -39,7 +39,7 @@ def load_dataset(query):
         else:
             search_query = query
 
-        papers = idx.fetch_arxiv_papers(search_query, max_results=25)
+        papers = idx.fetch_papers(search_query, max_results=25)  # This now fetches from both sources
 
         if not papers:
             st.warning("No relevant papers found. Please try rephrasing your question.")
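
The app-side change is small: the spinner copy now mentions both sources, and the call switches from idx.fetch_arxiv_papers to the new idx.fetch_papers. A minimal sketch of exercising the new entry point outside Streamlit (the query string is made up; it assumes the faiss_index/index.py layout from this commit):

import faiss_index.index as idx

# fetch_papers merges arXiv and PubMed results and caps them at max_results.
papers = idx.fetch_papers("autism sensory processing", max_results=5)
for paper in papers:
    # Both fetchers emit the same record shape: id, text, title, url, published.
    print(f"{paper['id']}: {paper['title']} ({paper['published']}) -> {paper['url']}")

Note that "published" is a full %Y-%m-%d date for arXiv records but only a year string for PubMed records, so consumers should treat it as free-form text.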
faiss_index/index.py CHANGED
@@ -6,6 +6,10 @@ import os
 from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
 import torch
 import logging
+import requests
+from datetime import datetime
+import xml.etree.ElementTree as ET
+from time import sleep
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -19,7 +23,7 @@ def fetch_arxiv_papers(query, max_results=10):
     client = arxiv.Client()
 
     # Clean and prepare the search query
-    query = query.replace('and', '').strip()  # Remove 'and' as it's treated as AND operator
+    query = query.replace('and', '').strip()
     terms = [term.strip() for term in query.split() if term.strip()]
 
     # Always include autism in the search
@@ -27,14 +31,21 @@ def fetch_arxiv_papers(query, max_results=10):
         terms.insert(0, 'autism')
 
     # Create search query with required autism term
-    search_query = f'(abs:"autism" OR ti:"autism") AND ({" OR ".join([f'abs:"{term}" OR ti:"{term}"' for term in terms if term.lower() != "autism"])})'
+    term_queries = []
+    for term in terms:
+        if term.lower() != "autism":
+            term_queries.append(f'abs:"{term}" OR ti:"{term}"')
+
+    search_query = '(abs:"autism" OR ti:"autism")'
+    if term_queries:
+        search_query += f' AND ({" OR ".join(term_queries)})'
     search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
 
     logging.info(f"Searching arXiv with query: {search_query}")
 
     search = arxiv.Search(
         query=search_query,
-        max_results=max_results * 2,  # Get more results to filter
+        max_results=max_results * 2,
         sort_by=arxiv.SortCriterion.Relevance
     )
 
@@ -47,11 +58,11 @@ def fetch_arxiv_papers(query, max_results=10):
             text = (result.title + " " + result.summary).lower()
             if 'autism' in text:
                 papers.append({
-                    "id": str(i),
+                    "id": f"arxiv_{i}",
                     "text": result.summary,
                     "title": result.title,
-                    "url": result.entry_id,  # Add the paper URL
-                    "published": result.published.strftime("%Y-%m-%d")  # Add publication date
+                    "url": result.entry_id,
+                    "published": result.published.strftime("%Y-%m-%d")
                 })
                 if len(papers) >= max_results:
                     break
@@ -62,6 +73,92 @@ def fetch_arxiv_papers(query, max_results=10):
         logging.error(f"Error fetching papers from arXiv: {str(e)}")
         return []
 
+def fetch_pubmed_papers(query, max_results=10):
+    """Fetch papers from PubMed using E-utilities"""
+    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
+
+    # Search for papers
+    search_url = f"{base_url}/esearch.fcgi"
+    search_params = {
+        'db': 'pubmed',
+        'term': f"{query} AND autism",
+        'retmax': max_results,
+        'sort': 'relevance',
+        'retmode': 'xml'
+    }
+
+    try:
+        # Get paper IDs
+        response = requests.get(search_url, params=search_params)
+        root = ET.fromstring(response.content)
+        id_list = [id_elem.text for id_elem in root.findall('.//Id')]
+
+        if not id_list:
+            return []
+
+        # Fetch paper details
+        fetch_url = f"{base_url}/efetch.fcgi"
+        fetch_params = {
+            'db': 'pubmed',
+            'id': ','.join(id_list),
+            'retmode': 'xml'
+        }
+
+        response = requests.get(fetch_url, params=fetch_params)
+        root = ET.fromstring(response.content)
+        papers = []
+
+        for article in root.findall('.//PubmedArticle'):
+            try:
+                # Extract article information
+                title = article.find('.//ArticleTitle').text
+                abstract = article.find('.//Abstract/AbstractText')
+                abstract = abstract.text if abstract is not None else ""
+
+                if 'autism' in (title + abstract).lower():
+                    pmid = article.find('.//PMID').text
+                    date = article.find('.//PubDate')
+                    year = date.find('Year').text if date.find('Year') is not None else "Unknown"
+
+                    papers.append({
+                        "id": f"pubmed_{pmid}",
+                        "text": abstract,
+                        "title": title,
+                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
+                        "published": year
+                    })
+            except Exception as e:
+                logging.warning(f"Error processing PubMed article: {str(e)}")
+                continue
+
+        logging.info(f"Found {len(papers)} relevant papers from PubMed")
+        return papers
+
+    except Exception as e:
+        logging.error(f"Error fetching papers from PubMed: {str(e)}")
+        return []
+
+def fetch_papers(query, max_results=10):
+    """Fetch papers from both arXiv and PubMed"""
+    arxiv_papers = fetch_arxiv_papers(query, max_results=max_results)
+    sleep(1)  # Respect rate limits
+    pubmed_papers = fetch_pubmed_papers(query, max_results=max_results)
+
+    # Combine and deduplicate papers based on title similarity
+    all_papers = arxiv_papers + pubmed_papers
+    unique_papers = []
+    seen_titles = set()
+
+    for paper in all_papers:
+        title_lower = paper['title'].lower()
+        if not any(title_lower in seen_title or seen_title in title_lower for seen_title in seen_titles):
+            unique_papers.append(paper)
+            seen_titles.add(title_lower)
+
+    # Sort by relevance (papers with 'autism' in title first)
+    unique_papers.sort(key=lambda x: 'autism' in x['title'].lower(), reverse=True)
+    return unique_papers[:max_results]
+
 def build_faiss_index(papers, dataset_dir=DATASET_DIR):
     """Build and save dataset with FAISS index for RAG"""
     if not papers:
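
The deduplication in fetch_papers treats two titles as duplicates when one contains the other after lower-casing, which catches exact and near-exact cross-source matches but not rephrasings. A toy check of that rule with invented records (no network calls; only the 'title' field matters here):

# Sample records are fabricated for illustration.
sample = [
    {"id": "arxiv_0", "title": "Autism and Sensory Processing"},
    {"id": "pubmed_123", "title": "autism and sensory processing"},
    {"id": "pubmed_456", "title": "Gut Microbiome Studies"},
]

unique, seen = [], set()
for paper in sample:
    title = paper["title"].lower()
    # Duplicate if the title contains, or is contained in, one already kept.
    if not any(title in s or s in title for s in seen):
        unique.append(paper)
        seen.add(title)

print([p["id"] for p in unique])  # ['arxiv_0', 'pubmed_456']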
requirements.txt CHANGED
@@ -9,4 +9,5 @@ torch>=2.2.0
 accelerate>=0.26.0
 bitsandbytes>=0.41.1
 numpy>=1.24.0
-pandas>=2.2.0
+pandas>=2.2.0
+requests>=2.31.0
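
requests is the only genuinely new dependency (it backs the PubMed E-utilities calls above); the pandas line likely only moves because of a trailing-newline fix. A quick sanity check that both pins resolve, assuming a standard pip install from this requirements.txt:

import pandas
import requests

print(pandas.__version__)    # expect >= 2.2.0
print(requests.__version__)  # expect >= 2.31.0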