wakeupmh committed
Commit 4660a83 · 1 Parent(s): cb9a068

test: agno

Files changed (3):
  1. _old_app.py +203 -0
  2. app.py +134 -72
  3. requirements.txt +4 -5
_old_app.py ADDED
@@ -0,0 +1,203 @@
+ import streamlit as st
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+ import os
+ from datasets import load_from_disk, Dataset
+ import torch
+ import logging
+ import pandas as pd
+ import arxiv
+ import requests
+ import xml.etree.ElementTree as ET
+ from agno.embedder.huggingface import HuggingfaceCustomEmbedder
+ from agno.vectordb.lancedb import LanceDb, SearchType
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Define data paths and constants
+ DATA_DIR = "/data" if os.path.exists("/data") else "."
+ DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
+ DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
+ MODEL_PATH = "google/flan-t5-base"  # Lighter model
+
+ @st.cache_resource
+ def load_local_model():
+     """Load the local Hugging Face model"""
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+     model = AutoModelForSeq2SeqLM.from_pretrained(
+         MODEL_PATH,
+         torch_dtype=torch.float32,  # Using float32 for CPU compatibility
+         device_map="auto"
+     )
+     return model, tokenizer
+
+ def fetch_arxiv_papers(query, max_results=5):
+     """Fetch papers from arXiv"""
+     client = arxiv.Client()
+
+     # Clean and prepare the search query
+     search_query = f"ti:{query} OR abs:{query} AND cat:q-bio"
+
+     # Search arXiv
+     search = arxiv.Search(
+         query=search_query,
+         max_results=max_results,
+         sort_by=arxiv.SortCriterion.Relevance
+     )
+
+     papers = []
+     for result in client.results(search):
+         papers.append({
+             'title': result.title,
+             'abstract': result.summary,
+             'url': result.pdf_url,
+             'published': result.published
+         })
+
+     return papers
+
+ def fetch_pubmed_papers(query, max_results=5):
+     """Fetch papers from PubMed"""
+     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
+
+     # Search for papers
+     search_url = f"{base_url}/esearch.fcgi"
+     search_params = {
+         'db': 'pubmed',
+         'term': query,
+         'retmax': max_results,
+         'sort': 'relevance',
+         'retmode': 'xml'
+     }
+
+     papers = []
+     try:
+         # Get paper IDs
+         response = requests.get(search_url, params=search_params)
+         root = ET.fromstring(response.content)
+         id_list = [id_elem.text for id_elem in root.findall('.//Id')]
+
+         if not id_list:
+             return papers
+
+         # Fetch paper details
+         fetch_url = f"{base_url}/efetch.fcgi"
+         fetch_params = {
+             'db': 'pubmed',
+             'id': ','.join(id_list),
+             'retmode': 'xml'
+         }
+
+         response = requests.get(fetch_url, params=fetch_params)
+         articles = ET.fromstring(response.content)
+
+         for article in articles.findall('.//PubmedArticle'):
+             title = article.find('.//ArticleTitle')
+             abstract = article.find('.//Abstract/AbstractText')
+
+             papers.append({
+                 'title': title.text if title is not None else 'No title available',
+                 'abstract': abstract.text if abstract is not None else 'No abstract available',
+                 'url': f"https://pubmed.ncbi.nlm.nih.gov/{article.find('.//PMID').text}/",
+                 'published': article.find('.//PubDate/Year').text if article.find('.//PubDate/Year') is not None else 'Unknown'
+             })
+
+     except Exception as e:
+         st.error(f"Error fetching PubMed papers: {str(e)}")
+
+     return papers
+
+ def search_research_papers(query):
+     """Search both arXiv and PubMed for papers"""
+     arxiv_papers = fetch_arxiv_papers(query)
+     pubmed_papers = fetch_pubmed_papers(query)
+
+     # Combine and format papers
+     all_papers = []
+     for paper in arxiv_papers + pubmed_papers:
+         all_papers.append({
+             'title': paper['title'],
+             'text': f"Title: {paper['title']}\nAbstract: {paper['abstract']}",
+             'url': paper['url'],
+             'published': paper['published']
+         })
+
+     return pd.DataFrame(all_papers)
+
+ def generate_answer(question, context, max_length=512):
+     """Generate a comprehensive answer using the local model"""
+     model, tokenizer = load_local_model()
+
+     # Format the context as a structured query
+     prompt = f"""Based on the following research papers about autism, provide a detailed answer:
+
+ Question: {question}
+
+ Research Context:
+ {context}
+
+ Please analyze:
+ 1. Main findings
+ 2. Research methods
+ 3. Clinical implications
+ 4. Limitations
+
+ Answer:"""
+
+     # Generate response
+     inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True)
+
+     with torch.inference_mode():
+         outputs = model.generate(
+             **inputs,
+             max_length=max_length,
+             num_beams=4,
+             temperature=0.7,
+             top_p=0.9,
+             repetition_penalty=1.2,
+             early_stopping=True
+         )
+
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     # Format the response
+     formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")
+
+     return formatted_response
+
+ # Streamlit App
+ st.title("🧩 AMA Autism")
+ st.write("This app searches through scientific papers to answer your questions about autism. For best results, be specific in your questions.")
+ query = st.text_input("Please ask me anything about autism ✨")
+
+ if query:
+     with st.status("Searching for answers...") as status:
+         # Search for papers
+         df = search_research_papers(query)
+         st.write("Searching for data in PubMed and arXiv...")
+         st.write("Data found!")
+
+         # Get relevant context
+         context = "\n".join([
+             f"{text[:1000]}" for text in df['text'].head(3)
+         ])
+
+         # Generate answer
+         answer = generate_answer(query, context)
+         st.write("Generating answer...")
+         status.update(
+             label="Search complete!", state="complete", expanded=False
+         )
+         if answer and not answer.isspace():
+             st.success("Answer found!")
+             st.write(answer)
+
+             st.write("### Sources used:")
+             for _, row in df.head(3).iterrows():
+                 st.markdown(f"**[{row['title']}]({row['url']})** ({row['published']})")
+                 st.write(f"**Summary:** {row['text'][:200]}...")
+                 st.write("---")
+         else:
+             st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
+         if df.empty:
+             st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
app.py CHANGED
@@ -1,109 +1,169 @@
  import streamlit as st
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
  import os
  from datasets import load_from_disk, Dataset
  import torch
  import logging
  import pandas as pd
+ import arxiv
+ import requests
+ import xml.etree.ElementTree as ET
+ from agno.embedder.huggingface import HuggingfaceCustomEmbedder
+ from agno.vectordb.lancedb import LanceDb, SearchType

  # Configure logging
  logging.basicConfig(level=logging.INFO)

- # Define data paths
+ # Define data paths and constants
  DATA_DIR = "/data" if os.path.exists("/data") else "."
  DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
  DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
+ MODEL_PATH = "google/flan-t5-base"  # Lighter model

- # Cache models and dataset
  @st.cache_resource
- def load_models():
-     model_name = "google/flan-t5-small"  # Lighter model
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
+ def load_local_model():
+     """Load the local Hugging Face model"""
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
      model = AutoModelForSeq2SeqLM.from_pretrained(
-         model_name,
-         torch_dtype=torch.float16,
-         low_cpu_mem_usage=True,
-         device_map='auto',
-         max_memory={'cpu': '1GB'}
+         MODEL_PATH,
+         torch_dtype=torch.float32,  # Using float32 for CPU compatibility
+         device_map="auto"
      )
-     return tokenizer, model
+     return model, tokenizer

- @st.cache_data(ttl=3600)  # Cache for 1 hour
- def load_dataset(query):
-     # Always fetch fresh results for the specific query
-     with st.spinner("Searching research papers from arXiv and PubMed..."):
-         import faiss_index.index as idx
-         # Ensure both autism and the query terms are included
-         if 'autism' not in query.lower():
-             search_query = f"autism {query}"
-         else:
-             search_query = query
-
-         papers = idx.fetch_papers(search_query, max_results=25)  # This now fetches from both sources
+ def fetch_arxiv_papers(query, max_results=5):
+     """Fetch papers from arXiv"""
+     client = arxiv.Client()

-         if not papers:
-             st.warning("No relevant papers found. Please try rephrasing your question.")
-             return pd.DataFrame(columns=['title', 'text', 'url', 'published'])
-
-         idx.build_faiss_index(papers, dataset_dir=DATASET_DIR)
+     # Clean and prepare the search query
+     search_query = f"ti:{query} OR abs:{query} AND cat:q-bio"
+
+     # Search arXiv
+     search = arxiv.Search(
+         query=search_query,
+         max_results=max_results,
+         sort_by=arxiv.SortCriterion.Relevance
+     )
+
+     papers = []
+     for result in client.results(search):
+         papers.append({
+             'title': result.title,
+             'abstract': result.summary,
+             'url': result.pdf_url,
+             'published': result.published
+         })

-     # Load and convert to pandas for easier handling
-     dataset = load_from_disk(DATASET_PATH)
-     df = pd.DataFrame({
-         'title': dataset['title'],
-         'text': dataset['text'],
-         'url': [p['url'] for p in papers],
-         'published': [p['published'] for p in papers]
-     })
-     return df
+     return papers

- def generate_answer(question, context, max_length=300):
-     tokenizer, model = load_models()
+ def fetch_pubmed_papers(query, max_results=5):
+     """Fetch papers from PubMed"""
+     base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
+
+     # Search for papers
+     search_url = f"{base_url}/esearch.fcgi"
+     search_params = {
+         'db': 'pubmed',
+         'term': query,
+         'retmax': max_results,
+         'sort': 'relevance',
+         'retmode': 'xml'
+     }
+
+     papers = []
+     try:
+         # Get paper IDs
+         response = requests.get(search_url, params=search_params)
+         root = ET.fromstring(response.content)
+         id_list = [id_elem.text for id_elem in root.findall('.//Id')]
+
+         if not id_list:
+             return papers
+
+         # Fetch paper details
+         fetch_url = f"{base_url}/efetch.fcgi"
+         fetch_params = {
+             'db': 'pubmed',
+             'id': ','.join(id_list),
+             'retmode': 'xml'
+         }
+
+         response = requests.get(fetch_url, params=fetch_params)
+         articles = ET.fromstring(response.content)
+
+         for article in articles.findall('.//PubmedArticle'):
+             title = article.find('.//ArticleTitle')
+             abstract = article.find('.//Abstract/AbstractText')
+
+             papers.append({
+                 'title': title.text if title is not None else 'No title available',
+                 'abstract': abstract.text if abstract is not None else 'No abstract available',
+                 'url': f"https://pubmed.ncbi.nlm.nih.gov/{article.find('.//PMID').text}/",
+                 'published': article.find('.//PubDate/Year').text if article.find('.//PubDate/Year') is not None else 'Unknown'
+             })
+
+     except Exception as e:
+         st.error(f"Error fetching PubMed papers: {str(e)}")

-     # Enhanced prompt for more detailed and structured answers
-     prompt = f"""Based on scientific research about autism, provide a comprehensive and structured summary answering the following question.
-     Include the following aspects when relevant:
-     1. Main findings and conclusions
-     2. Supporting evidence or research methods
-     3. Clinical implications or practical applications
-     4. Any limitations or areas needing further research
+     return papers
+
+ def search_research_papers(query):
+     """Search both arXiv and PubMed for papers"""
+     arxiv_papers = fetch_arxiv_papers(query)
+     pubmed_papers = fetch_pubmed_papers(query)

-     Use clear headings and bullet points when appropriate to organize the information.
-     If the context doesn't contain relevant information about autism, respond with 'I cannot find specific information about this topic in the autism research papers.'
+     # Combine and format papers
+     all_papers = []
+     for paper in arxiv_papers + pubmed_papers:
+         all_papers.append({
+             'title': paper['title'],
+             'text': f"Title: {paper['title']}\nAbstract: {paper['abstract']}",
+             'url': paper['url'],
+             'published': paper['published']
+         })

-     Question: {question}
-     Context: {context}
+     return pd.DataFrame(all_papers)
+
+ def generate_answer(question, context, max_length=512):
+     """Generate a comprehensive answer using the local model"""
+     model, tokenizer = load_local_model()

-     Detailed summary:"""
+     # Format the context as a structured query
+     prompt = f"""Based on the following research papers about autism, provide a detailed answer:
+
+ Question: {question}
+
+ Research Context:
+ {context}
+
+ Please analyze:
+ 1. Main findings
+ 2. Research methods
+ 3. Clinical implications
+ 4. Limitations
+
+ Answer:"""

-     # Optimize input processing
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=768)
+     # Generate response
+     inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True)

      with torch.inference_mode():
          outputs = model.generate(
              **inputs,
              max_length=max_length,
-             num_beams=4,
-             temperature=0.8,
+             num_beams=4,
+             temperature=0.7,
              top_p=0.9,
-             repetition_penalty=1.3,
-             length_penalty=1.2,
+             repetition_penalty=1.2,
              early_stopping=True
          )

-     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-     # Clear GPU memory if possible
-     if torch.cuda.is_available():
-         torch.cuda.empty_cache()
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)

-     # Enhanced answer validation and formatting
-     if not answer or answer.isspace() or "cannot find" in answer.lower():
-         return "I cannot find specific information about this topic in the autism research papers."
+     # Format the response
+     formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")

-     # Format the answer with proper line breaks and structure
-     formatted_answer = answer.replace(". ", ".\n").replace("• ", "\n• ")
-     return formatted_answer
+     return formatted_response

  # Streamlit App
  st.title("🧩 AMA Autism")
@@ -112,14 +172,16 @@ query = st.text_input("Please ask me anything about autism ✨")

  if query:
      with st.status("Searching for answers...") as status:
-         # Load dataset
-         df = load_dataset(query)
+         # Search for papers
+         df = search_research_papers(query)
          st.write("Searching for data in PubMed and arXiv...")
+         st.write("Data found!")
+
          # Get relevant context
          context = "\n".join([
              f"{text[:1000]}" for text in df['text'].head(3)
          ])
-         st.write("Data found!")
+
          # Generate answer
          answer = generate_answer(query, context)
          st.write("Generating answer...")
requirements.txt CHANGED
@@ -1,13 +1,12 @@
  streamlit>=1.32.0
  transformers>=4.37.0
  datasets>=2.17.0
- sentence-transformers>=2.3.1
- faiss-cpu>=1.7.4
- arxiv>=2.1.0
  --extra-index-url https://download.pytorch.org/whl/cpu
  torch>=2.2.0
  accelerate>=0.26.0
- bitsandbytes>=0.41.1
  numpy>=1.24.0
  pandas>=2.2.0
- requests>=2.31.0
+ requests>=2.31.0
+ arxiv>=2.1.0
+ lancedb>=0.3.3
+ tantivy>=0.19.2
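
In requirements.txt, faiss-cpu and sentence-transformers give way to lancedb plus tantivy; tantivy is the Rust full-text engine behind LanceDB's full-text index, which is presumably why the two are pinned together. A minimal sketch of that pairing, assuming the lancedb Python API of this era (connect/create_table/create_fts_index; the exact search call varies across lancedb versions, and the path and sample row below are illustrative):

    import lancedb

    # Sketch only: connect to a local LanceDB directory and index text.
    db = lancedb.connect("/data/lancedb")
    table = db.create_table(
        "papers",
        data=[{"text": "Title: ...\nAbstract: ...", "url": "https://example.org"}],
    )
    table.create_fts_index("text")  # full-text index, backed by tantivy
    hits = table.search("autism intervention").limit(3).to_pandas()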