refactor: using smol
- .DS_Store +0 -0
- _old_app.py +0 -203
- app.py +13 -14
- faiss_index/__init__.py +0 -1
- faiss_index/index.py +0 -232
.DS_Store
ADDED
Binary file (6.15 kB)
_old_app.py
DELETED
@@ -1,203 +0,0 @@
-import streamlit as st
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
-import os
-from datasets import load_from_disk, Dataset
-import torch
-import logging
-import pandas as pd
-import arxiv
-import requests
-import xml.etree.ElementTree as ET
-from agno.embedder.huggingface import HuggingfaceCustomEmbedder
-from agno.vectordb.lancedb import LanceDb, SearchType
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-
-# Define data paths and constants
-DATA_DIR = "/data" if os.path.exists("/data") else "."
-DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
-DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
-MODEL_PATH = "google/flan-t5-base"  # Lighter model
-
-@st.cache_resource
-def load_local_model():
-    """Load the local Hugging Face model"""
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-    model = AutoModelForSeq2SeqLM.from_pretrained(
-        MODEL_PATH,
-        torch_dtype=torch.float32,  # Using float32 for CPU compatibility
-        device_map="auto"
-    )
-    return model, tokenizer
-
-def fetch_arxiv_papers(query, max_results=5):
-    """Fetch papers from arXiv"""
-    client = arxiv.Client()
-
-    # Clean and prepare the search query
-    search_query = f"ti:{query} OR abs:{query} AND cat:q-bio"
-
-    # Search arXiv
-    search = arxiv.Search(
-        query=search_query,
-        max_results=max_results,
-        sort_by=arxiv.SortCriterion.Relevance
-    )
-
-    papers = []
-    for result in client.results(search):
-        papers.append({
-            'title': result.title,
-            'abstract': result.summary,
-            'url': result.pdf_url,
-            'published': result.published
-        })
-
-    return papers
-
-def fetch_pubmed_papers(query, max_results=5):
-    """Fetch papers from PubMed"""
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-
-    # Search for papers
-    search_url = f"{base_url}/esearch.fcgi"
-    search_params = {
-        'db': 'pubmed',
-        'term': query,
-        'retmax': max_results,
-        'sort': 'relevance',
-        'retmode': 'xml'
-    }
-
-    papers = []
-    try:
-        # Get paper IDs
-        response = requests.get(search_url, params=search_params)
-        root = ET.fromstring(response.content)
-        id_list = [id_elem.text for id_elem in root.findall('.//Id')]
-
-        if not id_list:
-            return papers
-
-        # Fetch paper details
-        fetch_url = f"{base_url}/efetch.fcgi"
-        fetch_params = {
-            'db': 'pubmed',
-            'id': ','.join(id_list),
-            'retmode': 'xml'
-        }
-
-        response = requests.get(fetch_url, params=fetch_params)
-        articles = ET.fromstring(response.content)
-
-        for article in articles.findall('.//PubmedArticle'):
-            title = article.find('.//ArticleTitle')
-            abstract = article.find('.//Abstract/AbstractText')
-
-            papers.append({
-                'title': title.text if title is not None else 'No title available',
-                'abstract': abstract.text if abstract is not None else 'No abstract available',
-                'url': f"https://pubmed.ncbi.nlm.nih.gov/{article.find('.//PMID').text}/",
-                'published': article.find('.//PubDate/Year').text if article.find('.//PubDate/Year') is not None else 'Unknown'
-            })
-
-    except Exception as e:
-        st.error(f"Error fetching PubMed papers: {str(e)}")
-
-    return papers
-
-def search_research_papers(query):
-    """Search both arXiv and PubMed for papers"""
-    arxiv_papers = fetch_arxiv_papers(query)
-    pubmed_papers = fetch_pubmed_papers(query)
-
-    # Combine and format papers
-    all_papers = []
-    for paper in arxiv_papers + pubmed_papers:
-        all_papers.append({
-            'title': paper['title'],
-            'text': f"Title: {paper['title']}\nAbstract: {paper['abstract']}",
-            'url': paper['url'],
-            'published': paper['published']
-        })
-
-    return pd.DataFrame(all_papers)
-
-def generate_answer(question, context, max_length=512):
-    """Generate a comprehensive answer using the local model"""
-    model, tokenizer = load_local_model()
-
-    # Format the context as a structured query
-    prompt = f"""Based on the following research papers about autism, provide a detailed answer:
-
-Question: {question}
-
-Research Context:
-{context}
-
-Please analyze:
-1. Main findings
-2. Research methods
-3. Clinical implications
-4. Limitations
-
-Answer:"""
-
-    # Generate response
-    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True)
-
-    with torch.inference_mode():
-        outputs = model.generate(
-            **inputs,
-            max_length=max_length,
-            num_beams=4,
-            temperature=0.7,
-            top_p=0.9,
-            repetition_penalty=1.2,
-            early_stopping=True
-        )
-
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Format the response
-    formatted_response = response.replace(". ", ".\n").replace("• ", "\n• ")
-
-    return formatted_response
-
-# Streamlit App
-st.title("🧩 AMA Autism")
-st.write("This app searches through scientific papers to answer your questions about autism. For best results, be specific in your questions.")
-query = st.text_input("Please ask me anything about autism ✨")
-
-if query:
-    with st.status("Searching for answers...") as status:
-        # Search for papers
-        df = search_research_papers(query)
-        st.write("Searching for data in PubMed and arXiv...")
-        st.write("Data found!")
-
-        # Get relevant context
-        context = "\n".join([
-            f"{text[:1000]}" for text in df['text'].head(3)
-        ])
-
-        # Generate answer
-        answer = generate_answer(query, context)
-        st.write("Generating answer...")
-        status.update(
-            label="Search complete!", state="complete", expanded=False
-        )
-    if answer and not answer.isspace():
-        st.success("Answer found!")
-        st.write(answer)
-
-        st.write("### Sources used:")
-        for _, row in df.head(3).iterrows():
-            st.markdown(f"**[{row['title']}]({row['url']})** ({row['published']})")
-            st.write(f"**Summary:** {row['text'][:200]}...")
-            st.write("---")
-    else:
-        st.warning("I couldn't find a specific answer in the research papers. Try rephrasing your question.")
-    if df.empty:
-        st.warning("I couldn't find any relevant research papers about this topic. Please try rephrasing your question or ask something else about autism.")
app.py
CHANGED
@@ -17,7 +17,7 @@ DATA_DIR = "/data" if os.path.exists("/data") else "."
DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
TOKENIZER_MODEL = "google/flan-t5-small"
-SUMMARIZATION_MODEL= "
+SUMMARIZATION_MODEL= "HuggingFaceTB/SmolVLM-256M-Instruct"
# SUMMARIZATION_MODEL="rhaymison/t5-portuguese-small-summarization"

@st.cache_resource
@@ -211,26 +211,25 @@ def generate_answer(question, context, max_length=512):

    # Format the input for T5 (it expects a specific format)
    input_text = f"""Objective:
-
+Provide a clear, simple, and well-structured answer about autism that is easy to understand for a general audience. Use the provided research papers as references.

Question: {clean_question}
-
Research Papers:
{clean_context}

Instructions:
-
-
-- Clearly define what autism is in an easy-to-understand way, avoiding overly complex technical terms.
+Start with a simple definition
+- Explain what autism is in a short and clear way, avoiding technical terms.
- Use real-life examples
-
-
-- Instead of just
-- Avoid
-- If a
-
-
-
+- Give practical and relatable examples to help illustrate key points.
+- Explain research in simple words
+- Instead of just citing studies, summarize their key findings in a way that anyone can understand. Example: "A study from X University found that..."
+- Avoid complex words
+- If a scientific term is needed, provide a short and simple explanation.
+- Use clear formatting
+- Write in short paragraphs, bullet points, or numbered lists to improve readability.
+- Keep a friendly tone
+- Make the response engaging and easy to follow, so people without prior knowledge can understand."""

    try:
        # T5 expects a specific format for the input
faiss_index/__init__.py
DELETED
@@ -1 +0,0 @@
-# This file makes the faiss_index directory a Python package
faiss_index/index.py
DELETED
@@ -1,232 +0,0 @@
-import numpy as np
-import faiss
-import arxiv
-from datasets import Dataset
-import os
-from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
-import torch
-import logging
-import requests
-from datetime import datetime
-import xml.etree.ElementTree as ET
-from time import sleep
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-
-# Define data paths
-DATA_DIR = os.getenv("DATA_DIR", "/data" if os.path.exists("/data") else ".")
-DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
-
-def fetch_arxiv_papers(query, max_results=10):
-    """Fetch papers from arXiv and format them for RAG"""
-    client = arxiv.Client()
-
-    # Clean and prepare the search query
-    query = query.replace('and', '').strip()
-    terms = [term.strip() for term in query.split() if term.strip()]
-
-    # Always include autism in the search
-    if 'autism' not in [t.lower() for t in terms]:
-        terms.insert(0, 'autism')
-
-    # Create search query with required autism term
-    term_queries = []
-    for term in terms:
-        if term.lower() != "autism":
-            term_queries.append(f'abs:"{term}" OR ti:"{term}"')
-
-    search_query = '(abs:"autism" OR ti:"autism")'
-    if term_queries:
-        search_query += f' AND ({" OR ".join(term_queries)})'
-    search_query = f'({search_query}) AND (cat:q-bio* OR cat:med*)'
-
-    logging.info(f"Searching arXiv with query: {search_query}")
-
-    search = arxiv.Search(
-        query=search_query,
-        max_results=max_results * 2,
-        sort_by=arxiv.SortCriterion.Relevance
-    )
-
-    try:
-        results = list(client.results(search))
-        papers = []
-
-        for i, result in enumerate(results):
-            # Only include papers that mention autism
-            text = (result.title + " " + result.summary).lower()
-            if 'autism' in text:
-                papers.append({
-                    "id": f"arxiv_{i}",
-                    "text": result.summary,
-                    "title": result.title,
-                    "url": result.entry_id,
-                    "published": result.published.strftime("%Y-%m-%d")
-                })
-            if len(papers) >= max_results:
-                break
-
-        logging.info(f"Found {len(papers)} relevant papers about autism from arXiv")
-        return papers
-    except Exception as e:
-        logging.error(f"Error fetching papers from arXiv: {str(e)}")
-        return []
-
-def fetch_pubmed_papers(query, max_results=10):
-    """Fetch papers from PubMed using E-utilities"""
-    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-
-    # Search for papers
-    search_url = f"{base_url}/esearch.fcgi"
-    search_params = {
-        'db': 'pubmed',
-        'term': f"{query} AND autism",
-        'retmax': max_results,
-        'sort': 'relevance',
-        'retmode': 'xml'
-    }
-
-    try:
-        # Get paper IDs
-        response = requests.get(search_url, params=search_params)
-        root = ET.fromstring(response.content)
-        id_list = [id_elem.text for id_elem in root.findall('.//Id')]
-
-        if not id_list:
-            return []
-
-        # Fetch paper details
-        fetch_url = f"{base_url}/efetch.fcgi"
-        fetch_params = {
-            'db': 'pubmed',
-            'id': ','.join(id_list),
-            'retmode': 'xml'
-        }
-
-        response = requests.get(fetch_url, params=fetch_params)
-        root = ET.fromstring(response.content)
-        papers = []
-
-        for article in root.findall('.//PubmedArticle'):
-            try:
-                # Extract article information
-                title = article.find('.//ArticleTitle').text
-                abstract = article.find('.//Abstract/AbstractText')
-                abstract = abstract.text if abstract is not None else ""
-
-                if 'autism' in (title + abstract).lower():
-                    pmid = article.find('.//PMID').text
-                    date = article.find('.//PubDate')
-                    year = date.find('Year').text if date.find('Year') is not None else "Unknown"
-
-                    papers.append({
-                        "id": f"pubmed_{pmid}",
-                        "text": abstract,
-                        "title": title,
-                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
-                        "published": year
-                    })
-            except Exception as e:
-                logging.warning(f"Error processing PubMed article: {str(e)}")
-                continue
-
-        logging.info(f"Found {len(papers)} relevant papers from PubMed")
-        return papers
-
-    except Exception as e:
-        logging.error(f"Error fetching papers from PubMed: {str(e)}")
-        return []
-
-def fetch_papers(query, max_results=10):
-    """Fetch papers from both arXiv and PubMed"""
-    arxiv_papers = fetch_arxiv_papers(query, max_results=max_results)
-    sleep(1)  # Respect rate limits
-    pubmed_papers = fetch_pubmed_papers(query, max_results=max_results)
-
-    # Combine and deduplicate papers based on title similarity
-    all_papers = arxiv_papers + pubmed_papers
-    unique_papers = []
-    seen_titles = set()
-
-    for paper in all_papers:
-        title_lower = paper['title'].lower()
-        if not any(title_lower in seen_title or seen_title in title_lower for seen_title in seen_titles):
-            unique_papers.append(paper)
-            seen_titles.add(title_lower)
-
-    # Sort by relevance (papers with 'autism' in title first)
-    unique_papers.sort(key=lambda x: 'autism' in x['title'].lower(), reverse=True)
-    return unique_papers[:max_results]
-
-def build_faiss_index(papers, dataset_dir=DATASET_DIR):
-    """Build and save dataset with FAISS index for RAG"""
-    if not papers:
-        logging.warning("No papers found. Creating empty dataset.")
-        # Create an empty dataset with the expected structure
-        dataset = Dataset.from_dict({
-            "text": [],
-            "embeddings": [],
-            "title": []
-        })
-        os.makedirs(dataset_dir, exist_ok=True)
-        dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
-        return dataset_dir
-
-    # Initialize smaller DPR encoder
-    ctx_encoder = DPRContextEncoder.from_pretrained(
-        "facebook/dpr-ctx_encoder-single-nq-base",
-        torch_dtype=torch.float16,
-        low_cpu_mem_usage=True
-    )
-    ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
-
-    # Create embeddings with smaller batches and memory optimization
-    texts = [p["text"] for p in papers]
-    embeddings = []
-    batch_size = 4  # Smaller batch size
-
-    with torch.inference_mode():
-        for i in range(0, len(texts), batch_size):
-            batch_texts = texts[i:i + batch_size]
-            inputs = ctx_tokenizer(
-                batch_texts,
-                max_length=256,  # Reduced from default
-                padding=True,
-                truncation=True,
-                return_tensors="pt"
-            )
-            outputs = ctx_encoder(**inputs)
-            embeddings.extend(outputs.pooler_output.cpu().numpy())
-
-            # Clear memory
-            del outputs
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-    # Convert to numpy array and build FAISS index
-    embeddings = np.array(embeddings, dtype=np.float32)  # Ensure float32 type
-    dimension = embeddings.shape[1]
-
-    # Normalize the vectors manually
-    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
-    embeddings = embeddings / norms
-
-    # Create FAISS index
-    index = faiss.IndexFlatIP(dimension)
-    index.add(embeddings)
-
-    # Create and save the dataset
-    dataset = Dataset.from_dict({
-        "text": texts,
-        "embeddings": embeddings.tolist(),  # Convert to list for storage
-        "title": [p["title"] for p in papers]
-    })
-
-    # Create directory if it doesn't exist
-    os.makedirs(dataset_dir, exist_ok=True)
-
-    # Save dataset
-    dataset.save_to_disk(os.path.join(dataset_dir, "dataset"))
-    logging.info(f"Dataset saved to {dataset_dir}")
-    return dataset_dir
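As an aside on the removed index builder: because build_faiss_index() L2-normalized the DPR embeddings before adding them to an IndexFlatIP, inner-product search over that index is equivalent to cosine similarity. A minimal, self-contained sketch of querying such an index follows; random vectors stand in for real embeddings, and the 768 dimension is the usual DPR output size, assumed here for illustration.

# Illustrative sketch only: querying an inner-product FAISS index like the one
# the deleted build_faiss_index() produced. With L2-normalized vectors,
# inner product equals cosine similarity.
import numpy as np
import faiss

dim = 768  # assumed DPR embedding size
docs = np.random.rand(100, dim).astype(np.float32)
docs /= np.linalg.norm(docs, axis=1, keepdims=True)  # normalize rows

index = faiss.IndexFlatIP(dim)  # exact inner-product index
index.add(docs)

query = np.random.rand(1, dim).astype(np.float32)
query /= np.linalg.norm(query, axis=1, keepdims=True)

scores, ids = index.search(query, 5)  # top-5 documents by cosine similarity
print(ids[0], scores[0])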