Spaces:
Sleeping
Sleeping
refactor: use classes
Browse files
app.py
CHANGED
@@ -8,6 +8,10 @@ import arxiv
|
|
8 |
import requests
|
9 |
import xml.etree.ElementTree as ET
|
10 |
import re
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Configure logging
|
13 |
logging.basicConfig(level=logging.INFO)
|
@@ -16,392 +20,330 @@ logging.basicConfig(level=logging.INFO)
|
|
16 |
DATA_DIR = "/data" if os.path.exists("/data") else "."
|
17 |
DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
|
18 |
DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
|
19 |
-
|
20 |
-
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
# Remove any remaining weird characters
|
49 |
-
text = ''.join(char for char in text if ord(char) < 128)
|
50 |
-
|
51 |
-
return text.strip()
|
52 |
-
|
53 |
-
def format_paper(title, abstract):
|
54 |
-
"""Format paper information consistently"""
|
55 |
-
title = clean_text(title)
|
56 |
-
abstract = clean_text(abstract)
|
57 |
-
|
58 |
-
if len(abstract) > 1000:
|
59 |
-
abstract = abstract[:997] + "..."
|
60 |
-
|
61 |
-
return f"""Title: {title}
|
62 |
-
|
63 |
-
Abstract: {abstract}
|
64 |
-
|
65 |
-
---"""
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
max_results=max_results,
|
78 |
-
sort_by=arxiv.SortCriterion.Relevance
|
79 |
-
)
|
80 |
-
|
81 |
-
papers = []
|
82 |
-
for result in client.results(search):
|
83 |
-
# Only include papers that mention autism in title or abstract
|
84 |
-
if ('autism' in result.title.lower() or
|
85 |
-
'asd' in result.title.lower() or
|
86 |
-
'autism' in result.summary.lower() or
|
87 |
-
'asd' in result.summary.lower()):
|
88 |
-
papers.append({
|
89 |
-
'title': result.title,
|
90 |
-
'abstract': result.summary,
|
91 |
-
'url': result.pdf_url,
|
92 |
-
'published': result.published.strftime("%Y-%m-%d"),
|
93 |
-
'relevance_score': 1 if 'autism' in result.title.lower() else 0.5
|
94 |
-
})
|
95 |
-
|
96 |
-
return papers
|
97 |
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
# Always include autism in the search term
|
103 |
-
search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"
|
104 |
-
|
105 |
-
# Search for papers
|
106 |
-
search_url = f"{base_url}/esearch.fcgi"
|
107 |
-
search_params = {
|
108 |
-
'db': 'pubmed',
|
109 |
-
'term': search_term,
|
110 |
-
'retmax': max_results,
|
111 |
-
'sort': 'relevance',
|
112 |
-
'retmode': 'xml'
|
113 |
-
}
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
id_list = [id_elem.text for id_elem in root.findall('.//Id')]
|
121 |
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
title = article.find('.//ArticleTitle')
|
138 |
abstract = article.find('.//Abstract/AbstractText')
|
139 |
year = article.find('.//PubDate/Year')
|
140 |
-
pmid = article.find('.//PMID')
|
141 |
|
142 |
if title is not None and abstract is not None:
|
143 |
title_text = title.text.lower()
|
144 |
abstract_text = abstract.text.lower()
|
145 |
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
'
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
def
|
163 |
-
|
164 |
-
|
165 |
-
pubmed_papers = fetch_pubmed_papers(query)
|
166 |
-
|
167 |
-
# Combine and format papers
|
168 |
-
all_papers = []
|
169 |
-
for paper in arxiv_papers + pubmed_papers:
|
170 |
-
if paper['abstract'] and len(paper['abstract'].strip()) > 0:
|
171 |
-
# Clean and format the paper content
|
172 |
-
clean_title = clean_text(paper['title'])
|
173 |
-
clean_abstract = clean_text(paper['abstract'])
|
174 |
-
|
175 |
-
# Check if the paper is actually about autism
|
176 |
-
if ('autism' in clean_title.lower() or
|
177 |
-
'asd' in clean_title.lower() or
|
178 |
-
'autism' in clean_abstract.lower() or
|
179 |
-
'asd' in clean_abstract.lower()):
|
180 |
-
|
181 |
-
formatted_text = format_paper(clean_title, clean_abstract)
|
182 |
-
|
183 |
-
all_papers.append({
|
184 |
-
'title': clean_title,
|
185 |
-
'text': formatted_text,
|
186 |
-
'url': paper['url'],
|
187 |
-
'published': paper['published'],
|
188 |
-
'relevance_score': paper.get('relevance_score', 0.5)
|
189 |
-
})
|
190 |
-
|
191 |
-
# Sort papers by relevance score and convert to DataFrame
|
192 |
-
all_papers.sort(key=lambda x: x['relevance_score'], reverse=True)
|
193 |
-
df = pd.DataFrame(all_papers)
|
194 |
-
|
195 |
-
if df.empty:
|
196 |
-
st.warning("No autism-related papers found. Please try a different search term.")
|
197 |
-
return pd.DataFrame(columns=['title', 'text', 'url', 'published', 'relevance_score'])
|
198 |
-
|
199 |
-
return df
|
200 |
-
|
201 |
-
def generate_answer(question, context, max_length=512):
|
202 |
-
"""Generate a comprehensive answer using the local model"""
|
203 |
-
model, tokenizer = load_local_model()
|
204 |
-
|
205 |
-
if model is None or tokenizer is None:
|
206 |
-
return "Error: Could not load the model. Please try again later."
|
207 |
-
|
208 |
-
# Clean and format the context
|
209 |
-
clean_context = clean_text(context)
|
210 |
-
clean_question = clean_text(question)
|
211 |
-
|
212 |
-
# Format the input for T5 (it expects a specific format)
|
213 |
-
input_text = f"""Context
|
214 |
-
Input Question: {clean_question}
|
215 |
-
Source Materials: {clean_context}
|
216 |
-
Primary Objective
|
217 |
-
Generate a comprehensive yet accessible summary of autism research that bridges the gap between academic knowledge and public understanding. The response should be evidence-based while remaining engaging and practical for general readers.
|
218 |
-
Content Structure
|
219 |
-
1. Opening Overview
|
220 |
-
|
221 |
-
Begin with a concise, jargon-free definition of autism
|
222 |
-
Frame the topic within everyday experiences
|
223 |
-
Establish relevance to the reader's understanding
|
224 |
-
|
225 |
-
2. Key Concepts Breakdown
|
226 |
-
|
227 |
-
Transform complex research findings into digestible information
|
228 |
-
Structure information in a logical progression
|
229 |
-
Connect each point to real-world scenarios
|
230 |
-
|
231 |
-
3. Research Integration
|
232 |
-
Present research findings using this framework:
|
233 |
-
|
234 |
-
Main finding: [Clear statement of what was discovered]
|
235 |
-
Real-world meaning: [Practical implications]
|
236 |
-
Context: [How this fits into broader understanding]
|
237 |
-
|
238 |
-
4. Examples and Applications
|
239 |
-
Include:
|
240 |
-
|
241 |
-
Concrete, relatable scenarios
|
242 |
-
Day-to-day situations
|
243 |
-
Practical implications for families and individuals
|
244 |
-
|
245 |
-
Writing Guidelines
|
246 |
-
Language Requirements
|
247 |
-
|
248 |
-
Target reading level: 8th grade
|
249 |
-
Sentence length: Maximum 20 words
|
250 |
-
Paragraph length: 2-4 sentences
|
251 |
-
Technical terms: Must include plain language explanation in parentheses
|
252 |
-
|
253 |
-
Tone and Style
|
254 |
-
|
255 |
-
Empathetic and respectful
|
256 |
-
Solution-focused approach
|
257 |
-
Balanced perspective
|
258 |
-
Inclusive language
|
259 |
-
|
260 |
-
Formatting Specifications
|
261 |
-
|
262 |
-
Use headers for major sections
|
263 |
-
Include white space between concepts
|
264 |
-
Implement bullet points for lists
|
265 |
-
Bold key terms with immediate explanations
|
266 |
-
|
267 |
-
Research Citation Format
|
268 |
-
When referencing studies, follow this pattern:
|
269 |
-
"Research from [Institution] shows [finding in simple terms]. This means [practical interpretation]."
|
270 |
-
Quality Checks
|
271 |
-
Before finalizing, ensure the summary:
|
272 |
-
|
273 |
-
Answers the original question directly
|
274 |
-
Maintains scientific accuracy while being accessible
|
275 |
-
Provides actionable insights
|
276 |
-
Respects neurodiversity perspectives
|
277 |
-
Balances depth with clarity
|
278 |
-
|
279 |
-
Response Framework
|
280 |
-
|
281 |
-
Introduction (2-3 sentences)
|
282 |
-
|
283 |
-
Core definition
|
284 |
-
Relevance statement
|
285 |
-
|
286 |
-
|
287 |
-
Main Body (3-4 key points)
|
288 |
-
|
289 |
-
Evidence-based insights
|
290 |
-
Practical examples
|
291 |
-
Real-world applications
|
292 |
-
|
293 |
-
|
294 |
-
Conclusion (2-3 sentences)
|
295 |
-
|
296 |
-
Summary of key takeaways
|
297 |
-
Actionable next steps or implications
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
Engagement Elements
|
302 |
-
|
303 |
-
Include thought-provoking questions
|
304 |
-
Provide relatable scenarios
|
305 |
-
Connect to common experiences
|
306 |
-
Offer practical applications
|
307 |
-
|
308 |
-
Modified Output Analysis
|
309 |
-
The response should be evaluated against these criteria:
|
310 |
-
|
311 |
-
Clarity: Is the information immediately understandable?
|
312 |
-
Accuracy: Does it reflect the research correctly?
|
313 |
-
Relevance: Does it address the specific question?
|
314 |
-
Practicality: Are the insights actionable?
|
315 |
-
Engagement: Does it maintain reader interest?
|
316 |
-
|
317 |
-
Special Considerations
|
318 |
-
|
319 |
-
Acknowledge spectrum nature of autism
|
320 |
-
Respect diverse perspectives
|
321 |
-
Focus on strengths and challenges
|
322 |
-
Avoid deficit-based language
|
323 |
-
Include support-oriented information
|
324 |
-
|
325 |
-
Remember to adapt the depth and complexity based on the specific question while maintaining accessibility and scientific accuracy."""
|
326 |
-
|
327 |
-
try:
|
328 |
-
# T5 expects a specific format for the input
|
329 |
-
inputs = tokenizer(input_text,
|
330 |
-
return_tensors="pt",
|
331 |
-
max_length=1024,
|
332 |
-
truncation=True,
|
333 |
-
padding=True)
|
334 |
-
|
335 |
-
with torch.inference_mode():
|
336 |
-
outputs = model.generate(
|
337 |
-
**inputs,
|
338 |
-
max_length=max_length,
|
339 |
-
min_length=200,
|
340 |
-
num_beams=3, # Reduzindo para mais variedade
|
341 |
-
length_penalty=1.2, # Melhor equilíbrio entre concisão e detalhes
|
342 |
-
temperature=0.8, # Aumentando um pouco para mais fluidez
|
343 |
-
repetition_penalty=1.2,
|
344 |
-
early_stopping=True,
|
345 |
-
no_repeat_ngram_size=2, # Mantendo variação no texto
|
346 |
-
do_sample=True,
|
347 |
-
top_k=30, # Reduzindo para respostas mais coerentes
|
348 |
-
top_p=0.9 # Equilibrando diversidade e precisão
|
349 |
-
)
|
350 |
-
|
351 |
|
352 |
-
|
353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
|
|
|
|
364 |
|
365 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
366 |
|
367 |
-
|
368 |
-
|
|
|
|
|
|
|
|
|
|
|
369 |
|
370 |
-
return
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
st.
|
380 |
-
|
381 |
-
|
382 |
-
""
|
383 |
-
|
384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
|
386 |
-
if
|
387 |
-
|
388 |
-
# Search for papers
|
389 |
-
df = search_research_papers(query)
|
390 |
-
|
391 |
-
st.write("Searching for data in PubMed and arXiv...")
|
392 |
-
st.write(f"Found {len(df)} relevant papers!")
|
393 |
-
|
394 |
-
# Get relevant context
|
395 |
-
context = "\n".join([
|
396 |
-
f"{text[:1000]}" for text in df['text'].head(3)
|
397 |
-
])
|
398 |
-
|
399 |
-
# Generate answer
|
400 |
-
st.write("Generating answer...")
|
401 |
-
answer = generate_answer(query, context)
|
402 |
-
# Display paper sources
|
403 |
-
with st.expander("View source papers"):
|
404 |
-
for _, paper in df.iterrows():
|
405 |
-
st.markdown(f"- [{paper['title']}]({paper['url']}) ({paper['published']})")
|
406 |
-
st.success("Answer found!")
|
407 |
-
st.markdown(answer)
|
|
|
8 |
import requests
|
9 |
import xml.etree.ElementTree as ET
|
10 |
import re
|
11 |
+
from functools import lru_cache
|
12 |
+
from typing import List, Dict, Optional
|
13 |
+
from dataclasses import dataclass
|
14 |
+
from concurrent.futures import ThreadPoolExecutor
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO)
|
|
|
20 |
# Prefer the persistent /data volume when it exists (e.g. on a hosted Space);
# otherwise fall back to the current working directory.
DATA_DIR = "/data" if os.path.exists("/data") else "."
DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
# NOTE(review): assumes a seq2seq checkpoint loadable via T5ForConditionalGeneration — confirm.
MODEL_PATH = "google/mt5-base"

# Constants for better maintainability
MAX_ABSTRACT_LENGTH = 1000  # abstracts longer than this are truncated with a "..." suffix
MAX_PAPERS = 5              # per-source result cap (applied to both arXiv and PubMed)
CACHE_SIZE = 128            # lru_cache bound for the fetcher methods
29 |
+
|
30 |
+
@dataclass
class Paper:
    """A single research-paper record, normalized across arXiv and PubMed."""
    title: str
    abstract: str
    url: str                 # PDF link for arXiv results; article page URL for PubMed
    published: str           # "YYYY-MM-DD" for arXiv; year string or 'Unknown' for PubMed
    relevance_score: float   # 1.0 when autism terms appear in the title, else 0.5
|
37 |
+
|
38 |
+
class TextProcessor:
    """Stateless helpers for normalizing and presenting paper text."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Return *text* with odd symbols removed, whitespace collapsed, ASCII-only."""
        if not text:
            return ""

        # Keep word chars, whitespace and basic punctuation; blank out the rest.
        desymbolized = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
        # Collapse any run of whitespace to a single space.
        collapsed = re.sub(r'\s+', ' ', desymbolized)
        # Drop whatever cannot be represented in ASCII (accents, emoji, ...).
        ascii_only = collapsed.encode('ascii', 'ignore').decode('ascii')

        return ascii_only.strip()

    @staticmethod
    def format_paper(title: str, abstract: str) -> str:
        """Render one paper as a 'Title/Abstract' entry terminated by '---'."""
        clean_title = TextProcessor.clean_text(title)
        clean_abstract = TextProcessor.clean_text(abstract)

        # Cap the abstract so one verbose paper cannot dominate the context.
        if len(clean_abstract) > MAX_ABSTRACT_LENGTH:
            clean_abstract = clean_abstract[:MAX_ABSTRACT_LENGTH - 3] + "..."

        return f"Title: {clean_title}\nAbstract: {clean_abstract}\n---"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
+
class ResearchFetcher:
    """Fetches autism-related papers from arXiv and PubMed.

    Results are memoized per instance and per query string, and a shared
    requests.Session reuses HTTP connections across calls. Both public
    fetchers return [] on failure rather than raising, so a flaky source
    never crashes the app.
    """

    def __init__(self):
        self.session = requests.Session()  # Reuse connection
        # Per-instance memoization. NOTE: the previous @lru_cache decorators
        # on bound methods keyed the cache on `self` and kept every instance
        # alive for the cache's lifetime (flake8-bugbear B019).
        self._arxiv_cache: Dict[str, List[Paper]] = {}
        self._pubmed_cache: Dict[str, List[Paper]] = {}

    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
        """Fetch papers from arXiv filtered to autism/ASD mentions.

        Returns [] on any network/API error, consistent with
        fetch_pubmed_papers (previously an error here propagated).
        """
        if query in self._arxiv_cache:
            return self._arxiv_cache[query]

        papers: List[Paper] = []
        try:
            client = arxiv.Client()
            search_query = f"(ti:autism OR abs:autism) AND (ti:\"{query}\" OR abs:\"{query}\") AND cat:q-bio"

            search = arxiv.Search(
                query=search_query,
                max_results=MAX_PAPERS,
                sort_by=arxiv.SortCriterion.Relevance
            )

            for result in client.results(search):
                title_lower = result.title.lower()
                summary_lower = result.summary.lower()

                if any(term in title_lower or term in summary_lower
                       for term in ('autism', 'asd')):
                    papers.append(Paper(
                        title=result.title,
                        abstract=result.summary,
                        url=result.pdf_url,
                        published=result.published.strftime("%Y-%m-%d"),
                        relevance_score=1.0 if 'autism' in title_lower else 0.5
                    ))
        except Exception as e:
            logging.error(f"Error fetching arXiv papers: {str(e)}")
            return []

        self._arxiv_cache[query] = papers
        return papers

    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
        """Fetch papers from PubMed E-utilities with error handling."""
        if query in self._pubmed_cache:
            return self._pubmed_cache[query]

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"

        try:
            # Step 1: search for matching article IDs.
            response = self.session.get(
                f"{base_url}/esearch.fcgi",
                params={
                    'db': 'pubmed',
                    'term': search_term,
                    'retmax': MAX_PAPERS,
                    'sort': 'relevance',
                    'retmode': 'xml'
                },
                timeout=10
            )
            response.raise_for_status()

            root = ET.fromstring(response.content)
            id_list = root.findall('.//Id')

            if not id_list:
                self._pubmed_cache[query] = []
                return []

            # Step 2: fetch per-article details concurrently to hide latency.
            with ThreadPoolExecutor(max_workers=3) as executor:
                futures = [
                    executor.submit(self._fetch_paper_details, base_url, id_elem.text)
                    for id_elem in id_list
                ]
                results = [future.result() for future in futures]

            papers = [paper for paper in results if paper is not None]
            self._pubmed_cache[query] = papers
            return papers

        except Exception as e:
            logging.error(f"Error fetching PubMed papers: {str(e)}")
            return []

    def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
        """Fetch one PubMed article; None on error or when not autism-related."""
        try:
            response = self.session.get(
                f"{base_url}/efetch.fcgi",
                params={
                    'db': 'pubmed',
                    'id': paper_id,
                    'retmode': 'xml'
                },
                timeout=5
            )
            response.raise_for_status()

            article = ET.fromstring(response.content).find('.//PubmedArticle')
            if article is None:
                return None

            title = article.find('.//ArticleTitle')
            abstract = article.find('.//Abstract/AbstractText')
            year = article.find('.//PubDate/Year')

            if title is not None and abstract is not None:
                title_text = title.text.lower()
                abstract_text = abstract.text.lower()

                if any(term in title_text or term in abstract_text
                       for term in ('autism', 'asd')):
                    return Paper(
                        title=title.text,
                        abstract=abstract.text,
                        url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
                        published=year.text if year is not None else 'Unknown',
                        relevance_score=1.0 if any(term in title_text
                                                   for term in ('autism', 'asd')) else 0.5
                    )
            return None

        except Exception as e:
            logging.error(f"Error fetching paper {paper_id}: {str(e)}")
            return None
|
177 |
+
|
178 |
+
class ModelHandler:
    """Lazily loads the seq2seq model and turns question+context into an answer."""

    def __init__(self):
        # Both populated by load_model() on first use.
        self.model = None
        self.tokenizer = None

    def load_model(self):
        """Load tokenizer and model on first call; return True on success.

        NOTE: the previous @st.cache_resource decorator was removed — Streamlit
        cannot hash the bound method's `self` argument (it raises at runtime),
        and the `self.model is None` guard below already memoizes per instance.
        """
        if self.model is None:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
                self.model = T5ForConditionalGeneration.from_pretrained(
                    MODEL_PATH,
                    device_map={"": "cpu"},
                    torch_dtype=torch.float32,
                    low_cpu_mem_usage=True
                )
                return True
            except Exception as e:
                logging.error(f"Error loading model: {str(e)}")
                return False
        return True

    def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
        """Generate an answer for *question* grounded in *context*.

        Returns a user-facing error string (never raises) when the model
        cannot be loaded or generation fails.
        """
        if not self.load_model():
            return "Error: Model loading failed. Please try again later."

        try:
            input_text = self._create_enhanced_prompt(question, context)

            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
                padding=True
            )

            with torch.inference_mode():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    min_length=200,
                    num_beams=4,
                    length_penalty=1.5,
                    temperature=0.7,
                    repetition_penalty=1.3,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    do_sample=True,
                    top_k=40,
                    top_p=0.95
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # NOTE(review): clean_text collapses all whitespace, so the '\n\n'
            # section split in _format_response rarely fires after it — confirm
            # whether sectioning should happen before cleaning.
            response = TextProcessor.clean_text(response)

            # A very short generation is treated as a failure to answer.
            if len(response.strip()) < 100:
                return self._get_fallback_response()

            return self._format_response(response)

        except Exception as e:
            logging.error(f"Error generating response: {str(e)}")
            return "Error: Could not generate response. Please try again."

    @staticmethod
    def _create_enhanced_prompt(question: str, context: str) -> str:
        """Create an enhanced prompt for better response quality."""
        return f"""Context: {context}

Question: {question}

Instructions:
1. Provide a clear, evidence-based answer
2. Include specific findings from the research
3. Explain practical implications
4. Use accessible language
5. Address the question directly
6. Include relevant examples

Response should be:
- Accurate and scientific
- Easy to understand
- Practical and applicable
- Respectful of neurodiversity
- Supported by the provided research

Generate a comprehensive response:"""

    @staticmethod
    def _get_fallback_response() -> str:
        """Provide a structured fallback response when generation is too short."""
        return """Based on the available research, I cannot provide a specific answer to your question. However, I can suggest:

1. Try rephrasing your question to focus on specific aspects of autism
2. Consider asking about:
   - Specific behaviors or characteristics
   - Intervention strategies
   - Research findings
   - Support approaches

This will help me provide more accurate, research-based information."""

    @staticmethod
    def _format_response(response: str) -> str:
        """Add section headers: first paragraph = Overview, last = Key Takeaways."""
        sections = response.split('\n\n')
        formatted_sections = []

        for i, section in enumerate(sections):
            if i == 0:
                formatted_sections.append(f"### Overview\n{section}")
            elif i == len(sections) - 1:
                formatted_sections.append(f"### Key Takeaways\n{section}")
            else:
                formatted_sections.append(section)

        return '\n\n'.join(formatted_sections)
|
300 |
+
|
301 |
+
def main():
    """Streamlit entry point: take a question, gather papers, show an answer."""
    st.title("🧩 AMA Autism")
    st.write("""
    Ask questions about autism and get research-based answers from scientific papers.
    For best results, be specific in your questions.
    """)

    query = st.text_input("What would you like to know about autism? ✨")
    if not query:
        return

    with st.status("Researching your question..."):
        fetcher = ResearchFetcher()
        handler = ModelHandler()

        # Query both sources in parallel; concatenation order is
        # arXiv results first, then PubMed.
        with ThreadPoolExecutor(max_workers=2) as pool:
            arxiv_job = pool.submit(fetcher.fetch_arxiv_papers, query)
            pubmed_job = pool.submit(fetcher.fetch_pubmed_papers, query)
            papers = arxiv_job.result() + pubmed_job.result()

        if not papers:
            st.warning("No relevant research papers found. Please try a different search term.")
            return

        # Most relevant papers first.
        papers.sort(key=lambda paper: paper.relevance_score, reverse=True)

        # Build the model context from the top three papers only.
        entries = (
            TextProcessor.format_paper(paper.title, paper.abstract)
            for paper in papers[:3]
        )
        context = "\n".join(entries)

        st.write("Analyzing research papers...")
        answer = handler.generate_answer(query, context)

        with st.expander("📚 View source papers"):
            for paper in papers:
                st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")

        st.success("Research analysis complete!")
        st.markdown(answer)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|