Spaces:

wakeupmh
/

ama-autism

Sleeping

App Files Files Community

wakeupmh committed on Feb 16

Commit

3af593c

1 Parent(s): 208e459

refactor: structure

Browse files

Files changed (9) hide show

app.py +60 -324
models/__init__.py +1 -0
models/paper.py +10 -0
requirements.txt +2 -1
services/__init__.py +1 -0
services/model_handler.py +133 -0
services/research_fetcher.py +274 -0
utils/__init__.py +1 -0
utils/text_processor.py +26 -0

app.py CHANGED Viewed

@@ -1,341 +1,77 @@
 import streamlit as st
-import pandas as pd
-import torch
 import logging
-import os
-from transformers import AutoTokenizer, T5ForConditionalGeneration
-import arxiv
-import requests
-import xml.etree.ElementTree as ET
-import re
-from functools import lru_cache
-from typing import List, Dict, Optional
-from dataclasses import dataclass
-from concurrent.futures import ThreadPoolExecutor
 # Configure logging
-logging.basicConfig(level=logging.INFO)
-# Define data paths and constants
-DATA_DIR = "/data" if os.path.exists("/data") else "."
-DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
-DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
-MODEL_PATH = "google/flan-t5-small"
-# Constants for better maintainability
-MAX_ABSTRACT_LENGTH = 1000
-MAX_PAPERS = 5
-CACHE_SIZE = 128
-@dataclass
-class Paper:
-    title: str
-    abstract: str
-    url: str
-    published: str
-    relevance_score: float
-class TextProcessor:
-    @staticmethod
-    def clean_text(text: str) -> str:
-        """Clean and normalize text content with improved handling"""
-        if not text:
-            return ""
-        # Improved text cleaning
-        text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
-        text = re.sub(r'\s+', ' ', text)
-        text = text.encode('ascii', 'ignore').decode('ascii')  # Better character handling
-        return text.strip()
-    @staticmethod
-    def format_paper(title: str, abstract: str) -> str:
-        """Format paper information with improved structure"""
-        title = TextProcessor.clean_text(title)
-        abstract = TextProcessor.clean_text(abstract)
-        if len(abstract) > MAX_ABSTRACT_LENGTH:
-            abstract = abstract[:MAX_ABSTRACT_LENGTH-3] + "..."
-        return f"""Title: {title}\nAbstract: {abstract}\n---"""
-class ResearchFetcher:
     def __init__(self):
-        self.session = requests.Session()  # Reuse connection
-    @lru_cache(maxsize=CACHE_SIZE)
-    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
-        """Fetch papers from arXiv with improved filtering"""
-        client = arxiv.Client()
-        search_query = f"(ti:autism OR abs:autism) AND (ti:\"{query}\" OR abs:\"{query}\") AND cat:q-bio"
-        search = arxiv.Search(
-            query=search_query,
-            max_results=MAX_PAPERS,
-            sort_by=arxiv.SortCriterion.Relevance
-        )
-        papers = []
-        for result in client.results(search):
-            title_lower = result.title.lower()
-            summary_lower = result.summary.lower()
-            if any(term in title_lower or term in summary_lower
-                for term in ['autism', 'asd', 'autism spectrum disorder']):
-                papers.append(Paper(
-                    title=result.title,
-                    abstract=result.summary,
-                    url=result.pdf_url,
-                    published=result.published.strftime("%Y-%m-%d"),
-                    relevance_score=1.0 if 'autism' in title_lower else 0.8
-                ))
-        return papers
-    @lru_cache(maxsize=CACHE_SIZE)
-    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
-        """Fetch papers from PubMed with improved error handling"""
-        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-        search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"
-        try:
-            # Fetch IDs efficiently
-            response = self.session.get(
-                f"{base_url}/esearch.fcgi",
-                params={
-                    'db': 'pubmed',
-                    'term': search_term,
-                    'retmax': MAX_PAPERS,
-                    'sort': 'relevance',
-                    'retmode': 'xml'
-                },
-                timeout=10
-            )
-            response.raise_for_status()
-            root = ET.fromstring(response.content)
-            id_list = root.findall('.//Id')
-            if not id_list:
-                return []
-            # Fetch details in parallel
-            with ThreadPoolExecutor(max_workers=2) as executor:
-                paper_futures = [
-                    executor.submit(self._fetch_paper_details, base_url, id_elem.text)
-                    for id_elem in id_list
-                ]
-                return [paper for future in paper_futures
-                       for paper in [future.result()] if paper is not None]
-        except Exception as e:
-            logging.error(f"Error fetching PubMed papers: {str(e)}")
-            return []
-    def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
-        """Fetch individual paper details with timeout"""
-        try:
-            response = self.session.get(
-                f"{base_url}/efetch.fcgi",
-                params={
-                    'db': 'pubmed',
-                    'id': paper_id,
-                    'retmode': 'xml'
-                },
-                timeout=5
-            )
-            response.raise_for_status()
-            article = ET.fromstring(response.content).find('.//PubmedArticle')
-            if article is None:
-                return None
-            title = article.find('.//ArticleTitle')
-            abstract = article.find('.//Abstract/AbstractText')
-            year = article.find('.//PubDate/Year')
-            if title is not None and abstract is not None:
-                title_text = title.text.lower()
-                abstract_text = abstract.text.lower()
-                if any(term in title_text or term in abstract_text
-                      for term in ['autism', 'asd']):
-                    return Paper(
-                        title=title.text,
-                        abstract=abstract.text,
-                        url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
-                        published=year.text if year is not None else 'Unknown',
-                        relevance_score=1.0 if any(term in title_text
-                                                 for term in ['autism', 'asd']) else 0.5
-                    )
-        except Exception as e:
-            logging.error(f"Error fetching paper {paper_id}: {str(e)}")
             return None
-class ModelHandler:
-    def __init__(self):
-        self.model = None
-        self.tokenizer = None
-        self._initialize_model()
-    @staticmethod
-    @st.cache_resource
-    def _load_model():
-        """Load FLAN-T5 Small model with optimized settings"""
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-            model = T5ForConditionalGeneration.from_pretrained(
-                MODEL_PATH,
-                device_map={"": "cpu"},
-                torch_dtype=torch.float32,
-                low_cpu_mem_usage=True
-            )
-            return model, tokenizer
-        except Exception as e:
-            logging.error(f"Error loading model: {str(e)}")
-            return None, None
-    def _initialize_model(self):
-        """Initialize model and tokenizer"""
-        self.model, self.tokenizer = self._load_model()
-    def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
-        """Generate answer with FLAN-T5 optimized parameters"""
-        if self.model is None or self.tokenizer is None:
-            return "Error: Model loading failed. Please try again later."
-        try:
-            # FLAN-T5 responds better to direct instruction prompts
-            input_text = f"""You are an expert in autism research. Provide a clear, structured, and evidence-based explanation of autism using the provided research context.
-                Research Context:
-                {context}
-                Instructions:
-                1. Start with a concise definition of autism.
-                2. Explain the key characteristics and symptoms.
-                3. Discuss potential causes and contributing factors (e.g., genetic, environmental).
-                4. Mention current research findings and treatments.
-                5. Use clear, accessible language.
-                6. Cite specific studies or papers when relevant.
-                Answer:"""
-            inputs = self.tokenizer(
-                input_text,
-                return_tensors="pt",
-                max_length=1024,
-                truncation=True,
-                padding=True
-            )
-            with torch.inference_mode():
-                outputs = self.model.generate(
-                    **inputs,
-                    max_length=max_length,
-                    min_length=100,
-                    num_beams=3,
-                    length_penalty=1.0,
-                    temperature=0.6,
-                    repetition_penalty=1.2,
-                    early_stopping=True,
-                    no_repeat_ngram_size=2,
-                    do_sample=True,
-                    top_k=30,
-                    top_p=0.92
-                )
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            response = TextProcessor.clean_text(response)
-            if len(response.strip()) < 50:
-                return self._get_fallback_response()
-            return self._format_response(response)
-        except Exception as e:
-            logging.error(f"Error generating response: {str(e)}")
-            return "Error: Could not generate response. Please try again."
-    @staticmethod
-    def _get_fallback_response() -> str:
-        """Provide a structured fallback response"""
-        return """Based on the available research, I cannot provide a specific answer to your question. Please try:
-1. Rephrasing your question to be more specific
-2. Asking about:
-   - Specific behaviors or characteristics
-   - Intervention strategies
-   - Research findings
-   - Support approaches
-This will help me provide more accurate, research-based information."""
-    @staticmethod
-    def _format_response(response: str) -> str:
-        """Format the response for better readability"""
-        sections = response.split('\n\n')
-        formatted_sections = []
-        for i, section in enumerate(sections):
-            if i == 0:
-                formatted_sections.append(f"### Overview\n{section}")
-            elif i == len(sections) - 1:
-                formatted_sections.append(f"### Key Takeaways\n{section}")
-            else:
-                formatted_sections.append(section)
-        return '\n\n'.join(formatted_sections)
-def main():
-    st.title("🧩 AMA Autism")
-    st.write("""
-    Ask questions about autism and get research-based answers from scientific papers.
-    For best results, be specific in your questions.
-    """)
-    query = st.text_input("What would you like to know about autism? ✨")
-    if query:
-        with st.status("Researching your question...") as status:
-            # Initialize handlers
-            research_fetcher = ResearchFetcher()
-            model_handler = ModelHandler()
-            # Fetch papers concurrently
-            with ThreadPoolExecutor(max_workers=2) as executor:
-                arxiv_future = executor.submit(research_fetcher.fetch_arxiv_papers, query)
-                pubmed_future = executor.submit(research_fetcher.fetch_pubmed_papers, query)
-                papers = arxiv_future.result() + pubmed_future.result()
-            if not papers:
-                st.warning("No relevant research papers found. Please try a different search term.")
-                return
-            # Sort papers by relevance
-            papers.sort(key=lambda x: x.relevance_score, reverse=True)
-            # Prepare context from top papers
-            context = "\n".join(
-                TextProcessor.format_paper(paper.title, paper.abstract)
-                for paper in papers[:3]
-            )
-            # Generate answer
-            st.write("Analyzing research papers...")
-            answer = model_handler.generate_answer(query, context)
-            status.write("I've got it!")
         with st.expander("📚 View source papers"):
             for paper in papers:
                 st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")
-        st.success("Research analysis complete!")
-        st.markdown(answer)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import logging
+from services.research_fetcher import ResearchFetcher
+from services.model_handler import ModelHandler
+from utils.text_processor import TextProcessor
 # Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+class AutismResearchApp:
+    """Streamlit front end: fetch papers for a question and show a generated answer."""
     def __init__(self):
+        """Initialize the application components"""
+        self.research_fetcher = ResearchFetcher()
+        self.model_handler = ModelHandler()
+        self.text_processor = TextProcessor()
+        self._setup_streamlit()
+    def _setup_streamlit(self):
+        """Setup Streamlit UI components"""
+        st.title("🧩 AMA Autism")
+        st.write("""
+        Ask questions about autism and get research-based answers from scientific papers.
+        For best results, be specific in your questions.
+        """)
+    def _fetch_research(self, query: str):
+        """Fetch research papers for the given query.
+
+        Returns the list of papers, or None (after showing a warning) when
+        no source returned anything.
+        """
+        papers = self.research_fetcher.fetch_all_papers(query)
+        if not papers:
+            st.warning("No relevant research papers found. Please try a different search term.")
             return None
+        return papers
+    def _generate_answer(self, query: str, papers):
+        """Generate an answer from the three highest-ranked papers."""
+        context = "\n".join(
+            self.text_processor.format_paper(paper.title, paper.abstract)
+            for paper in papers[:3]
+        )
+        return self.model_handler.generate_answer(query, context)
+    def _display_sources(self, papers):
+        """Display source papers in an expander"""
         with st.expander("📚 View source papers"):
             for paper in papers:
                 st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")
+    def run(self):
+        """Run one pass of the UI: read the question, research it, show the answer."""
+        query = st.text_input("What would you like to know about autism? ✨")
+        if not query:
+            return
+        with st.status("Researching your question...") as status:
+            # Fetch papers
+            papers = self._fetch_research(query)
+            if not papers:
+                # The status box is collapsed by default; expand it so the
+                # warning written inside it is actually visible.
+                status.update(state="error", expanded=True)
+                return
+            # Generate answer
+            st.write("Analyzing research papers...")
+            answer = self._generate_answer(query, papers)
+            status.write("I've got it!")
+        # Display results outside the status container, as before the
+        # refactor: inside it the answer would be hidden in the collapsed box
+        # and the sources expander would be nested in another expandable.
+        self._display_sources(papers)
+        st.success("Research analysis complete!")
+        st.markdown(answer)
+def main():
+    # Script entry point: construct the app object, then run it once.
+    app = AutismResearchApp()
+    app.run()
 if __name__ == "__main__":
     main()

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

models/paper.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from dataclasses import dataclass
+@dataclass
+class Paper:
+    """A single research paper as returned by one of the fetchers."""
+    # Title as reported by the source
+    title: str
+    # Abstract / summary text
+    abstract: str
+    # Link to the paper (PDF or landing page)
+    url: str
+    # Publication date or year as a string, or 'Unknown'
+    published: str
+    # Ranking weight; fetch_all_papers sorts on it, highest first
+    relevance_score: float
+    source: str = "unknown"  # Track where the paper came from

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ accelerate>=0.26.0
 numpy>=1.24.0
 pandas>=2.2.0
 requests>=2.31.0
-arxiv>=2.1.0

 numpy>=1.24.0
 pandas>=2.2.0
 requests>=2.31.0
+arxiv>=2.1.0
+scholarly==1.7.11

services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

services/model_handler.py ADDED Viewed

	@@ -0,0 +1,133 @@

+import torch
+import logging
+from transformers import AutoTokenizer, T5ForConditionalGeneration
+import streamlit as st
+from utils.text_processor import TextProcessor
+MODEL_PATH = "google/flan-t5-small"
+class ModelHandler:
+    """Own the FLAN-T5 model/tokenizer pair and turn research context into answers."""
+    def __init__(self):
+        """Load the (cached) model; on failure both attributes stay None."""
+        self.model = None
+        self.tokenizer = None
+        self._initialize_model()
+    @staticmethod
+    @st.cache_resource
+    def _load_model():
+        """Load FLAN-T5 Small model and tokenizer on CPU.
+
+        Errors are deliberately not caught here. st.cache_resource stores
+        whatever this function returns, so returning (None, None) after a
+        transient failure would pin the app in a broken state until restart.
+        Raising leaves nothing cached, and the next run tries again.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+        model = T5ForConditionalGeneration.from_pretrained(
+            MODEL_PATH,
+            device_map={"": "cpu"},
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True
+        )
+        return model, tokenizer
+    def _initialize_model(self):
+        """Initialize model and tokenizer, falling back to None on failure"""
+        try:
+            self.model, self.tokenizer = self._load_model()
+        except Exception as e:
+            # Top-level boundary: generate_answer reports the failure to the user.
+            logging.error(f"Error loading model: {str(e)}")
+            self.model, self.tokenizer = None, None
+    def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
+        """Generate natural, human-readable answers using research context.
+
+        Args:
+            question: The user's question, inserted into the prompt.
+            context: Formatted paper titles/abstracts, inserted into the prompt.
+            max_length: Upper bound passed to ``model.generate``.
+
+        Returns:
+            The formatted answer, the fallback text when the output is shorter
+            than 50 characters, or an "Error: ..." message on failure.
+        """
+        if self.model is None or self.tokenizer is None:
+            return "Error: Model loading failed. Please try again later."
+        try:
+            input_text = f"""You are an expert explaining autism research to a general audience. Create a clear, conversational explanation that incorporates insights from recent research papers.
+Question: {question}
+Available Research:
+{context}
+Instructions:
+1. Write in a clear, conversational style
+2. Start with a brief, general explanation
+3. Support your points with research, using phrases like "According to [Paper Title]..." or "Research has shown..."
+4. Focus on making complex concepts understandable
+5. Maintain a helpful and informative tone
+Remember to write like you're explaining to someone interested in learning about autism, not like you're writing a technical paper."""
+            # Prompts longer than 1024 tokens are truncated from the end.
+            inputs = self.tokenizer(
+                input_text,
+                return_tensors="pt",
+                max_length=1024,
+                truncation=True,
+                padding=True
+            )
+            with torch.inference_mode():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_length=max_length,
+                    min_length=150,
+                    num_beams=4,
+                    length_penalty=1.0,
+                    temperature=0.8,
+                    repetition_penalty=1.3,
+                    early_stopping=True,
+                    no_repeat_ngram_size=3,
+                    do_sample=True,
+                    top_k=40,
+                    top_p=0.95
+                )
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = TextProcessor.clean_text(response)
+            if len(response.strip()) < 50:
+                return self._get_fallback_response()
+            return self._format_response(response)
+        except Exception as e:
+            logging.error(f"Error generating response: {str(e)}")
+            return "Error: Could not generate response. Please try again."
+    @staticmethod
+    def _get_fallback_response() -> str:
+        """Provide a friendly, helpful fallback response"""
+        return """I apologize, but I couldn't find enough specific research to properly answer your question. To help you get better information, you could:
+• Ask about specific aspects of autism you're interested in
+• Focus on particular topics like:
+  - Early signs and diagnosis
+  - Treatment approaches
+  - Latest research findings
+  - Support strategies
+This will help me provide more detailed, research-backed information that's relevant to your interests."""
+    @staticmethod
+    def _format_response(response: str) -> str:
+        """Format the response to be more readable and engaging.
+
+        Inline numbered items (" 1. ", " 2. ", ...) are moved onto their own
+        lines and shown as bullets; other paragraphs that mention
+        "According to" or "Research" are italicised.
+        """
+        import re  # local import: this module's header does not import re
+        # A list marker is a single digit followed by "." and whitespace (or
+        # the end of the text). Requiring that trailing whitespace keeps
+        # decimals such as "1.5" or "2.3" intact.
+        response = re.sub(r' 1\.(?=\s|$)', '\n\n1.', response)
+        response = re.sub(r' ([2-9])\.(?=\s|$)', r'\n\1.', response)
+        formatted_paragraphs = []
+        for paragraph in response.split('\n\n'):
+            if re.match(r'\s*[1-9]\.(\s|$)', paragraph):
+                # Bullet only the markers that start a line, never digits
+                # inside the sentence text. Lists are not italicised: emphasis
+                # spanning several bullet lines does not render.
+                paragraph = re.sub(r'(?m)^[ \t]*[1-9]\.(?=\s|$)', '•', paragraph)
+            elif "According to" in paragraph or "Research" in paragraph:
+                # Format citations to stand out
+                paragraph = f"*{paragraph}*"
+            formatted_paragraphs.append(paragraph)
+        return '\n\n'.join(formatted_paragraphs)

services/research_fetcher.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import time
+import logging
+import random
+import arxiv
+import requests
+import xml.etree.ElementTree as ET
+from typing import List, Optional
+from functools import lru_cache
+from scholarly import scholarly
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from models.paper import Paper
+from utils.text_processor import TextProcessor
+# Constants
+CACHE_SIZE = 128
+MAX_PAPERS = 5
+SCHOLAR_MAX_PAPERS = 3
+MAX_WORKERS = 3  # One thread per data source
+class ResearchFetcher:
+    """Fetch autism-related papers from arXiv, PubMed and Google Scholar."""
+    # Successful results, keyed by (source, query) and shared by all instances.
+    # An lru_cache on the bound methods would key on self: the app builds a new
+    # fetcher on every rerun, so it would never hit and would keep every
+    # instance (and its thread pool) alive.
+    _cache = {}
+    def __init__(self):
+        # Create the executor first so __del__ always finds it, even if a
+        # later setup step raises.
+        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
+        self.session = requests.Session()
+        self._last_request_time = 0
+        self._min_request_interval = 0.34
+        self._max_retries = 3
+        self._setup_scholarly()
+    def __del__(self):
+        """Cleanup executor on deletion"""
+        executor = getattr(self, 'executor', None)
+        if executor is not None:
+            executor.shutdown(wait=False)
+    def _setup_scholarly(self):
+        """Configure scholarly with rotating user agents"""
+        self.user_agents = [
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
+        ]
+        # NOTE(review): confirm the pinned scholarly release accepts None here.
+        scholarly.use_proxy(None)
+    def _rotate_user_agent(self):
+        """Rotate user agent for Google Scholar requests"""
+        return random.choice(self.user_agents)
+    def _wait_for_rate_limit(self):
+        """Sleep as needed so requests are at least _min_request_interval apart"""
+        current_time = time.time()
+        time_since_last = current_time - self._last_request_time
+        if time_since_last < self._min_request_interval:
+            time.sleep(self._min_request_interval - time_since_last)
+        self._last_request_time = time.time()
+    def _cached_fetch(self, source: str, query: str, fetch) -> List[Paper]:
+        """Return cached papers for (source, query), calling fetch(query) on a miss.
+
+        Empty results are not cached, so a transient failure (the fetchers
+        return [] on error) is retried on the next request.
+        """
+        key = (source, query)
+        cached = self._cache.get(key)
+        if cached is not None:
+            return list(cached)
+        papers = fetch(query)
+        if papers:
+            if len(self._cache) >= CACHE_SIZE:
+                # dicts keep insertion order: drop the oldest entry
+                self._cache.pop(next(iter(self._cache)), None)
+            self._cache[key] = tuple(papers)
+        return papers
+    def _make_request_with_retry(self, url: str, params: dict, timeout: int = 10) -> Optional[requests.Response]:
+        """Make a GET request with rate limiting and up to _max_retries attempts.
+
+        Returns the successful response, or None once every attempt failed.
+        """
+        for attempt in range(self._max_retries):
+            try:
+                self._wait_for_rate_limit()
+                response = self.session.get(url, params=params, timeout=timeout)
+                response.raise_for_status()
+                return response
+            except requests.exceptions.RequestException as e:
+                if attempt == self._max_retries - 1:
+                    # Log every exhausted retry, including a final 429.
+                    logging.error(f"Error after {self._max_retries} retries: {str(e)}")
+                    return None
+                # e.response is None for connection errors and timeouts
+                if getattr(e.response, 'status_code', None) == 429:
+                    wait_time = (attempt + 1) * self._min_request_interval * 2
+                    logging.warning(f"Rate limit hit, waiting {wait_time} seconds...")
+                    time.sleep(wait_time)
+        return None
+    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
+        """Fetch papers from arXiv with improved filtering (cached per query)"""
+        return self._cached_fetch('arxiv', query, self._search_arxiv)
+    def _search_arxiv(self, query: str) -> List[Paper]:
+        """Query arXiv; returns [] on any error."""
+        try:
+            client = arxiv.Client()
+            # The query is embedded in a quoted phrase; a stray double quote
+            # would break the search expression.
+            phrase = query.replace('"', ' ').strip()
+            search_query = f"(ti:autism OR abs:autism) AND (ti:\"{phrase}\" OR abs:\"{phrase}\") AND cat:q-bio"
+            search = arxiv.Search(
+                query=search_query,
+                max_results=MAX_PAPERS,
+                sort_by=arxiv.SortCriterion.Relevance
+            )
+            papers = []
+            for result in client.results(search):
+                title_lower = result.title.lower()
+                summary_lower = result.summary.lower()
+                if any(term in title_lower or term in summary_lower
+                    for term in ['autism', 'asd', 'autism spectrum disorder']):
+                    papers.append(Paper(
+                        title=result.title,
+                        abstract=result.summary,
+                        url=result.pdf_url,
+                        published=result.published.strftime("%Y-%m-%d"),
+                        relevance_score=1.0 if 'autism' in title_lower else 0.8,
+                        source='arxiv'
+                    ))
+            return papers
+        except Exception as e:
+            logging.error(f"Error fetching arXiv papers: {str(e)}")
+            return []
+    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
+        """Fetch papers from PubMed with rate limiting (cached per query)"""
+        return self._cached_fetch('pubmed', query, self._search_pubmed)
+    def _search_pubmed(self, query: str) -> List[Paper]:
+        """Look up matching PubMed ids, then fetch each article; [] on error."""
+        try:
+            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
+            search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"
+            response = self._make_request_with_retry(
+                f"{base_url}/esearch.fcgi",
+                params={
+                    'db': 'pubmed',
+                    'term': search_term,
+                    'retmax': MAX_PAPERS,
+                    'sort': 'relevance',
+                    'retmode': 'xml'
+                }
+            )
+            if not response:
+                return []
+            root = ET.fromstring(response.content)
+            papers = []
+            for id_elem in root.findall('.//Id'):
+                paper = self._fetch_paper_details(base_url, id_elem.text)
+                if paper:
+                    papers.append(paper)
+            return papers
+        except Exception as e:
+            logging.error(f"Error fetching PubMed papers: {str(e)}")
+            return []
+    def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
+        """Fetch one PubMed article.
+
+        Returns None when the request fails, the record has no title or
+        abstract, or neither mentions autism/ASD.
+        """
+        try:
+            response = self._make_request_with_retry(
+                f"{base_url}/efetch.fcgi",
+                params={
+                    'db': 'pubmed',
+                    'id': paper_id,
+                    'retmode': 'xml'
+                }
+            )
+            if not response:
+                return None
+            article = ET.fromstring(response.content).find('.//PubmedArticle')
+            if article is None:
+                return None
+            title_elem = article.find('.//ArticleTitle')
+            abstract_elems = article.findall('.//Abstract/AbstractText')
+            year = article.find('.//PubDate/Year')
+            if title_elem is None or not abstract_elems:
+                return None
+            # itertext() also collects text nested in inline markup (<i>,
+            # <sub>, ...), where .text alone is None or truncated. Structured
+            # abstracts have several AbstractText sections; keep them all.
+            title = ''.join(title_elem.itertext()).strip()
+            abstract = ' '.join(
+                ''.join(section.itertext()).strip() for section in abstract_elems
+            ).strip()
+            if not title or not abstract:
+                return None
+            title_text = title.lower()
+            abstract_text = abstract.lower()
+            if not any(term in title_text or term in abstract_text
+                       for term in ['autism', 'asd']):
+                return None
+            return Paper(
+                title=title,
+                abstract=abstract,
+                url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
+                published=year.text if year is not None and year.text else 'Unknown',
+                relevance_score=1.0 if any(term in title_text
+                                         for term in ['autism', 'asd']) else 0.5,
+                source='pubmed'
+            )
+        except Exception as e:
+            logging.error(f"Error fetching paper {paper_id}: {str(e)}")
+            return None
+    def fetch_scholar_papers(self, query: str) -> List[Paper]:
+        """Fetch papers from Google Scholar with rate limiting (cached per query)"""
+        return self._cached_fetch('scholar', query, self._search_scholar)
+    def _search_scholar(self, query: str) -> List[Paper]:
+        """Search Google Scholar via scholarly; returns what was collected before any error."""
+        papers = []
+        try:
+            search_query = query if 'autism' in query.lower() else f"autism {query}"
+            # NOTE(review): confirm the pinned scholarly release exposes
+            # set_headers; if not, the AttributeError is swallowed by the
+            # handler below and this source always yields no papers.
+            scholarly.set_headers({'User-Agent': self._rotate_user_agent()})
+            search_results = scholarly.search_pubs(search_query)
+            for result in search_results:
+                if len(papers) >= SCHOLAR_MAX_PAPERS:
+                    break
+                try:
+                    pub = result['bib']
+                    title = pub.get('title', '')
+                    abstract = pub.get('abstract', '')
+                    if not any(term in f"{title} {abstract}".lower() for term in ['autism', 'asd']):
+                        continue
+                    # scholarly 1.x appears to keep links on the result itself
+                    # (pub_url / eprint_url) and the year under bib['pub_year']
+                    # — TODO confirm; the older key names are kept as fallbacks.
+                    eprint_url = result.get('eprint_url') or result.get('eprint') or ''
+                    if not abstract and eprint_url:
+                        abstract = "Abstract not available. Please refer to the full paper."
+                    url = pub.get('url') or result.get('pub_url') or eprint_url
+                    published = pub.get('pub_year') or pub.get('year') or 'Unknown'
+                    papers.append(Paper(
+                        title=title or 'Untitled',
+                        abstract=abstract[:1000] + '...' if len(abstract) > 1000 else abstract,
+                        url=url,
+                        published=str(published),
+                        relevance_score=1.0 if 'autism' in title.lower() else 0.5,
+                        source='scholar'
+                    ))
+                    # Pause between results to avoid hammering Google Scholar
+                    time.sleep(random.uniform(1.0, 2.0))
+                except Exception as e:
+                    logging.error(f"Error processing Scholar result: {str(e)}")
+                    continue
+        except Exception as e:
+            logging.error(f"Error fetching Google Scholar papers: {str(e)}")
+        return papers
+    def fetch_all_papers(self, query: str) -> List[Paper]:
+        """Fetch papers from all sources concurrently and combine results.
+
+        Returns at most MAX_PAPERS papers, highest relevance first, with
+        duplicate titles (case-insensitive) removed.
+        """
+        all_papers = []
+        try:
+            # Submit one task per source to the thread pool
+            futures = [
+                self.executor.submit(fetch, query)
+                for fetch in (self.fetch_arxiv_papers,
+                              self.fetch_pubmed_papers,
+                              self.fetch_scholar_papers)
+            ]
+            # Collect results as they complete
+            for future in as_completed(futures):
+                try:
+                    all_papers.extend(future.result())
+                except Exception as e:
+                    logging.error(f"Error collecting papers from source: {str(e)}")
+        except Exception as e:
+            logging.error(f"Error in concurrent paper fetching: {str(e)}")
+        # Sort and deduplicate papers
+        seen_titles = set()
+        unique_papers = []
+        for paper in sorted(all_papers, key=lambda x: x.relevance_score, reverse=True):
+            title_key = paper.title.lower()
+            if title_key not in seen_titles:
+                seen_titles.add(title_key)
+                unique_papers.append(paper)
+        return unique_papers[:MAX_PAPERS]

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

utils/text_processor.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import re
+class TextProcessor:
+    """Stateless helpers that normalise paper text before it is put in a prompt."""
+    @staticmethod
+    def clean_text(text: str) -> str:
+        """Return text as plain ASCII with punctuation and whitespace normalised.
+
+        Characters other than word characters, whitespace and .,;:()-'" become
+        a space, non-ASCII characters are dropped, and whitespace runs collapse
+        to a single space. Falsy input yields "".
+        """
+        if not text:
+            return ""
+        text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
+        # First pass turns Unicode spaces (e.g. NBSP) into plain spaces so the
+        # ASCII step below does not glue the neighbouring words together.
+        text = re.sub(r'\s+', ' ', text)
+        text = text.encode('ascii', 'ignore').decode('ascii')
+        # Dropping non-ASCII characters can leave adjacent spaces behind
+        # ("a é b" -> "a  b"), so collapse once more.
+        text = re.sub(r'\s+', ' ', text)
+        return text.strip()
+    @staticmethod
+    def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
+        """Format a paper as a "Title/Abstract" block terminated by "---".
+
+        Abstracts longer than max_length are cut and end in "...".
+        """
+        title = TextProcessor.clean_text(title)
+        abstract = TextProcessor.clean_text(abstract)
+        if len(abstract) > max_length:
+            # Leave room for the ellipsis; clamp so a max_length below 3
+            # cannot turn into a negative slice index.
+            abstract = abstract[:max(max_length - 3, 0)] + "..."
+        return f"""Title: {title}\nAbstract: {abstract}\n---"""