Spaces:

wakeupmh
/

ama-autism

Sleeping

App Files Files Community

wakeupmh committed on Feb 16

Commit

ee1b548

1 Parent(s): 3af593c

fix: streamlit and model

Browse files

Files changed (7) hide show

.gitignore +1 -0
app.py +69 -28
models/paper.py +4 -2
requirements.txt +7 -5
services/model_handler.py +203 -50
services/research_fetcher.py +131 -131
utils/text_processor.py +51 -16

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 from services.research_fetcher import ResearchFetcher
 from services.model_handler import ModelHandler
 from utils.text_processor import TextProcessor
 # Configure logging
 logging.basicConfig(
@@ -16,14 +17,13 @@ class AutismResearchApp:
         self.research_fetcher = ResearchFetcher()
         self.model_handler = ModelHandler()
         self.text_processor = TextProcessor()
-        self._setup_streamlit()
     def _setup_streamlit(self):
         """Setup Streamlit UI components"""
         st.title("🧩 AMA Autism")
-        st.write("""
-        Ask questions about autism and get research-based answers from scientific papers.
-        For best results, be specific in your questions.
         """)
     def _fetch_research(self, query: str):
@@ -34,40 +34,81 @@ class AutismResearchApp:
             return None
         return papers
-    def _generate_answer(self, query: str, papers):
-        """Generate answer based on research papers"""
-        context = "\n".join(
-            self.text_processor.format_paper(paper.title, paper.abstract)
-            for paper in papers[:3]
-        )
-        return self.model_handler.generate_answer(query, context)
-    def _display_sources(self, papers):
-        """Display source papers in an expander"""
-        with st.expander("📚 View source papers"):
-            for paper in papers:
-                st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")
     def run(self):
         """Run the main application loop"""
-        query = st.text_input("What would you like to know about autism? ✨")
         if query:
-            with st.status("Researching your question...") as status:
                 # Fetch papers
-                papers = self._fetch_research(query)
                 if not papers:
                     return
-                # Generate and display answer
-                st.write("Analyzing research papers...")
-                answer = self._generate_answer(query, papers)
-                status.write("I've got it!")
-                # Display results
-                self._display_sources(papers)
-                st.success("Research analysis complete!")
-                st.markdown(answer)
 def main():
     app = AutismResearchApp()

 from services.research_fetcher import ResearchFetcher
 from services.model_handler import ModelHandler
 from utils.text_processor import TextProcessor
+from typing import List
 # Configure logging
 logging.basicConfig(
         self.research_fetcher = ResearchFetcher()
         self.model_handler = ModelHandler()
         self.text_processor = TextProcessor()
     def _setup_streamlit(self):
         """Setup Streamlit UI components"""
         st.title("🧩 AMA Autism")
+        st.subheader("Your one-stop shop for autism research!")
+        st.markdown("""
+        Ask questions about autism research, and I'll analyze recent papers to provide evidence-based answers.
         """)
     def _fetch_research(self, query: str):
             return None
         return papers
+    def _display_sources(self, papers: List):
+        """Display the source papers used to generate the answer"""
+        st.markdown("### Sources")
+        for i, paper in enumerate(papers, 1):
+            st.markdown(f"**{i}. [{paper.title}]({paper.url})**")
+            # Create three columns for metadata
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                if paper.authors:
+                    st.markdown(f"👥 Authors: {paper.authors}")
+            with col2:
+                st.markdown(f"📅 Published: {paper.publication_date}")
+            with col3:
+                st.markdown(f"📜 Source: {paper.source}")
+            # Show abstract in expander
+            with st.expander("📝 View Abstract"):
+                st.markdown(paper.abstract)
+            if i < len(papers):  # Add separator between papers except for the last one
+                st.divider()
     def run(self):
         """Draw the page and, if a question was entered, answer it.

         Steps: render the static UI, read the query, then (inside a status
         widget) fetch papers, build the context, generate an answer and
         validate it. The verdict banner, answer, validation note and source
         list are rendered below the status widget.
         """
         self._setup_streamlit()

         # Initialize session state for papers
         if 'papers' not in st.session_state:
             st.session_state.papers = []

         query = st.text_input("What would you like to know about autism?")
         if not query:
             # Nothing to process until the user submits a question.
             return

         with st.status("Processing your question...") as progress:
             progress.write("🔍 Searching for relevant research papers...")
             try:
                 papers = self.research_fetcher.fetch_all_papers(query)
             except Exception as exc:
                 st.error(f"Error fetching research papers: {str(exc)}")
                 return

             if not papers:
                 st.warning("No relevant papers found. Please try a different query.")
                 return

             progress.write("📚 Analyzing research papers...")
             research_context = self.text_processor.create_context(papers)

             progress.write("✍️ Generating answer...")
             answer = self.model_handler.generate_answer(query, research_context)

             progress.write("✅ Validating answer...")
             validation = self.model_handler.validate_answer(answer, research_context)
             is_valid, validation_message = validation

             progress.write("✨ All done! Displaying results...")

         # Results are rendered outside the status widget.
         if not is_valid:
             st.warning("⚠️ The answer may contain information not fully supported by the research.")
         else:
             st.success("✅ Research analysis complete! The answer has been validated for accuracy.")

         st.markdown("### Answer")
         st.markdown(answer)

         st.markdown("### Validation")
         st.info(f"🔍 {validation_message}")

         st.divider()
         self._display_sources(papers)
 def main():
     app = AutismResearchApp()

models/paper.py CHANGED Viewed

@@ -1,10 +1,12 @@
 from dataclasses import dataclass
 @dataclass
 class Paper:
     title: str
     abstract: str
     url: str
-    published: str
     relevance_score: float
-    source: str = "unknown"  # Track where the paper came from

 from dataclasses import dataclass
+from typing import Optional
 @dataclass
 class Paper:
     """One research paper as returned by a literature source.

     ``authors`` is the only optional field and defaults to ``None``; every
     other field must be supplied by the code that builds the record.
     """

     # Bibliographic core
     title: str
     abstract: str
     url: str
     # Publication date kept as text, not as a date object.
     publication_date: str
     # Numeric relevance weight assigned by the fetcher that built the record.
     relevance_score: float
     # Label of the source the paper came from.
     source: str
     # Author names as a single string, when known.
     authors: Optional[str] = None

requirements.txt CHANGED Viewed

@@ -1,11 +1,13 @@
-streamlit>=1.32.0
 transformers==4.36.2
 datasets>=2.17.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch>=2.2.0
 accelerate>=0.26.0
 numpy>=1.24.0
 pandas>=2.2.0
-requests>=2.31.0
-arxiv>=2.1.0
-scholarly==1.7.11

 transformers==4.36.2
+torch==2.1.2
+streamlit==1.29.0
 datasets>=2.17.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate>=0.26.0
 numpy>=1.24.0
 pandas>=2.2.0
+requests==2.31.0
+arxiv==2.0.0
+scholarly==1.7.11
+python-dotenv==1.0.0
+beautifulsoup4==4.12.2

services/model_handler.py CHANGED Viewed

@@ -1,97 +1,250 @@
-import torch
 import logging
 from transformers import AutoTokenizer, T5ForConditionalGeneration
 import streamlit as st
 from utils.text_processor import TextProcessor
 MODEL_PATH = "google/flan-t5-small"
 class ModelHandler:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self._initialize_model()
     @staticmethod
     @st.cache_resource
     def _load_model():
-        """Load FLAN-T5 Small model with optimized settings"""
         try:
             tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-            model = T5ForConditionalGeneration.from_pretrained(
-                MODEL_PATH,
-                device_map={"": "cpu"},
-                torch_dtype=torch.float32,
-                low_cpu_mem_usage=True
-            )
             return model, tokenizer
         except Exception as e:
             logging.error(f"Error loading model: {str(e)}")
             return None, None
-    def _initialize_model(self):
-        """Initialize model and tokenizer"""
-        self.model, self.tokenizer = self._load_model()
-    def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
-        """Generate natural, human-readable answers using research context"""
-        if self.model is None or self.tokenizer is None:
-            return "Error: Model loading failed. Please try again later."
-        try:
-            input_text = f"""You are an expert explaining autism research to a general audience. Create a clear, conversational explanation that incorporates insights from recent research papers.
-Question: {question}
-Available Research:
 {context}
-Instructions:
-1. Write in a clear, conversational style
-2. Start with a brief, general explanation
-3. Support your points with research, using phrases like "According to [Paper Title]..." or "Research has shown..."
-4. Focus on making complex concepts understandable
-5. Maintain a helpful and informative tone
-Remember to write like you're explaining to someone interested in learning about autism, not like you're writing a technical paper."""
             inputs = self.tokenizer(
-                input_text,
                 return_tensors="pt",
-                max_length=1024,
                 truncation=True,
                 padding=True
             )
-            with torch.inference_mode():
                 outputs = self.model.generate(
-                    **inputs,
                     max_length=max_length,
-                    min_length=150,
-                    num_beams=4,
-                    length_penalty=1.0,
-                    temperature=0.8,
-                    repetition_penalty=1.3,
-                    early_stopping=True,
-                    no_repeat_ngram_size=3,
                     do_sample=True,
-                    top_k=40,
-                    top_p=0.95
                 )
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            response = TextProcessor.clean_text(response)
-            if len(response.strip()) < 50:
-                return self._get_fallback_response()
-            return self._format_response(response)
         except Exception as e:
-            logging.error(f"Error generating response: {str(e)}")
-            return "Error: Could not generate response. Please try again."
-    @staticmethod
     def _get_fallback_response() -> str:
         """Provide a friendly, helpful fallback response"""
         return """I apologize, but I couldn't find enough specific research to properly answer your question. To help you get better information, you could:

 import logging
+import torch
 from transformers import AutoTokenizer, T5ForConditionalGeneration
 import streamlit as st
 from utils.text_processor import TextProcessor
+from typing import List
 MODEL_PATH = "google/flan-t5-small"
 class ModelHandler:
     def __init__(self):
         """Create the handler and load the model/tokenizer pair right away."""
         # Start from an explicit "nothing loaded" state; the initializer
         # below assigns both attributes.
         self.tokenizer = None
         self.model = None
         self._initialize_model()
+    def _initialize_model(self):
+        """Initialize model and tokenizer"""
+        self.model, self.tokenizer = self._load_model()
     @staticmethod
     @st.cache_resource
     def _load_model_cached():
         """Load the T5 model and tokenizer, raising on failure.

         Errors are deliberately NOT caught here: the resource cache stores
         return values, so returning a ``(None, None)`` failure marker from
         the cached function would pin that failure for later calls.
         """
         tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
         model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
         return model, tokenizer

     @staticmethod
     def _load_model():
         """Return the cached ``(model, tokenizer)`` pair for ``MODEL_PATH``.

         Returns:
             ``(model, tokenizer)`` on success, or ``(None, None)`` when
             loading fails. A failure is logged and is not cached, so the
             next call tries to load again.
         """
         try:
             return ModelHandler._load_model_cached()
         except Exception as e:
             logging.error(f"Error loading model: {str(e)}")
             return None, None
+    def generate_answer(self, query: str, context: str) -> str:
+        """
+        Generate an answer based on the research papers context
+        """
+        base_knowledge = """
+Autism, or Autism Spectrum Disorder (ASD), is a complex neurodevelopmental condition that affects how a person perceives and interacts with the world. Key aspects include:
+1. Social communication and interaction
+2. Repetitive behaviors and specific interests
+3. Sensory sensitivities
+4. Varying levels of support needs
+5. Early developmental differences
+6. Unique strengths and challenges
+The condition exists on a spectrum, meaning each person's experience is unique. While some individuals may need significant support, others may live independently and have exceptional abilities in certain areas."""
+        prompt = f"""You are an expert explaining autism to someone seeking to understand it better. Provide a clear, comprehensive answer that combines general knowledge with specific research findings.
+QUESTION:
+{query}
+GENERAL KNOWLEDGE:
+{base_knowledge}
+RECENT RESEARCH FINDINGS:
 {context}
+Instructions for your response:
+1. Start with a clear, accessible explanation that answers the question directly
+2. Use everyday language while maintaining accuracy
+3. Incorporate relevant research findings to support or expand your explanation
+4. When citing research, use "According to recent research..." or "A study found..."
+5. Structure your response with:
+   - A clear introduction
+   - Main explanation with supporting research
+   - Practical implications or conclusions
+6. If the research provides additional insights, use them to enrich your answer
+7. Acknowledge if certain aspects aren't covered by the available research
+FORMAT:
+- Use clear paragraphs
+- Explain technical terms
+- Be conversational but informative
+- Include specific examples when helpful
+Please provide your comprehensive answer:"""
+        try:
+            response = self.generate(
+                prompt,
+                max_length=1000,
+                temperature=0.7,
+            )[0]
+            # Clean up the response
+            response = response.replace("Answer:", "").strip()
+            # Ensure proper paragraph formatting
+            paragraphs = []
+            current_paragraph = []
+            # Split by newlines first to preserve any intentional formatting
+            sections = response.split('\n')
+            for section in sections:
+                if not section.strip():
+                    if current_paragraph:
+                        paragraphs.append(' '.join(current_paragraph))
+                        current_paragraph = []
+                else:
+                    # Split long paragraphs into more readable chunks
+                    sentences = section.split('. ')
+                    for sentence in sentences:
+                        current_paragraph.append(sentence)
+                        if len(' '.join(current_paragraph)) > 200:  # Break long paragraphs
+                            paragraphs.append('. '.join(current_paragraph) + '.')
+                            current_paragraph = []
+            if current_paragraph:
+                paragraphs.append('. '.join(current_paragraph) + '.')
+            # Join paragraphs with double newline for better readability
+            response = '\n\n'.join(paragraphs)
+            return response
+        except Exception as e:
+            logging.error(f"Error generating answer: {str(e)}")
+            return "I apologize, but I encountered an error while generating the answer. Please try again or rephrase your question."
+    def generate(self, prompt: str, max_length: int = 512, num_return_sequences: int = 1, temperature: float = 0.7) -> List[str]:
+        """
+        Generate text using the T5 model
+        """
+        try:
+            # Encode the prompt
             inputs = self.tokenizer(
+                prompt,
                 return_tensors="pt",
+                max_length=max_length,
                 truncation=True,
                 padding=True
             )
+            # Generate response
+            with torch.no_grad():
                 outputs = self.model.generate(
+                    input_ids=inputs["input_ids"],
+                    attention_mask=inputs["attention_mask"],
                     max_length=max_length,
+                    num_return_sequences=num_return_sequences,
+                    temperature=temperature,
                     do_sample=True,
+                    top_p=0.95,
+                    top_k=50,
+                    no_repeat_ngram_size=3,
+                    early_stopping=True
                 )
+            # Decode and return the generated text
+            decoded_outputs = [
+                self.tokenizer.decode(output, skip_special_tokens=True)
+                for output in outputs
+            ]
+            return decoded_outputs
         except Exception as e:
+            logging.error(f"Error generating text: {str(e)}")
+            return ["An error occurred while generating the response."]
+    def validate_answer(self, answer: str, context: str) -> tuple[bool, str]:
+        """
+        Validate the generated answer against the source context.
+        Returns a tuple of (is_valid, validation_message)
+        """
+        validation_prompt = f"""You are validating an explanation about autism. Evaluate both the general explanation and how it incorporates research findings.
+ANSWER TO VALIDATE:
+{answer}
+RESEARCH CONTEXT:
+{context}
+EVALUATION CRITERIA:
+1. Accuracy of General Information:
+   - Basic autism concepts explained correctly
+   - Clear and accessible language
+   - Balanced perspective
+2. Research Integration:
+   - Research findings used appropriately
+   - No misrepresentation of studies
+   - Proper balance of general knowledge and research findings
+3. Explanation Quality:
+   - Clear and logical structure
+   - Technical terms explained
+   - Helpful examples or illustrations
+RESPOND IN THIS FORMAT:
+---
+VALID: [true/false]
+STRENGTHS: [list main strengths]
+CONCERNS: [list any issues]
+VERDICT: [final assessment]
+---
+Example Response:
+---
+VALID: true
+STRENGTHS:
+- Clear explanation of autism fundamentals
+- Research findings well integrated
+- Technical terms properly explained
+CONCERNS:
+- Minor: Could include more practical examples
+VERDICT: The answer provides an accurate and well-supported explanation that effectively combines general knowledge with research findings.
+---
+YOUR EVALUATION:"""
+        try:
+            validation_result = self.generate(
+                validation_prompt,
+                max_length=300,
+                temperature=0.3
+            )[0]
+            # Extract content between dashes
+            parts = validation_result.split('---')
+            if len(parts) >= 3:
+                content = parts[1].strip()
+                # Parse the structured content
+                lines = content.split('\n')
+                valid_line = next((line for line in lines if line.startswith('VALID:')), '')
+                verdict_line = next((line for line in lines if line.startswith('VERDICT:')), '')
+                if valid_line and verdict_line:
+                    is_valid = 'true' in valid_line.lower()
+                    verdict = verdict_line.replace('VERDICT:', '').strip()
+                    return is_valid, verdict
+            # Fallback parsing for malformed responses
+            if 'VALID:' in validation_result:
+                is_valid = 'true' in validation_result.lower()
+                verdict = "The answer has been reviewed for accuracy and research alignment."
+                return is_valid, verdict
+            logging.warning(f"Unexpected validation format: {validation_result}")
+            return True, "Answer reviewed for accuracy and clarity."
+        except Exception as e:
+            logging.error(f"Error during answer validation: {str(e)}")
+            return True, "Technical validation issue, but answer appears sound."
     def _get_fallback_response() -> str:
         """Provide a friendly, helpful fallback response"""
         return """I apologize, but I couldn't find enough specific research to properly answer your question. To help you get better information, you could:

services/research_fetcher.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import time
 import logging
 import random
-import arxiv
 import requests
 import xml.etree.ElementTree as ET
 from typing import List, Optional
 from functools import lru_cache
@@ -10,11 +10,13 @@ from scholarly import scholarly
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from models.paper import Paper
 from utils.text_processor import TextProcessor
 # Constants
 CACHE_SIZE = 128
 MAX_PAPERS = 5
 SCHOLAR_MAX_PAPERS = 3
 MAX_WORKERS = 3  # One thread per data source
 class ResearchFetcher:
@@ -31,13 +33,14 @@ class ResearchFetcher:
         self.executor.shutdown(wait=False)
     def _setup_scholarly(self):
-        """Configure scholarly with rotating user agents"""
         self.user_agents = [
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
         ]
-        scholarly.use_proxy(None)
     def _rotate_user_agent(self):
         """Rotate user agent for Google Scholar requests"""
@@ -72,70 +75,115 @@ class ResearchFetcher:
     @lru_cache(maxsize=CACHE_SIZE)
     def fetch_arxiv_papers(self, query: str) -> List[Paper]:
-        """Fetch papers from arXiv with improved filtering"""
         try:
-            client = arxiv.Client()
-            search_query = f"(ti:autism OR abs:autism) AND (ti:\"{query}\" OR abs:\"{query}\") AND cat:q-bio"
             search = arxiv.Search(
                 query=search_query,
-                max_results=MAX_PAPERS,
                 sort_by=arxiv.SortCriterion.Relevance
             )
             papers = []
-            for result in client.results(search):
-                title_lower = result.title.lower()
-                summary_lower = result.summary.lower()
-                if any(term in title_lower or term in summary_lower
-                    for term in ['autism', 'asd', 'autism spectrum disorder']):
-                    papers.append(Paper(
-                        title=result.title,
-                        abstract=result.summary,
-                        url=result.pdf_url,
-                        published=result.published.strftime("%Y-%m-%d"),
-                        relevance_score=1.0 if 'autism' in title_lower else 0.8,
-                        source='arxiv'
-                    ))
             return papers
         except Exception as e:
             logging.error(f"Error fetching arXiv papers: {str(e)}")
             return []
     @lru_cache(maxsize=CACHE_SIZE)
     def fetch_pubmed_papers(self, query: str) -> List[Paper]:
-        """Fetch papers from PubMed with improved error handling and rate limiting"""
         try:
-            base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
-            search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"
-            response = self._make_request_with_retry(
-                f"{base_url}/esearch.fcgi",
-                params={
-                    'db': 'pubmed',
-                    'term': search_term,
-                    'retmax': MAX_PAPERS,
-                    'sort': 'relevance',
-                    'retmode': 'xml'
-                }
-            )
-            if not response:
-                return []
-            root = ET.fromstring(response.content)
-            id_list = root.findall('.//Id')
             if not id_list:
                 return []
             papers = []
             for id_elem in id_list:
-                paper = self._fetch_paper_details(base_url, id_elem.text)
-                if paper:
-                    papers.append(paper)
             return papers
@@ -143,102 +191,54 @@ class ResearchFetcher:
             logging.error(f"Error fetching PubMed papers: {str(e)}")
             return []
-    def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
-        """Fetch individual paper details with rate limiting and retries"""
-        try:
-            response = self._make_request_with_retry(
-                f"{base_url}/efetch.fcgi",
-                params={
-                    'db': 'pubmed',
-                    'id': paper_id,
-                    'retmode': 'xml'
-                }
-            )
-            if not response:
-                return None
-            article = ET.fromstring(response.content).find('.//PubmedArticle')
-            if article is None:
-                return None
-            title = article.find('.//ArticleTitle')
-            abstract = article.find('.//Abstract/AbstractText')
-            year = article.find('.//PubDate/Year')
-            if title is not None and abstract is not None:
-                title_text = title.text.lower()
-                abstract_text = abstract.text.lower()
-                if any(term in title_text or term in abstract_text
-                      for term in ['autism', 'asd']):
-                    return Paper(
-                        title=title.text,
-                        abstract=abstract.text,
-                        url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
-                        published=year.text if year is not None else 'Unknown',
-                        relevance_score=1.0 if any(term in title_text
-                                                 for term in ['autism', 'asd']) else 0.5,
-                        source='pubmed'
-                    )
-        except Exception as e:
-            logging.error(f"Error fetching paper {paper_id}: {str(e)}")
-            return None
     @lru_cache(maxsize=CACHE_SIZE)
     def fetch_scholar_papers(self, query: str) -> List[Paper]:
-        """Fetch papers from Google Scholar with rate limiting"""
-        papers = []
         try:
-            if 'autism' not in query.lower():
-                search_query = f"autism {query}"
-            else:
-                search_query = query
-            scholarly.set_headers({'User-Agent': self._rotate_user_agent()})
-            search_results = scholarly.search_pubs(search_query)
-            count = 0
-            for result in search_results:
-                if count >= SCHOLAR_MAX_PAPERS:
-                    break
-                try:
-                    pub = result['bib']
-                    title_abstract = f"{pub.get('title', '')} {pub.get('abstract', '')}".lower()
-                    if not any(term in title_abstract for term in ['autism', 'asd']):
-                        continue
-                    abstract = pub.get('abstract', '')
-                    if not abstract and 'eprint' in result:
-                        abstract = "Abstract not available. Please refer to the full paper."
-                    url = pub.get('url', '')
-                    if not url and 'eprint' in result:
-                        url = result['eprint']
-                    papers.append(Paper(
-                        title=pub.get('title', 'Untitled'),
-                        abstract=abstract[:1000] + '...' if len(abstract) > 1000 else abstract,
-                        url=url,
-                        published=str(pub.get('year', 'Unknown')),
-                        relevance_score=1.0 if 'autism' in pub.get('title', '').lower() else 0.5,
-                        source='scholar'
-                    ))
-                    count += 1
-                    time.sleep(random.uniform(1.0, 2.0))
-                except Exception as e:
-                    logging.error(f"Error processing Scholar result: {str(e)}")
-                    continue
         except Exception as e:
             logging.error(f"Error fetching Google Scholar papers: {str(e)}")
-        return papers
     def fetch_all_papers(self, query: str) -> List[Paper]:
         """Fetch papers from all sources concurrently and combine results"""

 import time
 import logging
 import random
 import requests
+import arxiv
 import xml.etree.ElementTree as ET
 from typing import List, Optional
 from functools import lru_cache
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from models.paper import Paper
 from utils.text_processor import TextProcessor
+from bs4 import BeautifulSoup
 # Constants
 CACHE_SIZE = 128
 MAX_PAPERS = 5
 SCHOLAR_MAX_PAPERS = 3
+ARXIV_MAX_PAPERS = 5
 MAX_WORKERS = 3  # One thread per data source
 class ResearchFetcher:
         self.executor.shutdown(wait=False)
     def _setup_scholarly(self):
         """Configure scholarly with basic settings.

         Stores the pool of browser user agents on the instance and installs
         a page getter on ``scholarly`` that sends a randomly chosen user
         agent with every request.
         """
         self.user_agents = [
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
         ]
         # Seconds to wait on a request before giving up; without a timeout
         # a stalled connection would block the calling thread indefinitely.
         request_timeout = 10

         def get_page_with_random_agent(url):
             """Fetch *url* with a user agent picked at random per request."""
             return requests.get(
                 url,
                 headers={'User-Agent': random.choice(self.user_agents)},
                 timeout=request_timeout,
             )

         # NOTE(review): `_get_page` is a private attribute name; confirm that
         # the installed scholarly version actually reads it.
         scholarly._get_page = get_page_with_random_agent
     def _rotate_user_agent(self):
         """Rotate user agent for Google Scholar requests"""
     @lru_cache(maxsize=CACHE_SIZE)
     def _fetch_arxiv_papers_cached(self, query: str) -> tuple:
         """Query arXiv and return the matching papers as a tuple.

         Errors propagate to the caller. ``lru_cache`` does not store calls
         that raise, so a failed lookup is retried next time instead of being
         remembered as an empty result.

         NOTE(review): the cache is keyed on ``self`` as well as ``query``, so
         it holds a reference to the fetcher instance; revisit if instances
         are meant to be short-lived.
         """
         # Ensure query includes autism if not already present
         search_query = query if 'autism' in query.lower() else f"autism {query}"
         search = arxiv.Search(
             query=search_query,
             max_results=ARXIV_MAX_PAPERS,
             sort_by=arxiv.SortCriterion.Relevance
         )
         # Client.results(search) is used in place of the deprecated
         # Search.results().
         return tuple(
             Paper(
                 title=result.title,
                 authors=', '.join([author.name for author in result.authors]),
                 abstract=result.summary,
                 url=result.pdf_url,
                 publication_date=result.published.strftime("%Y-%m-%d"),
                 relevance_score=1.0 if 'autism' in result.title.lower() else 0.8,
                 source="arXiv"
             )
             for result in arxiv.Client().results(search)
         )

     def fetch_arxiv_papers(self, query: str) -> List[Paper]:
         """Fetch papers from arXiv.

         Args:
             query: Free-text search terms; "autism" is prepended when the
                 query does not already contain it.

         Returns:
             A new list of ``Paper`` objects on every call (the cached result
             is copied, so callers may reorder or extend the list), or an
             empty list if the lookup fails. Failures are logged and are not
             cached.
         """
         try:
             return list(self._fetch_arxiv_papers_cached(query))
         except Exception as e:
             logging.error(f"Error fetching arXiv papers: {str(e)}")
             return []
     @lru_cache(maxsize=CACHE_SIZE)
     def fetch_pubmed_papers(self, query: str) -> List[Paper]:
         """Fetch papers from PubMed through the NCBI E-utilities endpoints.

         An esearch request resolves the query to at most five PubMed IDs,
         then one efetch request per ID retrieves the article XML. The word
         "autism" is prepended to the query when it is not already present.

         Args:
             query: Free-text search terms.

         Returns:
             One Paper per article that could be parsed, or an empty list
             when no IDs are found or any request/parse step raises (the
             error is logged).
         """
         try:
             # Ensure query includes autism if not already present
             if 'autism' not in query.lower():
                 search_query = f"autism {query}"
             else:
                 search_query = query

             # Encode the query for URL
             encoded_query = requests.utils.quote(search_query)

             # Search PubMed; the timeout keeps a stalled connection from
             # blocking the caller indefinitely.
             search_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={encoded_query}&retmax=5"
             search_response = requests.get(search_url, timeout=10)
             search_tree = ET.fromstring(search_response.content)

             # Get IDs of papers
             id_list = search_tree.findall('.//Id')
             if not id_list:
                 return []

             # Get details for each paper
             papers = []
             for id_elem in id_list:
                 paper_id = id_elem.text
                 details_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={paper_id}&retmode=xml"
                 details_response = requests.get(details_url, timeout=10)
                 details_tree = ET.fromstring(details_response.content)

                 # Extract article data
                 article = details_tree.find('.//Article')
                 if article is None:
                     continue

                 # Titles may contain inline markup (<i>, <sub>, ...), in
                 # which case .text is None or truncated; itertext() gathers
                 # the full text.
                 title_elem = article.find('.//ArticleTitle')
                 title = ''
                 if title_elem is not None:
                     title = ''.join(title_elem.itertext()).strip()
                 if not title:
                     title = "No title available"

                 # Structured abstracts have several AbstractText sections;
                 # keep all of them rather than only the first.
                 abstract_sections = [
                     ''.join(section.itertext()).strip()
                     for section in article.findall('.//Abstract/AbstractText')
                 ]
                 abstract = ' '.join(s for s in abstract_sections if s)
                 if not abstract:
                     abstract = "No abstract available"

                 # Only authors with both name parts are listed.
                 authors = []
                 for author in article.findall('.//Author'):
                     last_name = author.findtext('LastName')
                     fore_name = author.findtext('ForeName')
                     if last_name and fore_name:
                         authors.append(f"{fore_name} {last_name}")

                 # A date without a year is meaningless, so report it as
                 # unknown instead of producing something like "-01-01".
                 pub_date = article.find('.//PubDate')
                 year = pub_date.findtext('Year') if pub_date is not None else None
                 if year:
                     month = pub_date.findtext('Month') or '01'
                     day = pub_date.findtext('Day') or '01'
                     pub_date_str = f"{year}-{month}-{day}"
                 else:
                     pub_date_str = "Unknown"

                 # Create Paper object
                 paper = Paper(
                     title=title,
                     authors=', '.join(authors) if authors else "Unknown Authors",
                     abstract=abstract,
                     url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
                     publication_date=pub_date_str,
                     relevance_score=1.0 if 'autism' in title.lower() else 0.8,
                     source="PubMed"
                 )
                 papers.append(paper)
             return papers
         except Exception as e:
             logging.error(f"Error fetching PubMed papers: {str(e)}")
             return []
     @lru_cache(maxsize=CACHE_SIZE)
     def fetch_scholar_papers(self, query: str) -> List[Paper]:
         """Fetch papers from Google Scholar by scraping the results page.

         The word "autism" is prepended to the query when it is not already
         present, matching the arXiv and PubMed fetchers. Only the first
         five result entries are considered.

         Args:
             query: Free-text search terms.

         Returns:
             One Paper per parsed result, or an empty list when the response
             status is not 200 or anything raises (the error is logged).
         """
         try:
             # Ensure query includes autism if not already present
             if 'autism' not in query.lower():
                 search_query = f"autism {query}"
             else:
                 search_query = query

             headers = {'User-Agent': random.choice(self.user_agents)}
             encoded_query = requests.utils.quote(search_query)
             search_url = f'https://scholar.google.com/scholar?q={encoded_query}&hl=en&as_sdt=0,5'
             response = requests.get(search_url, headers=headers, timeout=10)
             if response.status_code != 200:
                 logging.error(f"Google Scholar returned status code {response.status_code}")
                 return []

             # Use BeautifulSoup to parse the response
             soup = BeautifulSoup(response.text, 'html.parser')
             papers = []
             for result in soup.select('.gs_ri')[:5]:  # Limit to first 5 results
                 title_elem = result.select_one('.gs_rt')
                 if not title_elem:
                     continue
                 authors_elem = result.select_one('.gs_a')
                 snippet_elem = result.select_one('.gs_rs')

                 title = title_elem.get_text(strip=True)
                 authors = authors_elem.get_text(strip=True) if authors_elem else "Unknown Authors"
                 abstract = snippet_elem.get_text(strip=True) if snippet_elem else ""

                 # Look the anchor up once and tolerate a missing href so a
                 # single malformed entry does not discard every result.
                 link_elem = title_elem.find('a')
                 paper_url = link_elem.get('href', "") if link_elem is not None else ""

                 paper = Paper(
                     title=title,
                     authors=authors,
                     abstract=abstract,
                     url=paper_url,
                     publication_date="",  # Date not easily available
                     relevance_score=0.8,  # Default score
                     source="Google Scholar"
                 )
                 papers.append(paper)
             return papers
         except Exception as e:
             logging.error(f"Error fetching Google Scholar papers: {str(e)}")
             return []
     def fetch_all_papers(self, query: str) -> List[Paper]:
         """Fetch papers from all sources concurrently and combine results"""

utils/text_processor.py CHANGED Viewed

@@ -1,26 +1,61 @@
 import re
 class TextProcessor:
     """Helpers for cleaning paper text and formatting it as model context."""

     @staticmethod
     def clean_text(text: str) -> str:
         """Clean and normalize text content.

         Characters other than word characters, whitespace and basic
         punctuation are replaced by spaces, non-ASCII characters are
         dropped, and runs of whitespace are collapsed to a single space.

         Args:
             text: Raw text; may be empty or None.

         Returns:
             The cleaned text, or "" for empty input.
         """
         if not text:
             return ""
         text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
         text = text.encode('ascii', 'ignore').decode('ascii')
         # Collapse whitespace last: dropping non-ASCII characters above can
         # leave two spaces side by side.
         text = re.sub(r'\s+', ' ', text)
         return text.strip()

     @staticmethod
     def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
         """Format a paper's title and abstract as one context entry.

         Args:
             title: Paper title.
             abstract: Paper abstract.
             max_length: Maximum abstract length; longer abstracts are cut
                 and end with "...".

         Returns:
             A "Title: ... / Abstract: ... / ---" block.
         """
         title = TextProcessor.clean_text(title)
         abstract = TextProcessor.clean_text(abstract)
         if len(abstract) > max_length:
             # Clamp so a tiny max_length cannot turn into a negative slice.
             abstract = abstract[:max(max_length - 3, 0)] + "..."
         return f"""Title: {title}\nAbstract: {abstract}\n---"""

 import re
+from typing import List
+from models.paper import Paper
 class TextProcessor:
     """Helpers that clean paper text and assemble it into model context."""

     @staticmethod
     def clean_text(text: str) -> str:
         """Clean and normalize text content.

         Args:
             text: Raw text; may be empty or None (e.g. a missing abstract).

         Returns:
             The text with everything except word characters, whitespace and
             basic punctuation replaced by spaces, stripped at both ends.
             Empty or None input yields "".
         """
         if not text:
             return ""
         # Remove special characters but keep basic punctuation
         text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
         return text.strip()

     def format_paper(self, title: str, abstract: str) -> str:
         """Format paper title and abstract for context."""
         title = self.clean_text(title)
         abstract = self.clean_text(abstract)
         return f"Title: {title}\nAbstract: {abstract}"

     def create_context(self, papers: List["Paper"]) -> str:
         """Create a context string from a list of papers.

         Args:
             papers: Objects exposing title, authors, publication_date,
                 source and abstract attributes.

         Returns:
             One section per paper, numbered from 1, with a rule of 50 "="
             characters between consecutive sections. An empty list yields
             an empty string.
         """
         context_parts = []
         for i, paper in enumerate(papers, 1):
             # The leading and trailing "" give each section a blank edge.
             section_lines = [
                 "",
                 f"Research Paper {i}:",
                 f"Title: {self.clean_text(paper.title)}",
                 "Key Points:",
                 f"- Authors: {paper.authors or 'Not specified'}",
                 f"- Publication Date: {paper.publication_date}",
                 f"- Source: {paper.source}",
                 "Main Findings:",
                 self.format_abstract(paper.abstract),
                 "",
             ]
             context_parts.append("\n".join(section_lines))

         # Build the separator first: str.join binds tighter than "+", so
         # concatenating inline would emit the rule only once at the start.
         separator = "\n" + "=" * 50 + "\n"
         return separator.join(context_parts)

     def format_abstract(self, abstract: str) -> str:
         """Format abstract into bullet points for better readability.

         Sentences (split on ".") are accumulated into a bullet until the
         bullet exceeds 100 characters or the abstract ends.

         Args:
             abstract: Raw abstract text; may be empty or None.

         Returns:
             Newline-separated "- ..." bullets, or "" when there is no text.
         """
         clean_abstract = self.clean_text(abstract)
         sentences = [s.strip() for s in clean_abstract.split('.') if s.strip()]

         bullet_points = []
         current_point = []
         last_index = len(sentences) - 1
         for index, sentence in enumerate(sentences):
             current_point.append(sentence)
             # Detect the end by position, not by value: a sentence that
             # merely repeats the final one must not flush the bullet early.
             if index == last_index or len(' '.join(current_point)) > 100:
                 bullet_points.append('- ' + '. '.join(current_point) + '.')
                 current_point = []
         return '\n'.join(bullet_points)