wakeupmh committed on
Commit
3af593c
·
1 Parent(s): 208e459

refactor: structure

Browse files
app.py CHANGED
@@ -1,341 +1,77 @@
1
  import streamlit as st
2
- import pandas as pd
3
- import torch
4
  import logging
5
- import os
6
- from transformers import AutoTokenizer, T5ForConditionalGeneration
7
- import arxiv
8
- import requests
9
- import xml.etree.ElementTree as ET
10
- import re
11
- from functools import lru_cache
12
- from typing import List, Dict, Optional
13
- from dataclasses import dataclass
14
- from concurrent.futures import ThreadPoolExecutor
15
 
16
  # Configure logging
17
- logging.basicConfig(level=logging.INFO)
 
 
 
18
 
19
- # Define data paths and constants
20
- DATA_DIR = "/data" if os.path.exists("/data") else "."
21
- DATASET_DIR = os.path.join(DATA_DIR, "rag_dataset")
22
- DATASET_PATH = os.path.join(DATASET_DIR, "dataset")
23
- MODEL_PATH = "google/flan-t5-small"
24
-
25
- # Constants for better maintainability
26
- MAX_ABSTRACT_LENGTH = 1000
27
- MAX_PAPERS = 5
28
- CACHE_SIZE = 128
29
-
30
- @dataclass
31
- class Paper:
32
- title: str
33
- abstract: str
34
- url: str
35
- published: str
36
- relevance_score: float
37
-
38
- class TextProcessor:
39
- @staticmethod
40
- def clean_text(text: str) -> str:
41
- """Clean and normalize text content with improved handling"""
42
- if not text:
43
- return ""
44
-
45
- # Improved text cleaning
46
- text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
47
- text = re.sub(r'\s+', ' ', text)
48
- text = text.encode('ascii', 'ignore').decode('ascii') # Better character handling
49
-
50
- return text.strip()
51
-
52
- @staticmethod
53
- def format_paper(title: str, abstract: str) -> str:
54
- """Format paper information with improved structure"""
55
- title = TextProcessor.clean_text(title)
56
- abstract = TextProcessor.clean_text(abstract)
57
-
58
- if len(abstract) > MAX_ABSTRACT_LENGTH:
59
- abstract = abstract[:MAX_ABSTRACT_LENGTH-3] + "..."
60
-
61
- return f"""Title: {title}\nAbstract: {abstract}\n---"""
62
-
63
- class ResearchFetcher:
64
  def __init__(self):
65
- self.session = requests.Session() # Reuse connection
 
 
 
 
66
 
67
- @lru_cache(maxsize=CACHE_SIZE)
68
- def fetch_arxiv_papers(self, query: str) -> List[Paper]:
69
- """Fetch papers from arXiv with improved filtering"""
70
- client = arxiv.Client()
71
- search_query = f"(ti:autism OR abs:autism) AND (ti:\"{query}\" OR abs:\"{query}\") AND cat:q-bio"
72
-
73
- search = arxiv.Search(
74
- query=search_query,
75
- max_results=MAX_PAPERS,
76
- sort_by=arxiv.SortCriterion.Relevance
77
- )
78
-
79
- papers = []
80
- for result in client.results(search):
81
- title_lower = result.title.lower()
82
- summary_lower = result.summary.lower()
83
-
84
- if any(term in title_lower or term in summary_lower
85
- for term in ['autism', 'asd', 'autism spectrum disorder']):
86
- papers.append(Paper(
87
- title=result.title,
88
- abstract=result.summary,
89
- url=result.pdf_url,
90
- published=result.published.strftime("%Y-%m-%d"),
91
- relevance_score=1.0 if 'autism' in title_lower else 0.8
92
- ))
93
-
94
- return papers
95
-
96
- @lru_cache(maxsize=CACHE_SIZE)
97
- def fetch_pubmed_papers(self, query: str) -> List[Paper]:
98
- """Fetch papers from PubMed with improved error handling"""
99
- base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
100
- search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"
101
-
102
- try:
103
- # Fetch IDs efficiently
104
- response = self.session.get(
105
- f"{base_url}/esearch.fcgi",
106
- params={
107
- 'db': 'pubmed',
108
- 'term': search_term,
109
- 'retmax': MAX_PAPERS,
110
- 'sort': 'relevance',
111
- 'retmode': 'xml'
112
- },
113
- timeout=10
114
- )
115
- response.raise_for_status()
116
-
117
- root = ET.fromstring(response.content)
118
- id_list = root.findall('.//Id')
119
-
120
- if not id_list:
121
- return []
122
-
123
- # Fetch details in parallel
124
- with ThreadPoolExecutor(max_workers=2) as executor:
125
- paper_futures = [
126
- executor.submit(self._fetch_paper_details, base_url, id_elem.text)
127
- for id_elem in id_list
128
- ]
129
-
130
- return [paper for future in paper_futures
131
- for paper in [future.result()] if paper is not None]
132
-
133
- except Exception as e:
134
- logging.error(f"Error fetching PubMed papers: {str(e)}")
135
- return []
136
-
137
- def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
138
- """Fetch individual paper details with timeout"""
139
- try:
140
- response = self.session.get(
141
- f"{base_url}/efetch.fcgi",
142
- params={
143
- 'db': 'pubmed',
144
- 'id': paper_id,
145
- 'retmode': 'xml'
146
- },
147
- timeout=5
148
- )
149
- response.raise_for_status()
150
-
151
- article = ET.fromstring(response.content).find('.//PubmedArticle')
152
- if article is None:
153
- return None
154
-
155
- title = article.find('.//ArticleTitle')
156
- abstract = article.find('.//Abstract/AbstractText')
157
- year = article.find('.//PubDate/Year')
158
-
159
- if title is not None and abstract is not None:
160
- title_text = title.text.lower()
161
- abstract_text = abstract.text.lower()
162
-
163
- if any(term in title_text or term in abstract_text
164
- for term in ['autism', 'asd']):
165
- return Paper(
166
- title=title.text,
167
- abstract=abstract.text,
168
- url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
169
- published=year.text if year is not None else 'Unknown',
170
- relevance_score=1.0 if any(term in title_text
171
- for term in ['autism', 'asd']) else 0.5
172
- )
173
-
174
- except Exception as e:
175
- logging.error(f"Error fetching paper {paper_id}: {str(e)}")
176
  return None
177
-
178
- class ModelHandler:
179
- def __init__(self):
180
- self.model = None
181
- self.tokenizer = None
182
- self._initialize_model()
183
 
184
- @staticmethod
185
- @st.cache_resource
186
- def _load_model():
187
- """Load FLAN-T5 Small model with optimized settings"""
188
- try:
189
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
190
- model = T5ForConditionalGeneration.from_pretrained(
191
- MODEL_PATH,
192
- device_map={"": "cpu"},
193
- torch_dtype=torch.float32,
194
- low_cpu_mem_usage=True
195
- )
196
- return model, tokenizer
197
- except Exception as e:
198
- logging.error(f"Error loading model: {str(e)}")
199
- return None, None
200
 
201
- def _initialize_model(self):
202
- """Initialize model and tokenizer"""
203
- self.model, self.tokenizer = self._load_model()
204
-
205
- def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
206
- """Generate answer with FLAN-T5 optimized parameters"""
207
- if self.model is None or self.tokenizer is None:
208
- return "Error: Model loading failed. Please try again later."
209
-
210
- try:
211
- # FLAN-T5 responds better to direct instruction prompts
212
- input_text = f"""You are an expert in autism research. Provide a clear, structured, and evidence-based explanation of autism using the provided research context.
213
-
214
- Research Context:
215
- {context}
216
-
217
- Instructions:
218
- 1. Start with a concise definition of autism.
219
- 2. Explain the key characteristics and symptoms.
220
- 3. Discuss potential causes and contributing factors (e.g., genetic, environmental).
221
- 4. Mention current research findings and treatments.
222
- 5. Use clear, accessible language.
223
- 6. Cite specific studies or papers when relevant.
224
-
225
- Answer:"""
226
-
227
- inputs = self.tokenizer(
228
- input_text,
229
- return_tensors="pt",
230
- max_length=1024,
231
- truncation=True,
232
- padding=True
233
- )
234
-
235
- with torch.inference_mode():
236
- outputs = self.model.generate(
237
- **inputs,
238
- max_length=max_length,
239
- min_length=100,
240
- num_beams=3,
241
- length_penalty=1.0,
242
- temperature=0.6,
243
- repetition_penalty=1.2,
244
- early_stopping=True,
245
- no_repeat_ngram_size=2,
246
- do_sample=True,
247
- top_k=30,
248
- top_p=0.92
249
- )
250
-
251
- response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
252
- response = TextProcessor.clean_text(response)
253
-
254
- if len(response.strip()) < 50:
255
- return self._get_fallback_response()
256
-
257
- return self._format_response(response)
258
-
259
- except Exception as e:
260
- logging.error(f"Error generating response: {str(e)}")
261
- return "Error: Could not generate response. Please try again."
262
-
263
- @staticmethod
264
- def _get_fallback_response() -> str:
265
- """Provide a structured fallback response"""
266
- return """Based on the available research, I cannot provide a specific answer to your question. Please try:
267
-
268
- 1. Rephrasing your question to be more specific
269
- 2. Asking about:
270
- - Specific behaviors or characteristics
271
- - Intervention strategies
272
- - Research findings
273
- - Support approaches
274
-
275
- This will help me provide more accurate, research-based information."""
276
-
277
- @staticmethod
278
- def _format_response(response: str) -> str:
279
- """Format the response for better readability"""
280
- sections = response.split('\n\n')
281
- formatted_sections = []
282
-
283
- for i, section in enumerate(sections):
284
- if i == 0:
285
- formatted_sections.append(f"### Overview\n{section}")
286
- elif i == len(sections) - 1:
287
- formatted_sections.append(f"### Key Takeaways\n{section}")
288
- else:
289
- formatted_sections.append(section)
290
-
291
- return '\n\n'.join(formatted_sections)
292
-
293
- def main():
294
- st.title("🧩 AMA Autism")
295
- st.write("""
296
- Ask questions about autism and get research-based answers from scientific papers.
297
- For best results, be specific in your questions.
298
- """)
299
-
300
- query = st.text_input("What would you like to know about autism? ✨")
301
-
302
- if query:
303
- with st.status("Researching your question...") as status:
304
- # Initialize handlers
305
- research_fetcher = ResearchFetcher()
306
- model_handler = ModelHandler()
307
-
308
- # Fetch papers concurrently
309
- with ThreadPoolExecutor(max_workers=2) as executor:
310
- arxiv_future = executor.submit(research_fetcher.fetch_arxiv_papers, query)
311
- pubmed_future = executor.submit(research_fetcher.fetch_pubmed_papers, query)
312
-
313
- papers = arxiv_future.result() + pubmed_future.result()
314
-
315
- if not papers:
316
- st.warning("No relevant research papers found. Please try a different search term.")
317
- return
318
-
319
- # Sort papers by relevance
320
- papers.sort(key=lambda x: x.relevance_score, reverse=True)
321
-
322
- # Prepare context from top papers
323
- context = "\n".join(
324
- TextProcessor.format_paper(paper.title, paper.abstract)
325
- for paper in papers[:3]
326
- )
327
-
328
- # Generate answer
329
- st.write("Analyzing research papers...")
330
- answer = model_handler.generate_answer(query, context)
331
- status.write("I've got it!")
332
-
333
  with st.expander("📚 View source papers"):
334
  for paper in papers:
335
  st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")
 
 
 
 
336
 
337
- st.success("Research analysis complete!")
338
- st.markdown(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
  if __name__ == "__main__":
341
  main()
 
1
  import streamlit as st
 
 
2
  import logging
3
+ from services.research_fetcher import ResearchFetcher
4
+ from services.model_handler import ModelHandler
5
+ from utils.text_processor import TextProcessor
 
 
 
 
 
 
 
6
 
7
  # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
11
+ )
12
 
13
+ class AutismResearchApp:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def __init__(self):
15
+ """Initialize the application components"""
16
+ self.research_fetcher = ResearchFetcher()
17
+ self.model_handler = ModelHandler()
18
+ self.text_processor = TextProcessor()
19
+ self._setup_streamlit()
20
 
21
+ def _setup_streamlit(self):
22
+ """Setup Streamlit UI components"""
23
+ st.title("🧩 AMA Autism")
24
+ st.write("""
25
+ Ask questions about autism and get research-based answers from scientific papers.
26
+ For best results, be specific in your questions.
27
+ """)
28
+
29
+ def _fetch_research(self, query: str):
30
+ """Fetch research papers for the given query"""
31
+ papers = self.research_fetcher.fetch_all_papers(query)
32
+ if not papers:
33
+ st.warning("No relevant research papers found. Please try a different search term.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  return None
35
+ return papers
 
 
 
 
 
36
 
37
+ def _generate_answer(self, query: str, papers):
38
+ """Generate answer based on research papers"""
39
+ context = "\n".join(
40
+ self.text_processor.format_paper(paper.title, paper.abstract)
41
+ for paper in papers[:3]
42
+ )
43
+ return self.model_handler.generate_answer(query, context)
 
 
 
 
 
 
 
 
 
44
 
45
+ def _display_sources(self, papers):
46
+ """Display source papers in an expander"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  with st.expander("📚 View source papers"):
48
  for paper in papers:
49
  st.markdown(f"- [{paper.title}]({paper.url}) ({paper.published})")
50
+
51
+ def run(self):
52
+ """Run the main application loop"""
53
+ query = st.text_input("What would you like to know about autism? ✨")
54
 
55
+ if query:
56
+ with st.status("Researching your question...") as status:
57
+ # Fetch papers
58
+ papers = self._fetch_research(query)
59
+ if not papers:
60
+ return
61
+
62
+ # Generate and display answer
63
+ st.write("Analyzing research papers...")
64
+ answer = self._generate_answer(query, papers)
65
+ status.write("I've got it!")
66
+
67
+ # Display results
68
+ self._display_sources(papers)
69
+ st.success("Research analysis complete!")
70
+ st.markdown(answer)
71
+
72
+ def main():
73
+ app = AutismResearchApp()
74
+ app.run()
75
 
76
  if __name__ == "__main__":
77
  main()
models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
models/paper.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ @dataclass
4
+ class Paper:
5
+ title: str
6
+ abstract: str
7
+ url: str
8
+ published: str
9
+ relevance_score: float
10
+ source: str = "unknown" # Track where the paper came from
requirements.txt CHANGED
@@ -7,4 +7,5 @@ accelerate>=0.26.0
7
  numpy>=1.24.0
8
  pandas>=2.2.0
9
  requests>=2.31.0
10
- arxiv>=2.1.0
 
 
7
  numpy>=1.24.0
8
  pandas>=2.2.0
9
  requests>=2.31.0
10
+ arxiv>=2.1.0
11
+ scholarly==1.7.11
services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
services/model_handler.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
4
+ import streamlit as st
5
+ from utils.text_processor import TextProcessor
6
+
7
+ MODEL_PATH = "google/flan-t5-small"
8
+
9
class ModelHandler:
    """Load the FLAN-T5 model once and turn research context into an answer.

    The instance holds the model and tokenizer (or ``None`` for both when
    loading failed), builds the prompt, runs generation and post-processes
    the decoded text into Markdown.
    """

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self._initialize_model()

    @staticmethod
    @st.cache_resource
    def _load_model():
        """Load the tokenizer and model named by MODEL_PATH onto the CPU.

        Returns:
            A ``(model, tokenizer)`` tuple.

        Raises:
            Whatever ``from_pretrained`` raises. The error is deliberately not
            caught here: this function is wrapped by ``st.cache_resource``, and
            returning ``(None, None)`` from it would store the failure in the
            cache so that later calls never retry the load.
        """
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = T5ForConditionalGeneration.from_pretrained(
            MODEL_PATH,
            device_map={"": "cpu"},
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        return model, tokenizer

    def _initialize_model(self):
        """Populate ``self.model`` / ``self.tokenizer``; both are None on failure."""
        try:
            self.model, self.tokenizer = self._load_model()
        except Exception as e:
            logging.error("Error loading model: %s", e)
            self.model, self.tokenizer = None, None

    def generate_answer(self, question: str, context: str, max_length: int = 512) -> str:
        """Generate a conversational answer to *question* from *context*.

        Args:
            question: The user's question, inserted verbatim into the prompt.
            context: Formatted paper titles/abstracts to ground the answer.
            max_length: Upper bound passed to ``model.generate``.

        Returns:
            Markdown text. On a failed model load or a generation error a
            short "Error: ..." message is returned instead of raising; when
            the cleaned output is shorter than 50 characters the fallback
            guidance text is returned.
        """
        if self.model is None or self.tokenizer is None:
            return "Error: Model loading failed. Please try again later."

        try:
            input_text = (
                "You are an expert explaining autism research to a general audience. "
                "Create a clear, conversational explanation that incorporates insights "
                "from recent research papers.\n"
                "\n"
                f"Question: {question}\n"
                "\n"
                "Available Research:\n"
                f"{context}\n"
                "\n"
                "Instructions:\n"
                "1. Write in a clear, conversational style\n"
                "2. Start with a brief, general explanation\n"
                '3. Support your points with research, using phrases like '
                '"According to [Paper Title]..." or "Research has shown..."\n'
                "4. Focus on making complex concepts understandable\n"
                "5. Maintain a helpful and informative tone\n"
                "\n"
                "Remember to write like you're explaining to someone interested in "
                "learning about autism, not like you're writing a technical paper."
            )

            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
                padding=True,
            )

            with torch.inference_mode():
                outputs = self.model.generate(
                    **inputs,
                    max_length=max_length,
                    min_length=150,
                    num_beams=4,
                    length_penalty=1.0,
                    temperature=0.8,
                    repetition_penalty=1.3,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    do_sample=True,
                    top_k=40,
                    top_p=0.95,
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = TextProcessor.clean_text(response)

            # Very short output is treated as "no usable answer".
            if len(response.strip()) < 50:
                return self._get_fallback_response()

            return self._format_response(response)

        except Exception as e:
            logging.error("Error generating response: %s", e)
            return "Error: Could not generate response. Please try again."

    @staticmethod
    def _get_fallback_response() -> str:
        """Return guidance shown when no usable answer could be generated."""
        # NOTE(review): the "- ..." sub-items may have carried leading
        # indentation in the original literal - confirm against the repository.
        return (
            "I apologize, but I couldn't find enough specific research to properly "
            "answer your question. To help you get better information, you could:\n"
            "\n"
            "\u2022 Ask about specific aspects of autism you're interested in\n"
            "\u2022 Focus on particular topics like:\n"
            "- Early signs and diagnosis\n"
            "- Treatment approaches\n"
            "- Latest research findings\n"
            "- Support strategies\n"
            "\n"
            "This will help me provide more detailed, research-backed information "
            "that's relevant to your interests."
        )

    @staticmethod
    def _format_response(response: str) -> str:
        """Turn decoded model text into Markdown paragraphs and list items.

        A word is treated as a list marker only when it is exactly the next
        expected number followed by a period ("1.", then "2.", ...) and more
        text follows it. Decimals such as "1.5" are therefore left alone and
        lists of any length are recognised. Each list item becomes a Markdown
        "- " bullet (a real list, so items are not merged onto one line when
        rendered). Paragraphs and items that mention "According to" or
        "Research" are wrapped in ``*...*`` so citations stand out.

        Args:
            response: Cleaned model output.

        Returns:
            Markdown text with blocks separated by blank lines; an empty
            string when *response* contains no words.
        """
        blocks = []  # (is_list_item, text) pairs in reading order
        next_number = 1

        for paragraph in response.split('\n\n'):
            words = paragraph.split()
            pending = []
            in_item = False
            for index, word in enumerate(words):
                has_text_after = index + 1 < len(words)
                if word == f"{next_number}." and has_text_after:
                    if pending:
                        blocks.append((in_item, " ".join(pending)))
                    pending = []
                    in_item = True
                    next_number += 1
                else:
                    pending.append(word)
            if pending:
                blocks.append((in_item, " ".join(pending)))

        rendered = []
        list_lines = []
        for is_item, text in blocks:
            if "According to" in text or "Research" in text:
                text = f"*{text}*"
            if is_item:
                list_lines.append(f"- {text}")
                continue
            if list_lines:
                # Consecutive items form one list block.
                rendered.append("\n".join(list_lines))
                list_lines = []
            rendered.append(text)
        if list_lines:
            rendered.append("\n".join(list_lines))

        return '\n\n'.join(rendered)
services/research_fetcher.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import logging
3
+ import random
4
+ import arxiv
5
+ import requests
6
+ import xml.etree.ElementTree as ET
7
+ from typing import List, Optional
8
+ from functools import lru_cache
9
+ from scholarly import scholarly
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from models.paper import Paper
12
+ from utils.text_processor import TextProcessor
13
+
14
+ # Constants
15
+ CACHE_SIZE = 128
16
+ MAX_PAPERS = 5
17
+ SCHOLAR_MAX_PAPERS = 3
18
+ MAX_WORKERS = 3 # One thread per data source
19
+
20
class ResearchFetcher:
    """Gather autism-related papers from arXiv, PubMed and Google Scholar.

    Each ``fetch_*`` method returns a fresh list of ``Paper`` objects and never
    raises: failures are logged and whatever was collected so far is returned.
    Successful lookups are memoised per instance, so the cache is released
    together with the fetcher.
    """

    # Lower-cased substrings that mark a title/abstract as autism-related.
    _AUTISM_TERMS = ('autism', 'asd')

    def __init__(self):
        # The executor is created first so that __del__ can always release it,
        # even when a later setup step raises.
        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
        self.session = requests.Session()
        self._last_request_time = 0
        self._min_request_interval = 0.34  # seconds; about three requests per second
        self._max_retries = 3
        # (source label, query) -> tuple of papers. Held on the instance rather
        # than via lru_cache on the methods, which would key on ``self`` and
        # keep every fetcher alive for the lifetime of the cache.
        self._cache = {}
        self._setup_scholarly()

    def __del__(self):
        """Release the thread pool when the fetcher is garbage collected."""
        executor = getattr(self, "executor", None)
        if executor is not None:
            executor.shutdown(wait=False)

    def _setup_scholarly(self):
        """Prepare the user-agent pool and reset scholarly's proxy setting."""
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        # NOTE(review): confirm that the pinned scholarly release accepts None here.
        scholarly.use_proxy(None)

    def _rotate_user_agent(self):
        """Return a randomly chosen user-agent string from the pool."""
        return random.choice(self.user_agents)

    def _wait_for_rate_limit(self):
        """Sleep just long enough to keep requests ``_min_request_interval`` apart."""
        elapsed = time.time() - self._last_request_time
        if elapsed < self._min_request_interval:
            time.sleep(self._min_request_interval - elapsed)
        self._last_request_time = time.time()

    def _make_request_with_retry(self, url: str, params: dict, timeout: int = 10) -> Optional[requests.Response]:
        """GET *url* through the shared session with rate limiting and retries.

        Args:
            url: Endpoint to request.
            params: Query-string parameters.
            timeout: Per-request timeout in seconds.

        Returns:
            The successful response, or ``None`` once ``_max_retries`` attempts
            have failed.
        """
        for attempt in range(self._max_retries):
            try:
                self._wait_for_rate_limit()
                response = self.session.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                # ``e.response`` is None for connection-level failures.
                status = getattr(e.response, "status_code", None)
                if isinstance(e, requests.exceptions.HTTPError) and status == 429:
                    wait_time = (attempt + 1) * self._min_request_interval * 2
                    logging.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                if attempt == self._max_retries - 1:
                    logging.error(f"Error after {self._max_retries} retries: {str(e)}")
                    return None
        return None

    def _fetch_cached(self, label: str, query: str, source) -> List[Paper]:
        """Return papers for *query* from *source*, memoising successful runs.

        Args:
            label: Source name used in the cache key and in log messages.
            query: The user's search text.
            source: Callable taking the query and returning an iterable of
                ``Paper``; it may raise part-way through.

        Returns:
            A new list on every call. When *source* raises, the papers
            collected before the failure are returned and nothing is cached,
            so the next call retries.
        """
        key = (label, query)
        cached = self._cache.get(key)
        if cached is not None:
            return list(cached)

        papers = []
        try:
            for paper in source(query):
                papers.append(paper)
        except Exception as e:
            logging.error("Error fetching %s papers: %s", label, e)
            return papers

        if len(self._cache) >= CACHE_SIZE:
            # Cheap bound on memory: start over instead of tracking recency.
            self._cache.clear()
        self._cache[key] = tuple(papers)
        return papers

    def _is_autism_related(self, *texts: str) -> bool:
        """Return True when any of *texts* (already lower-cased) names autism."""
        return any(term in text for text in texts for term in self._AUTISM_TERMS)

    def _iter_arxiv_papers(self, query: str):
        """Yield autism-related q-bio papers from arXiv matching *query*."""
        client = arxiv.Client()
        search_query = f"(ti:autism OR abs:autism) AND (ti:\"{query}\" OR abs:\"{query}\") AND cat:q-bio"
        search = arxiv.Search(
            query=search_query,
            max_results=MAX_PAPERS,
            sort_by=arxiv.SortCriterion.Relevance
        )

        for result in client.results(search):
            title_lower = result.title.lower()
            summary_lower = result.summary.lower()
            if not self._is_autism_related(title_lower, summary_lower):
                continue
            yield Paper(
                title=result.title,
                abstract=result.summary,
                url=result.pdf_url,
                published=result.published.strftime("%Y-%m-%d"),
                relevance_score=1.0 if 'autism' in title_lower else 0.8,
                source='arxiv'
            )

    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
        """Fetch up to MAX_PAPERS autism-related papers from arXiv."""
        return self._fetch_cached("arXiv", query, self._iter_arxiv_papers)

    def _iter_pubmed_papers(self, query: str):
        """Yield PubMed papers for *query*: one search, then one fetch per id.

        Raises:
            RuntimeError: when the initial search request fails, so the
                failure is reported and not cached as "no results".
        """
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        search_term = f"(autism[Title/Abstract] OR ASD[Title/Abstract]) AND ({query}[Title/Abstract])"

        response = self._make_request_with_retry(
            f"{base_url}/esearch.fcgi",
            params={
                'db': 'pubmed',
                'term': search_term,
                'retmax': MAX_PAPERS,
                'sort': 'relevance',
                'retmode': 'xml'
            }
        )
        if not response:
            raise RuntimeError("PubMed search request failed")

        root = ET.fromstring(response.content)
        for id_elem in root.findall('.//Id'):
            paper = self._fetch_paper_details(base_url, id_elem.text)
            if paper:
                yield paper

    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
        """Fetch up to MAX_PAPERS autism-related papers from PubMed."""
        return self._fetch_cached("PubMed", query, self._iter_pubmed_papers)

    @staticmethod
    def _element_text(node) -> str:
        """Return all text inside *node*, including text after inline markup."""
        return "".join(node.itertext()).strip()

    def _fetch_paper_details(self, base_url: str, paper_id: str) -> Optional[Paper]:
        """Fetch one PubMed record and convert it to a ``Paper``.

        Args:
            base_url: Root URL of the E-utilities service.
            paper_id: PubMed identifier of the record.

        Returns:
            The paper, or ``None`` when the request fails, the record has no
            title or abstract, or neither mentions autism.
        """
        try:
            response = self._make_request_with_retry(
                f"{base_url}/efetch.fcgi",
                params={
                    'db': 'pubmed',
                    'id': paper_id,
                    'retmode': 'xml'
                }
            )
            if not response:
                return None

            article = ET.fromstring(response.content).find('.//PubmedArticle')
            if article is None:
                return None

            title_node = article.find('.//ArticleTitle')
            # Structured abstracts are split over several AbstractText elements.
            abstract_nodes = article.findall('.//Abstract/AbstractText')
            year = article.find('.//PubDate/Year')
            if title_node is None or not abstract_nodes:
                return None

            # itertext() is used because ``.text`` is None (or truncated) when
            # the element contains inline markup such as <i>...</i>.
            title = self._element_text(title_node)
            abstract = " ".join(
                self._element_text(node) for node in abstract_nodes
            ).strip()
            if not title or not abstract:
                return None

            title_lower = title.lower()
            if not self._is_autism_related(title_lower, abstract.lower()):
                return None

            return Paper(
                title=title,
                abstract=abstract,
                url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
                published=year.text if year is not None and year.text else 'Unknown',
                relevance_score=1.0 if self._is_autism_related(title_lower) else 0.5,
                source='pubmed'
            )

        except Exception as e:
            logging.error("Error fetching paper %s: %s", paper_id, e)
            return None

    @staticmethod
    def _paper_from_scholar_result(result, is_relevant) -> Optional[Paper]:
        """Convert one scholarly search result to a ``Paper`` (None if off-topic)."""
        pub = result['bib']
        title = pub.get('title', '')
        abstract = pub.get('abstract', '')
        if not is_relevant(f"{title} {abstract}".lower()):
            return None

        if not abstract and 'eprint' in result:
            abstract = "Abstract not available. Please refer to the full paper."

        url = pub.get('url', '')
        if not url and 'eprint' in result:
            url = result['eprint']

        return Paper(
            title=pub.get('title', 'Untitled'),
            abstract=abstract[:1000] + '...' if len(abstract) > 1000 else abstract,
            url=url,
            published=str(pub.get('year', 'Unknown')),
            relevance_score=1.0 if 'autism' in title.lower() else 0.5,
            source='scholar'
        )

    def _iter_scholar_papers(self, query: str):
        """Yield up to SCHOLAR_MAX_PAPERS autism-related Google Scholar results."""
        search_query = query if 'autism' in query.lower() else f"autism {query}"

        # NOTE(review): confirm set_headers exists in the pinned scholarly release.
        scholarly.set_headers({'User-Agent': self._rotate_user_agent()})
        search_results = scholarly.search_pubs(search_query)

        accepted = 0
        for result in search_results:
            try:
                paper = self._paper_from_scholar_result(result, self._is_autism_related)
            except Exception as e:
                logging.error("Error processing Scholar result: %s", e)
                continue
            if paper is None:
                continue

            yield paper
            accepted += 1
            if accepted >= SCHOLAR_MAX_PAPERS:
                # Stop before pulling (and pausing for) another result.
                break
            time.sleep(random.uniform(1.0, 2.0))

    def fetch_scholar_papers(self, query: str) -> List[Paper]:
        """Fetch up to SCHOLAR_MAX_PAPERS autism-related papers from Google Scholar."""
        return self._fetch_cached("Google Scholar", query, self._iter_scholar_papers)

    def fetch_all_papers(self, query: str) -> List[Paper]:
        """Query all three sources concurrently and merge the results.

        Args:
            query: The user's search text.

        Returns:
            At most MAX_PAPERS papers, highest ``relevance_score`` first, with
            duplicate titles (ignoring case and spacing) removed. Ties keep
            the arXiv, PubMed, Scholar submission order, so the result does
            not depend on which source answers first.
        """
        all_papers = []
        fetchers = (
            self.fetch_arxiv_papers,
            self.fetch_pubmed_papers,
            self.fetch_scholar_papers,
        )

        try:
            futures = [self.executor.submit(fetch, query) for fetch in fetchers]
            for future in futures:
                try:
                    all_papers.extend(future.result())
                except Exception as e:
                    logging.error("Error collecting papers from source: %s", e)
        except Exception as e:
            logging.error("Error in concurrent paper fetching: %s", e)

        seen_titles = set()
        unique_papers = []
        for paper in sorted(all_papers, key=lambda x: x.relevance_score, reverse=True):
            title_key = " ".join(paper.title.lower().split())
            if title_key not in seen_titles:
                seen_titles.add(title_key)
                unique_papers.append(paper)

        return unique_papers[:MAX_PAPERS]
utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
utils/text_processor.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
class TextProcessor:
    """Stateless helpers that normalise paper text before it goes into a prompt."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Reduce *text* to ASCII words, basic punctuation and single spaces.

        Args:
            text: Raw text; ``None`` or an empty string is accepted.

        Returns:
            The cleaned text with surrounding whitespace removed, or an empty
            string for falsy input.
        """
        if not text:
            return ""

        # Anything outside word characters, whitespace and .,;:()-'" becomes a space.
        text = re.sub(r'[^\w\s.,;:()\-\'"]', ' ', text)
        # Drop non-ASCII characters before collapsing whitespace: removing a
        # non-ASCII word afterwards would leave two adjacent spaces behind.
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    @staticmethod
    def format_paper(title: str, abstract: str, max_length: int = 1000) -> str:
        """Format a paper as a ``Title:/Abstract:`` block ending in ``---``.

        Args:
            title: Paper title (cleaned with ``clean_text``).
            abstract: Paper abstract (cleaned with ``clean_text``).
            max_length: Longest abstract kept before it is cut and "..." is
                appended.

        Returns:
            A three-line string: title, abstract and a ``---`` separator.
        """
        title = TextProcessor.clean_text(title)
        abstract = TextProcessor.clean_text(abstract)

        if len(abstract) > max_length:
            # Clamp at zero so a tiny max_length cannot turn into a negative
            # slice index that keeps almost the whole abstract.
            keep = max(max_length - 3, 0)
            abstract = abstract[:keep] + "..."

        return f"""Title: {title}\nAbstract: {abstract}\n---"""