import time
import logging
import random
import requests
import arxiv
import xml.etree.ElementTree as ET
from typing import List, Optional
from functools import lru_cache
from scholarly import scholarly
from concurrent.futures import ThreadPoolExecutor, as_completed
from models.paper import Paper
from utils.text_processor import TextProcessor
from bs4 import BeautifulSoup

# Constants
CACHE_SIZE = 128
MAX_PAPERS = 5
SCHOLAR_MAX_PAPERS = 3
ARXIV_MAX_PAPERS = 5
MAX_WORKERS = 3  # One thread per data source

class ResearchFetcher:
    def __init__(self):
        self.session = requests.Session()
        self._last_request_time = 0
        self._min_request_interval = 0.34  # ~3 requests/second, the NCBI E-utilities limit without an API key
        self._max_retries = 3
        self._setup_scholarly()
        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    
    def __del__(self):
        """Cleanup executor on deletion"""
        self.executor.shutdown(wait=False)
    
    def _setup_scholarly(self):
        """Configure scholarly with basic settings"""
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        # Patch scholarly's page fetcher to rotate user agents; _get_page is a
        # private hook and may be absent in newer scholarly releases.
        scholarly._get_page = lambda url: requests.get(
            url, headers={'User-Agent': random.choice(self.user_agents)})
    
    def _rotate_user_agent(self):
        """Rotate user agent for Google Scholar requests"""
        return random.choice(self.user_agents)

    def _wait_for_rate_limit(self):
        """Ensure we don't exceed PubMed's rate limit"""
        current_time = time.time()
        time_since_last = current_time - self._last_request_time
        if time_since_last < self._min_request_interval:
            time.sleep(self._min_request_interval - time_since_last)
        self._last_request_time = time.time()

    def _make_request_with_retry(self, url: str, params: dict, timeout: int = 10) -> Optional[requests.Response]:
        """Make a request with retries and rate limiting"""
        for attempt in range(self._max_retries):
            try:
                self._wait_for_rate_limit()
                response = self.session.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code == 429:
                    wait_time = (attempt + 1) * self._min_request_interval * 2
                    logging.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                if attempt == self._max_retries - 1:
                    logging.error(f"Error after {self._max_retries} retries: {str(e)}")
                    return None
        return None

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
        """Fetch papers from arXiv"""
        try:
            # Ensure query includes autism if not already present
            if 'autism' not in query.lower():
                search_query = f"autism {query}"
            else:
                search_query = query

            # Search arXiv
            search = arxiv.Search(
                query=search_query,
                max_results=ARXIV_MAX_PAPERS,
                sort_by=arxiv.SortCriterion.Relevance
            )

            papers = []
            for result in search.results():
                # Create Paper object
                paper = Paper(
                    title=result.title,
                    authors=', '.join([author.name for author in result.authors]),
                    abstract=result.summary,
                    url=result.pdf_url,
                    publication_date=result.published.strftime("%Y-%m-%d"),
                    relevance_score=1.0 if 'autism' in result.title.lower() else 0.8,
                    source="arXiv"
                )
                papers.append(paper)

            return papers

        except Exception as e:
            logging.error(f"Error fetching arXiv papers: {str(e)}")
            return []

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
        """Fetch papers from PubMed"""
        try:
            # Ensure query includes autism if not already present
            if 'autism' not in query.lower():
                search_query = f"autism {query}"
            else:
                search_query = query

            # Search PubMed via E-utilities (rate limited, with retries)
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_response = self._make_request_with_retry(
                search_url, {"db": "pubmed", "term": search_query, "retmax": 5}
            )
            if search_response is None:
                return []
            search_tree = ET.fromstring(search_response.content)
            
            # Get IDs of papers
            id_list = search_tree.findall('.//Id')
            if not id_list:
                return []
            
            # Get details for each paper
            papers = []
            for id_elem in id_list:
                paper_id = id_elem.text
                details_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
                details_response = self._make_request_with_retry(
                    details_url, {"db": "pubmed", "id": paper_id, "retmode": "xml"}
                )
                if details_response is None:
                    continue
                details_tree = ET.fromstring(details_response.content)
                
                # Extract article data
                article = details_tree.find('.//Article')
                if article is None:
                    continue
                
                # Get title
                title_elem = article.find('.//ArticleTitle')
                title = title_elem.text if title_elem is not None else "No title available"
                
                # Get abstract
                abstract_elem = article.find('.//Abstract/AbstractText')
                abstract = abstract_elem.text if abstract_elem is not None else "No abstract available"
                
                # Get authors
                author_list = article.findall('.//Author')
                authors = []
                for author in author_list:
                    last_name = author.find('LastName')
                    fore_name = author.find('ForeName')
                    if last_name is not None and fore_name is not None:
                        authors.append(f"{fore_name.text} {last_name.text}")
                
                # Get publication date
                pub_date = article.find('.//PubDate')
                if pub_date is not None:
                    year = pub_date.find('Year')
                    month = pub_date.find('Month')
                    day = pub_date.find('Day')
                    pub_date_str = f"{year.text if year is not None else ''}-{month.text if month is not None else '01'}-{day.text if day is not None else '01'}"
                else:
                    pub_date_str = "Unknown"
                
                # Create Paper object
                paper = Paper(
                    title=title,
                    authors=', '.join(authors) if authors else "Unknown Authors",
                    abstract=abstract,
                    url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
                    publication_date=pub_date_str,
                    relevance_score=1.0 if 'autism' in title.lower() else 0.8,
                    source="PubMed"
                )
                papers.append(paper)
            
            return papers
            
        except Exception as e:
            logging.error(f"Error fetching PubMed papers: {str(e)}")
            return []

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_scholar_papers(self, query: str) -> List[Paper]:
        """
        Fetch papers from Google Scholar
        """
        try:
            headers = {'User-Agent': self._rotate_user_agent()}
            encoded_query = requests.utils.quote(query)
            url = f'https://scholar.google.com/scholar?q={encoded_query}&hl=en&as_sdt=0,5'
            
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                logging.error(f"Google Scholar returned status code {response.status_code}")
                return []

            # Use BeautifulSoup to parse the response
            soup = BeautifulSoup(response.text, 'html.parser')
            
            papers = []
            for result in soup.select('.gs_ri')[:SCHOLAR_MAX_PAPERS]:  # Limit results per module constant
                title_elem = result.select_one('.gs_rt')
                authors_elem = result.select_one('.gs_a')
                snippet_elem = result.select_one('.gs_rs')
                
                if not title_elem:
                    continue
                    
                title = title_elem.get_text(strip=True)
                authors = authors_elem.get_text(strip=True) if authors_elem else "Unknown Authors"
                abstract = snippet_elem.get_text(strip=True) if snippet_elem else ""
                link = title_elem.find('a')
                url = link.get('href', '') if link else ""
                
                paper = Paper(
                    title=title,
                    authors=authors,
                    abstract=abstract,
                    url=url,
                    publication_date="",  # Date not easily available
                    relevance_score=0.8,  # Default score
                    source="Google Scholar"
                )
                papers.append(paper)
            
            return papers
            
        except Exception as e:
            logging.error(f"Error fetching Google Scholar papers: {str(e)}")
            return []

    def fetch_all_papers(self, query: str) -> List[Paper]:
        """Fetch papers from all sources concurrently and combine results"""
        all_papers = []
        futures = []

        # Submit tasks to thread pool
        try:
            futures.append(self.executor.submit(self.fetch_arxiv_papers, query))
            futures.append(self.executor.submit(self.fetch_pubmed_papers, query))
            futures.append(self.executor.submit(self.fetch_scholar_papers, query))

            # Collect results as they complete
            for future in as_completed(futures):
                try:
                    papers = future.result()
                    all_papers.extend(papers)
                except Exception as e:
                    logging.error(f"Error collecting papers from source: {str(e)}")
        except Exception as e:
            logging.error(f"Error in concurrent paper fetching: {str(e)}")

        # Sort and deduplicate papers
        seen_titles = set()
        unique_papers = []
        
        for paper in sorted(all_papers, key=lambda x: x.relevance_score, reverse=True):
            title_key = paper.title.lower()
            if title_key not in seen_titles:
                seen_titles.add(title_key)
                unique_papers.append(paper)
        
        return unique_papers[:MAX_PAPERS]
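

# Illustrative usage (a minimal sketch; assumes the models.paper and
# utils.text_processor modules from this project are on the import path,
# and that the example query string is arbitrary):
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fetcher = ResearchFetcher()
    # Fetch from arXiv, PubMed, and Google Scholar concurrently, then print
    # the deduplicated, relevance-sorted results.
    for paper in fetcher.fetch_all_papers("sensory processing"):
        print(f"[{paper.source}] {paper.title} ({paper.publication_date})")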