import time
import logging
import random
import requests
import arxiv
import xml.etree.ElementTree as ET
from typing import List, Optional
from functools import lru_cache
from scholarly import scholarly
from concurrent.futures import ThreadPoolExecutor, as_completed
from models.paper import Paper
from utils.text_processor import TextProcessor
from bs4 import BeautifulSoup

# Constants
CACHE_SIZE = 128
MAX_PAPERS = 5
SCHOLAR_MAX_PAPERS = 3
ARXIV_MAX_PAPERS = 5
MAX_WORKERS = 3  # One thread per data source


class ResearchFetcher:
    def __init__(self):
        self.session = requests.Session()
        self._last_request_time = 0
        self._min_request_interval = 0.34
        self._max_retries = 3
        self._setup_scholarly()
        self.executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

    def __del__(self):
        """Cleanup executor on deletion"""
        self.executor.shutdown(wait=False)

    def _setup_scholarly(self):
        """Configure scholarly with basic settings"""
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        # Set up a random user agent for scholarly
        scholarly._get_page = lambda url: requests.get(
            url, headers={'User-Agent': random.choice(self.user_agents)}
        )

    def _rotate_user_agent(self):
        """Rotate user agent for Google Scholar requests"""
        return random.choice(self.user_agents)

    def _wait_for_rate_limit(self):
        """Ensure we don't exceed PubMed's rate limit"""
        current_time = time.time()
        time_since_last = current_time - self._last_request_time
        if time_since_last < self._min_request_interval:
            time.sleep(self._min_request_interval - time_since_last)
        self._last_request_time = time.time()

    def _make_request_with_retry(self, url: str, params: dict, timeout: int = 10) -> Optional[requests.Response]:
        """Make a request with retries and rate limiting"""
        for attempt in range(self._max_retries):
            try:
                self._wait_for_rate_limit()
                response = self.session.get(url, params=params, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                if isinstance(e, requests.exceptions.HTTPError) and e.response.status_code == 429:
                    wait_time = (attempt + 1) * self._min_request_interval * 2
                    logging.warning(f"Rate limit hit, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                if attempt == self._max_retries - 1:
                    logging.error(f"Error after {self._max_retries} retries: {str(e)}")
                    return None
        return None

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_arxiv_papers(self, query: str) -> List[Paper]:
        """Fetch papers from arXiv"""
        try:
            # Ensure query includes autism if not already present
            if 'autism' not in query.lower():
                search_query = f"autism {query}"
            else:
                search_query = query

            # Search arXiv
            search = arxiv.Search(
                query=search_query,
                max_results=ARXIV_MAX_PAPERS,
                sort_by=arxiv.SortCriterion.Relevance
            )

            papers = []
            for result in search.results():
                # Create Paper object
                paper = Paper(
                    title=result.title,
                    authors=', '.join([author.name for author in result.authors]),
                    abstract=result.summary,
                    url=result.pdf_url,
                    publication_date=result.published.strftime("%Y-%m-%d"),
                    relevance_score=1.0 if 'autism' in result.title.lower() else 0.8,
                    source="arXiv"
                )
                papers.append(paper)

            return papers
        except Exception as e:
            logging.error(f"Error fetching arXiv papers: {str(e)}")
            return []

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_pubmed_papers(self, query: str) -> List[Paper]:
        """Fetch papers from PubMed"""
        try:
            # Ensure query includes autism if not already present
            if 'autism' not in query.lower():
                search_query = f"autism {query}"
            else:
                search_query = query

            # Search PubMed via E-utilities, using the rate-limited retry helper
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_response = self._make_request_with_retry(
                search_url, {'db': 'pubmed', 'term': search_query, 'retmax': 5}
            )
            if search_response is None:
                return []
            search_tree = ET.fromstring(search_response.content)

            # Get IDs of papers
            id_list = search_tree.findall('.//Id')
            if not id_list:
                return []

            # Get details for each paper
            papers = []
            for id_elem in id_list:
                paper_id = id_elem.text
                details_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
                details_response = self._make_request_with_retry(
                    details_url, {'db': 'pubmed', 'id': paper_id, 'retmode': 'xml'}
                )
                if details_response is None:
                    continue
                details_tree = ET.fromstring(details_response.content)

                # Extract article data
                article = details_tree.find('.//Article')
                if article is None:
                    continue

                # Get title
                title_elem = article.find('.//ArticleTitle')
                title = title_elem.text if title_elem is not None else "No title available"

                # Get abstract
                abstract_elem = article.find('.//Abstract/AbstractText')
                abstract = abstract_elem.text if abstract_elem is not None else "No abstract available"

                # Get authors
                author_list = article.findall('.//Author')
                authors = []
                for author in author_list:
                    last_name = author.find('LastName')
                    fore_name = author.find('ForeName')
                    if last_name is not None and fore_name is not None:
                        authors.append(f"{fore_name.text} {last_name.text}")

                # Get publication date
                pub_date = article.find('.//PubDate')
                if pub_date is not None:
                    year = pub_date.find('Year')
                    month = pub_date.find('Month')
                    day = pub_date.find('Day')
                    pub_date_str = (
                        f"{year.text if year is not None else ''}-"
                        f"{month.text if month is not None else '01'}-"
                        f"{day.text if day is not None else '01'}"
                    )
                else:
                    pub_date_str = "Unknown"

                # Create Paper object
                paper = Paper(
                    title=title,
                    authors=', '.join(authors) if authors else "Unknown Authors",
                    abstract=abstract,
                    url=f"https://pubmed.ncbi.nlm.nih.gov/{paper_id}/",
                    publication_date=pub_date_str,
                    relevance_score=1.0 if 'autism' in title.lower() else 0.8,
                    source="PubMed"
                )
                papers.append(paper)

            return papers
        except Exception as e:
            logging.error(f"Error fetching PubMed papers: {str(e)}")
            return []

    @lru_cache(maxsize=CACHE_SIZE)
    def fetch_scholar_papers(self, query: str) -> List[Paper]:
        """Fetch papers from Google Scholar"""
        try:
            headers = {'User-Agent': self._rotate_user_agent()}
            encoded_query = requests.utils.quote(query)
            url = f'https://scholar.google.com/scholar?q={encoded_query}&hl=en&as_sdt=0,5'

            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                logging.error(f"Google Scholar returned status code {response.status_code}")
                return []

            # Use BeautifulSoup to parse the response
            soup = BeautifulSoup(response.text, 'html.parser')
            papers = []

            for result in soup.select('.gs_ri')[:SCHOLAR_MAX_PAPERS]:  # Limit number of Scholar results
                title_elem = result.select_one('.gs_rt')
                authors_elem = result.select_one('.gs_a')
                snippet_elem = result.select_one('.gs_rs')

                if not title_elem:
                    continue

                title = title_elem.get_text(strip=True)
                authors = authors_elem.get_text(strip=True) if authors_elem else "Unknown Authors"
                abstract = snippet_elem.get_text(strip=True) if snippet_elem else ""
                link_elem = title_elem.find('a')
                paper_url = link_elem['href'] if link_elem else ""

                paper = Paper(
                    title=title,
                    authors=authors,
                    abstract=abstract,
                    url=paper_url,
                    publication_date="",  # Date not easily available
                    relevance_score=0.8,  # Default score
source="Google Scholar" ) papers.append(paper) return papers except Exception as e: logging.error(f"Error fetching Google Scholar papers: {str(e)}") return [] def fetch_all_papers(self, query: str) -> List[Paper]: """Fetch papers from all sources concurrently and combine results""" all_papers = [] futures = [] # Submit tasks to thread pool try: futures.append(self.executor.submit(self.fetch_arxiv_papers, query)) futures.append(self.executor.submit(self.fetch_pubmed_papers, query)) futures.append(self.executor.submit(self.fetch_scholar_papers, query)) # Collect results as they complete for future in as_completed(futures): try: papers = future.result() all_papers.extend(papers) except Exception as e: logging.error(f"Error collecting papers from source: {str(e)}") except Exception as e: logging.error(f"Error in concurrent paper fetching: {str(e)}") # Sort and deduplicate papers seen_titles = set() unique_papers = [] for paper in sorted(all_papers, key=lambda x: x.relevance_score, reverse=True): title_key = paper.title.lower() if title_key not in seen_titles: seen_titles.add(title_key) unique_papers.append(paper) return unique_papers[:MAX_PAPERS]