from serp.base import SERPBackendBase, SerpResultItem
from lxml import etree

# XML namespace map used to address Atom elements in the arXiv API feed.
_ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
# Endpoint of the arXiv export API; search parameters are passed separately.
_ARXIV_API_URL = 'https://export.arxiv.org/api/query?'


def _entry_text(entry, tag):
    """Return the stripped text of the Atom child ``tag`` of ``entry``.

    Yields an empty string when the child is missing or has no text, so a
    single malformed entry cannot raise ``AttributeError``.
    """
    return entry.findtext(
        f'atom:{tag}', default='', namespaces=_ATOM_NAMESPACE).strip()


class ArxivSerpBackend(SERPBackendBase):
    """SERP backend that searches the arXiv export API (Atom feed)."""

    @property
    def name(self):
        """Identifier of this backend."""
        return "arxiv"

    async def query(self, query, client):
        """Search arXiv for ``query`` and return the matching papers.

        Args:
            query: Search request; this method reads ``query.query`` (the
                search expression), ``query.n_results`` (maximum number of
                results) and ``query.sort_by`` (``"date"`` sorts by
                submission date, anything else by relevance).
            client: Async HTTP client exposing ``await client.get(url,
                params=...)`` whose response offers ``raise_for_status()``
                and ``content`` (presumably an httpx-style client; verify
                against the caller).

        Returns:
            A list of ``SerpResultItem``, one per feed entry that carries an
            id, with the title, the PDF URL as ``href``, the abstract as
            ``body`` and the arXiv abstract URL as ``id``.

        Raises:
            Whatever ``response.raise_for_status()`` raises on an HTTP error
            status, and lxml's parse error if the body is not valid XML.
        """
        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance",
        }
        response = await client.get(_ARXIV_API_URL, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)
        results = []
        for entry in root.findall('atom:entry', _ATOM_NAMESPACE):
            entry_id = _entry_text(entry, 'id')
            if not entry_id:
                # Without an id there is no link to offer; skip the entry
                # rather than failing the whole query.
                continue
            # Titles are hard-wrapped in the feed; collapse every run of
            # whitespace (newlines plus indentation) into a single space.
            title = ' '.join(_entry_text(entry, 'title').split())
            # The id is the abstract page URL; the PDF lives at the same
            # path under /pdf/. Derive it from the stripped id so the href
            # never carries stray whitespace.
            pdf_url = entry_id.replace('/abs/', '/pdf/')
            summary = _entry_text(entry, 'summary')
            results.append(SerpResultItem(
                title=title, href=pdf_url, body=summary, id=entry_id))
        return results

    @property
    def category(self):
        """Category under which this backend's results are grouped."""
        return "scholar"