SERPent2 / serp /arxiv.py
Game4all's picture
Initial commit
d907837
from serp.base import SERPBackendBase, SerpResultItem
from lxml import etree
class ArxivSerpBackend(SERPBackendBase):
    """SERP backend that searches arXiv through its Atom export API."""

    @property
    def name(self) -> str:
        """Return the identifier of this backend (``"arxiv"``)."""
        return "arxiv"

    async def query(self, query, client):
        """Search arXiv for ``query`` and return one item per usable entry.

        Args:
            query: Search request. The attributes ``query``, ``n_results``
                and ``sort_by`` are read from it; a ``sort_by`` of
                ``"date"`` sorts by submission date, anything else by
                relevance.
            client: Async HTTP client whose ``get(url, params=...)`` is
                awaited; the response must expose ``raise_for_status()``
                and ``content``. (Presumably an ``httpx.AsyncClient`` --
                confirm against the caller.)

        Returns:
            A list of ``SerpResultItem``. ``href`` is the entry's id URL
            with ``/abs/`` replaced by ``/pdf/``, ``body`` is the entry
            summary and ``id`` is the entry's id URL. Entries without an
            ``<id>`` are skipped because no link can be built for them;
            a missing title or summary becomes an empty string.

        Raises:
            Whatever ``response.raise_for_status()`` raises for an
            unsuccessful response, and lxml's parse error when the body
            is not well-formed XML.
        """
        atom_ns = {'atom': 'http://www.w3.org/2005/Atom'}
        api_url = 'https://export.arxiv.org/api/query?'

        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance",
        }

        response = await client.get(api_url, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)

        def text_of(entry, tag):
            # findtext yields the default for a missing element and '' for
            # an empty one, so .strip() is always safe here.
            return (entry.findtext(tag, default='', namespaces=atom_ns) or '').strip()

        results = []
        for entry in root.findall('atom:entry', atom_ns):
            entry_id = text_of(entry, 'atom:id')
            if not entry_id:
                # Without an id there is neither an identifier nor a PDF link.
                continue

            # Titles in the feed are hard-wrapped; collapse every run of
            # whitespace (newline plus indentation) into a single space.
            title = ' '.join(text_of(entry, 'atom:title').split())
            summary = text_of(entry, 'atom:summary')
            # Derive the PDF link from the already-stripped id so the href
            # never carries stray whitespace.
            pdf_url = entry_id.replace('/abs/', '/pdf/')

            results.append(SerpResultItem(
                title=title, href=pdf_url, body=summary, id=entry_id))

        return results

    @property
    def category(self) -> str:
        """Return the category this backend belongs to (``"scholar"``)."""
        return "scholar"