|
from serp.base import SERPBackendBase, SerpResultItem |
|
from lxml import etree |
|
|
|
|
|
class ArxivSerpBackend(SERPBackendBase):
    """Search backend that queries the arXiv export API and parses its Atom feed."""

    # Namespace map used to address Atom elements in the API response.
    ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
    # Endpoint the search request is sent to; query parameters are passed
    # separately via ``params`` so the client takes care of URL encoding.
    ARXIV_API_URL = 'https://export.arxiv.org/api/query?'

    @property
    def name(self) -> str:
        """Return the identifier of this backend."""
        return "arxiv"

    @property
    def category(self) -> str:
        """Return the category this backend belongs to."""
        return "scholar"

    async def query(self, query, client):
        """Search arXiv and return one ``SerpResultItem`` per feed entry.

        Args:
            query: Request object; its ``query``, ``n_results`` and ``sort_by``
                attributes are read. ``sort_by == "date"`` sorts by submission
                date, any other value sorts by relevance.
            client: Async HTTP client whose ``get(url, params=...)`` is awaited
                and whose response exposes ``raise_for_status()`` and
                ``content`` (presumably an httpx-style client; confirm against
                the caller).

        Returns:
            A list of ``SerpResultItem`` with ``title``, ``href`` (the PDF URL
            derived from the entry id), ``body`` (the summary) and ``id``.
            Entries without an ``<id>`` are skipped because no link can be
            built for them.

        Raises:
            Whatever ``response.raise_for_status()`` raises on an HTTP error,
            and ``lxml.etree.XMLSyntaxError`` if the body is not well-formed XML.
        """
        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance",
        }

        response = await client.get(self.ARXIV_API_URL, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)

        results = []
        for entry in root.findall('atom:entry', self.ATOM_NAMESPACE):
            entry_id = self._entry_text(entry, 'atom:id')
            if not entry_id:
                # Without an id there is neither an identifier nor a URL.
                continue

            # Atom titles are hard-wrapped; collapse every whitespace run
            # (newline plus indentation) into a single space.
            title = " ".join(self._entry_text(entry, 'atom:title').split())
            summary = self._entry_text(entry, 'atom:summary')
            # The entry id is the abstract page URL; the PDF lives at the
            # same path under /pdf/.
            pdf_url = entry_id.replace('/abs/', '/pdf/')

            results.append(SerpResultItem(
                title=title, href=pdf_url, body=summary, id=entry_id))

        return results

    @classmethod
    def _entry_text(cls, entry, path):
        """Return the stripped text of the first ``path`` match, or ``''`` if absent or empty."""
        text = entry.findtext(path, default='', namespaces=cls.ATOM_NAMESPACE)
        return (text or '').strip()
|
|