File size: 1,565 Bytes
d907837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from serp.base import SERPBackendBase, SerpResultItem
from lxml import etree


class ArxivSerpBackend(SERPBackendBase):
    """Search backend that queries the arXiv Atom API.

    Each ``atom:entry`` of the response feed is converted into a
    ``SerpResultItem`` whose ``href`` points at the PDF version of the paper.
    """

    # Namespace map used for every Atom element lookup.
    _ATOM_NAMESPACE = {'atom': 'http://www.w3.org/2005/Atom'}
    # Query endpoint; the search parameters are passed separately via ``params``.
    _ARXIV_API_URL = 'https://export.arxiv.org/api/query?'

    @property
    def name(self):
        """Return the identifier of this backend (``"arxiv"``)."""
        return "arxiv"

    @classmethod
    def _child_text(cls, entry, tag):
        """Return the stripped text of the ``atom:<tag>`` child of *entry*.

        Returns an empty string when the child element is missing or has no
        text, so a single malformed entry cannot raise ``AttributeError``.
        """
        node = entry.find('atom:' + tag, cls._ATOM_NAMESPACE)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    async def query(self, query, client):
        """Search arXiv for *query* and return the matching papers.

        Args:
            query: Object exposing ``query`` (the search expression),
                ``n_results`` (maximum number of results) and ``sort_by``
                (``"date"`` sorts by submission date; any other value sorts
                by relevance).
            client: Async HTTP client whose awaited ``get(url, params=...)``
                returns a response offering ``raise_for_status()`` and
                ``content`` (presumably an httpx-style client; confirm
                against the caller).

        Returns:
            A list of ``SerpResultItem``, one per feed entry that carries an
            ``<id>``. ``href`` is the entry id with ``/abs/`` replaced by
            ``/pdf/``, ``body`` is the summary, and ``title`` has all runs of
            whitespace collapsed to single spaces.

        Raises:
            Whatever ``response.raise_for_status()`` raises for a failed
            request, and lxml's parse error if the body is not valid XML.
        """
        search_params = {
            'search_query': query.query,
            'start': 0,
            'max_results': query.n_results,
            'sortBy': "submittedDate" if query.sort_by == "date" else "relevance",
        }

        response = await client.get(self._ARXIV_API_URL, params=search_params)
        response.raise_for_status()

        root = etree.fromstring(response.content)

        results = []
        for entry in root.findall('atom:entry', self._ATOM_NAMESPACE):
            entry_id = self._child_text(entry, 'id')
            if not entry_id:
                # Without an id there is nothing to link to, so skip the entry
                # rather than failing the whole search.
                continue

            # Titles may be wrapped across lines in the feed; collapse every
            # whitespace run so wrapped titles read as one clean line.
            title = " ".join(self._child_text(entry, 'title').split())

            results.append(SerpResultItem(
                title=title,
                # Derived from the stripped id so no stray whitespace leaks
                # into the URL.
                href=entry_id.replace('/abs/', '/pdf/'),
                body=self._child_text(entry, 'summary'),
                id=entry_id,
            ))

        return results

    @property
    def category(self):
        """Return the category this backend belongs to (``"scholar"``)."""
        return "scholar"