File size: 6,322 Bytes
fe3c056
348017a
fe3c056
 
 
 
348017a
fe3c056
 
 
 
 
 
 
348017a
fe3c056
348017a
 
 
 
 
fe3c056
348017a
fe3c056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348017a
fe3c056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348017a
fe3c056
 
 
 
 
 
 
 
 
348017a
fe3c056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348017a
 
 
fe3c056
 
 
 
 
 
348017a
fe3c056
 
 
 
0841c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import pathlib
import re

import feedparser

from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import download


class ArxivPaperList(SearchAPI):
    """Paper list populated from the arXiv export API.

    On construction the Atom feed for the query is downloaded to
    ``cache_filepath`` (unless a cached copy is requested and present), parsed
    with ``feedparser`` and every entry is appended to ``self.papers`` as a
    ``Paper``.

    Inputs:
        cache_filepath: Filepath to save the downloaded feed to
        use_cache: reuse the cached file if `True` and the file exists
        raw: Raw api query, e.g. `cat:cs.CL AND ti:event`. If set, the
            field arguments below are ignored
        title: Title string to search for
        author: Author string
        abstract: Abstract string
        comment: Comment string
        category: arXiv category, e.g. "cs.CL"
        max_results: Maximal number of returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
        sort_order: `descending` (default) or `ascending`

    Doc:
        prefix  explanation
        - ti    Title
        - au    Author
        - abs   Abstract
        - co    Comment
        - jr    Journal Reference
        - cat   Subject Category
        - rn    Report Number
        - id    Id (use id_list instead)
        - all   All of the above

        logics:
        - AND
        - OR
        - ANDNOT

        symbol          encoding  explanation
        - ( )           %28 %29   Used to group Boolean expressions for Boolean operator precedence.
        - double quotes %22 %22   Used to group multiple words into phrases to search a particular field.
        - space         +         Used to extend a search_query to include multiple fields.

        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

    References:
        https://arxiv.org/help/api/user-manual#title_id_published_updated
    """

    API_URL = "https://export.arxiv.org/api/query?search_query="

    # Collapses any run of whitespace (including newlines) in feed text.
    _WHITESPACE = re.compile(r"\s+")

    def __init__(
        self,
        cache_filepath: str | pathlib.Path,
        use_cache: bool = False,
        raw: str = "",
        title: str = "",
        author: str = "",
        abstract: str = "",
        comment: str = "",
        category: str = "cs.CL",
        max_results: int = 5000,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> None:
        super().__init__()

        cache_filepath = pathlib.Path(cache_filepath)

        # Download unless the caller asked for the cache AND it is present.
        if not (use_cache and cache_filepath.exists()):
            cache_filepath.parent.mkdir(parents=True, exist_ok=True)

            if raw:
                query = raw
            else:
                query = self._compose_field_query(
                    title, author, abstract, comment, category
                )

            url = (
                f"{self.API_URL}{self._encode_query(query)}"
                f"&start=0&max_results={max_results}"
                f"&sortBy={sort_by}&sortOrder={sort_order}"
            )
            download(url, cache_filepath)

        # read_text closes the file handle; the previous open().read() leaked it.
        feed = feedparser.parse(cache_filepath.read_text(encoding="utf8"))
        for entry in feed.entries:
            self.papers.append(self._paper_from_entry(entry, sort_by))

    @staticmethod
    def _compose_field_query(
        title: str, author: str, abstract: str, comment: str, category: str
    ) -> str:
        """Join the non-empty search fields into `prefix:value AND ...`."""
        fields = (
            ("ti", title),
            ("au", author),
            ("abs", abstract),
            ("co", comment),
            ("cat", category),
        )
        return " AND ".join(
            f"{prefix}:{value.strip()}" for prefix, value in fields if value
        )

    @staticmethod
    def _encode_query(query: str) -> str:
        """Apply the arXiv encodings for spaces, parentheses and double quotes."""
        encoded = query.strip().replace(" ", "+")
        encoded = encoded.replace("(", "%28")
        encoded = encoded.replace(")", "%29")
        return encoded.replace('"', "%22")

    @classmethod
    def _paper_from_entry(cls, entry, sort_by: str) -> Paper:
        """Convert one parsed feed entry into a ``Paper``."""
        authors = ""
        if hasattr(entry, "authors"):
            authors = " , ".join(person.name for person in entry.authors)

        # Prefer the `alternate` link as the paper URL; any link whose href
        # mentions "doi" is recorded as the DOI (last match wins).
        links = getattr(entry, "links", [])
        url = ""
        doi = ""
        for link in links:
            if link.rel == "alternate":
                url = link.href
            if "doi" in link.href:
                doi = link.href
        if not url and links:
            # No alternate link: fall back to the first one, if there is one.
            url = links[0].href

        # The reported year/month follows the date the feed was sorted by.
        if sort_by == "submittedDate":
            date = entry.published_parsed
        else:
            date = entry.updated_parsed

        clean_title = cls._WHITESPACE.sub(" ", entry.title).strip()
        clean_abstract = cls._WHITESPACE.sub(" ", entry.summary).strip()
        # Entries without tags yield an empty category string instead of raising.
        categories = " , ".join(t["term"] for t in getattr(entry, "tags", []))

        return Paper(
            clean_title,
            authors,
            clean_abstract,
            url,
            doi,
            categories,
            str(date.tm_year),
            str(date.tm_mon),
        )

    @staticmethod
    def build_logic_string(req: list[list[str]]) -> str:
        """Render an OR-of-ANDs requirement as an arXiv boolean expression.

        Each inner list is joined with ` AND ` and wrapped in parentheses; the
        groups are then joined with ` OR `. An empty `req` gives `""`.
        """
        if not req:
            return ""

        return " OR ".join(f"({' AND '.join(and_strs)})" for and_strs in req)

    @classmethod
    def build_paper_list(
        cls,
        cache_filepath: str | pathlib.Path,
        query: dict,
        max_results: int = 5000,
    ) -> "ArxivPaperList":
        """Build a freshly downloaded paper list from a structured query dict.

        Args:
            cache_filepath: Where the downloaded feed is written.
            query: Dict with optional keys `title`, `author`, `abstract`
                (each a list of AND-groups, see `build_logic_string`) and
                `venue` (a list whose first item is used as the category).
            max_results: Maximal number of papers requested from the API.

        Returns:
            A new instance; the cache is never reused (`use_cache=False`).
        """
        ti_string = cls.build_logic_string(query.get("title", []))
        au_string = cls.build_logic_string(query.get("author", []))
        abs_string = cls.build_logic_string(query.get("abstract", []))

        # Only the first venue is used, as the arXiv subject category.
        venue = query.get("venue", [])
        cat_string = venue[0] if venue else ""

        return cls(
            cache_filepath,
            use_cache=False,
            title=ti_string,
            author=au_string,
            abstract=abs_string,
            category=cat_string,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls,
        cache_filepath: str | pathlib.Path,
        query: dict,
        max_results: int = -1,
    ) -> list[Paper]:
        """Download papers for `query`, then filter them with `search`.

        Args:
            cache_filepath: Where the downloaded feed is written.
            query: Structured query dict (see `build_paper_list`).
            max_results: Maximal number of papers to fetch and return. A
                negative value (the default) means "no explicit limit": the
                fetch uses `build_paper_list`'s default and every match is
                returned.

        Returns:
            The matching papers, at most `max_results` when it is non-negative.
        """
        if max_results < 0:
            # Slicing with a negative bound would drop results from the end
            # (`[:-1]` loses the last hit), and arXiv rejects a negative
            # `max_results`, so treat it as "unlimited" instead.
            obj = cls.build_paper_list(cache_filepath, query)
            return obj.search(query)

        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]