Spaces:
Sleeping
Sleeping
File size: 6,322 Bytes
fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 0841c28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import pathlib
import re
import feedparser
from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import download
class ArxivPaperList(SearchAPI):
    """Paper list backed by the arXiv export API.

    On construction the query is sent to arXiv (unless a cached response is
    reused), the Atom response is stored at ``cache_filepath``, and every feed
    entry is converted into a ``Paper`` that is appended to ``self.papers``
    (``self.papers`` is presumably created by ``SearchAPI.__init__`` -- confirm).

    Inputs:
        cache_filepath: Filepath where the raw API response is saved
        use_cache: reuse the cached file if `True` and the file exists
        raw: Raw api query, e.g. `cat:cs.CL AND ti:event`.
            If set, the field arguments below are ignored
        title: Title string to search for
        author: Author string
        abstract: Abstract string
        comment: Comment string
        category: arXiv category, e.g. "cs.CL"
        max_results: Maximal number of returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
        sort_order: `descending` (default) or `ascending`

    Doc:
        prefix explanation
            - ti    Title
            - au    Author
            - abs   Abstract
            - co    Comment
            - jr    Journal Reference
            - cat   Subject Category
            - rn    Report Number
            - id    Id (use id_list instead)
            - all   All of the above
        logics:
            - AND
            - OR
            - ANDNOT
        symbol encoding explanation
            - ( )            %28 %29   Used to group Boolean expressions for
                                       Boolean operator precedence.
            - double quotes  %22 %22   Used to group multiple words into
                                       phrases to search a particular field.
            - space          +         Used to extend a search_query to
                                       include multiple fields.
        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

    References:
        https://arxiv.org/help/api/user-manual#title_id_published_updated
    """

    API_URL = "https://export.arxiv.org/api/query?search_query="

    # Collapses any run of whitespace (newlines included) into a single space.
    _WHITESPACE_RE = re.compile(r"\s+")
    # Characters replaced by their encoded form before the query is put into
    # the request URL (see "symbol encoding explanation" in the class doc).
    _QUERY_ENCODING = ((" ", "+"), ("(", "%28"), (")", "%29"), ('"', "%22"))

    def __init__(
        self,
        cache_filepath: str | pathlib.Path,
        use_cache: bool = False,
        raw: str = "",
        title: str = "",
        author: str = "",
        abstract: str = "",
        comment: str = "",
        category: str = "cs.CL",
        max_results: int = 5000,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> None:
        super().__init__()

        cache_filepath = pathlib.Path(cache_filepath)

        # Hit the API unless the caller asked for the cache AND it exists.
        if not (use_cache and cache_filepath.exists()):
            cache_filepath.parent.mkdir(parents=True, exist_ok=True)

            if raw:
                query = raw
            else:
                fields = (
                    ("ti", title),
                    ("au", author),
                    ("abs", abstract),
                    ("co", comment),
                    ("cat", category),
                )
                # Blank / whitespace-only values are skipped so that no
                # dangling prefix such as "ti:" ends up in the query.
                query = " AND ".join(
                    f"{prefix}:{value.strip()}"
                    for prefix, value in fields
                    if value and value.strip()
                )

            query = query.strip()
            for char, encoded in self._QUERY_ENCODING:
                query = query.replace(char, encoded)

            url = (
                f"{self.API_URL}{query}&start=0&max_results={max_results}"
                f"&sortBy={sort_by}&sortOrder={sort_order}"
            )
            # Project helper; the code below expects the response to be
            # stored at `cache_filepath` once it returns.
            download(url, cache_filepath)

        # read_text opens and closes the file, so no handle is leaked.
        feed = feedparser.parse(cache_filepath.read_text(encoding="utf8"))
        for entry in feed.entries:
            self.papers.append(self._entry_to_paper(entry, sort_by))

    @classmethod
    def _entry_to_paper(cls, entry, sort_by: str) -> Paper:
        """Convert one parsed feed entry into a ``Paper``.

        Args:
            entry: An item of ``feedparser.parse(...).entries``.
            sort_by: The sort key used for the request; it decides whether
                the published date or the updated date is reported.

        Returns:
            A ``Paper`` built from title, authors, abstract, url, doi,
            category terms, year and month (in that positional order).
        """
        authors = ""
        if hasattr(entry, "authors"):
            authors = " , ".join(person.name for person in entry.authors)

        links = entry.get("links", [])
        url = ""
        doi = ""
        for link in links:
            if link.rel == "alternate":
                url = link.href
            if "doi" in link.href:
                doi = link.href
        # No "alternate" link: fall back to the first link, if there is one.
        if not url and links:
            url = links[0].href

        if sort_by == "submittedDate":
            date = entry.published_parsed
        else:
            date = entry.updated_parsed

        return Paper(
            cls._WHITESPACE_RE.sub(" ", entry.title).strip(),
            authors,
            cls._WHITESPACE_RE.sub(" ", entry.summary).strip(),
            url,
            doi,
            " , ".join(tag["term"] for tag in entry.get("tags", [])),
            str(date.tm_year),
            str(date.tm_mon),
        )

    @staticmethod
    def build_logic_string(req: list[list[str]]) -> str:
        """Render a list of AND-groups as one boolean expression.

        Every inner list is joined with ``AND`` and wrapped in parentheses;
        the groups are then joined with ``OR``, e.g.
        ``[["a", "b"], ["c"]]`` -> ``"(a AND b) OR (c)"``.

        Empty groups are skipped (they would render as ``()``); an empty or
        all-empty ``req`` yields ``""``.
        """
        if not req:
            return ""
        groups = [f"({' AND '.join(terms)})" for terms in req if terms]
        return " OR ".join(groups)

    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 5000
    ) -> "ArxivPaperList":
        """Create an instance from a structured ``query`` dict.

        Reads the keys ``"title"``, ``"author"`` and ``"abstract"`` (each a
        list of AND-groups, see :meth:`build_logic_string`) and ``"venue"``,
        whose first element is used as the arXiv category. The API is always
        queried (``use_cache=False``).
        """
        ti_string = cls.build_logic_string(query.get("title", []))
        au_string = cls.build_logic_string(query.get("author", []))
        abs_string = cls.build_logic_string(query.get("abstract", []))

        # only subject category is used when caching
        venue = query.get("venue", [])
        cat_string = venue[0] if venue else ""

        return cls(
            cache_filepath,
            use_cache=False,
            title=ti_string,
            author=au_string,
            abstract=abs_string,
            category=cat_string,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = -1
    ) -> list[Paper]:
        """Fetch papers for ``query`` and filter them with ``search``.

        Args:
            cache_filepath: Where the raw API response is stored.
            query: Structured query, see :meth:`build_paper_list`.
            max_results: Maximal number of papers to return. A negative
                value (the default) or ``None`` means "no limit": the request
                then uses the default fetch size of :meth:`build_paper_list`
                and every match is returned.

        Returns:
            The matching papers, truncated to ``max_results`` if it is
            non-negative.
        """
        if max_results is None or max_results < 0:
            # A negative value must neither be sent to the API nor be used
            # as a slice bound ([:-1] would silently drop the last paper).
            obj = cls.build_paper_list(cache_filepath, query)
            return obj.search(query)

        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]
|