Spaces:
Sleeping
Sleeping
File size: 4,909 Bytes
fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 348017a fe3c056 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import pathlib
import re
import feedparser
from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import download
class ArxivPaperList(SearchAPI):
    """Search the arXiv API and collect the results as ``Paper`` objects.

    The API response (an Atom feed) is written to ``cache_filepath`` via
    ``download`` and then parsed with ``feedparser``; every feed entry is
    converted to a ``Paper`` and appended to ``self.papers``.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: will use cached file if `True` (and the file exists)
        raw: Raw api query, e.g. `cat:cs.CL AND ti:event`.
            If set, the other field arguments are ignored
        title: String of title you want to search
        author: Author string
        abstract: Abstract string
        comment: Comment string
        category: arXiv category, e.g. "cs.CL"
        max_results: Maximal returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
        sort_order: `descending` (default) or `ascending`

    Doc:
        prefix explanation
            - ti    Title
            - au    Author
            - abs   Abstract
            - co    Comment
            - jr    Journal Reference
            - cat   Subject Category
            - rn    Report Number
            - id    Id (use id_list instead)
            - all   All of the above
        logics:
            - AND
            - OR
            - ANDNOT
        symbol encoding explanation
            - ( )            %28 %29   Used to group Boolean expressions
                                       for Boolean operator precedence.
            - double quotes  %22 %22   Used to group multiple words into
                                       phrases to search a particular field.
            - space          +         Used to extend a search_query to
                                       include multiple fields.
        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

    References:
        https://arxiv.org/help/api/user-manual#title_id_published_updated
    """

    API_URL = "https://export.arxiv.org/api/query?search_query="

    # Single-pass encoding of the characters listed under "symbol encoding
    # explanation" in the class docstring.
    _QUERY_ENCODING = str.maketrans(
        {" ": "+", "(": "%28", ")": "%29", '"': "%22"}
    )
    # Any run of whitespace (including newlines) inside titles/abstracts.
    _WHITESPACE = re.compile(r"\s+")

    def __init__(
        self,
        cache_filepath: str | pathlib.Path,
        use_cache: bool = False,
        raw: str = "",
        title: str = "",
        author: str = "",
        abstract: str = "",
        comment: str = "",
        category: str = "cs.CL",
        max_results: int = 5000,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> None:
        """Fetch (or load from cache) the query result and fill ``self.papers``.

        See the class docstring for the meaning of each argument.
        """
        super().__init__()
        cache_filepath = pathlib.Path(cache_filepath)

        # Hit the network unless the caller asked for the cache AND it exists.
        if not (use_cache and cache_filepath.exists()):
            cache_filepath.parent.mkdir(parents=True, exist_ok=True)
            query = self._build_query(
                raw=raw,
                title=title,
                author=author,
                abstract=abstract,
                comment=comment,
                category=category,
            )
            request_url = (
                f"{self.API_URL}{query}&start=0&max_results={max_results}"
                f"&sortBy={sort_by}&sortOrder={sort_order}"
            )
            download(request_url, cache_filepath)

        # read_text() closes the file handle, unlike a bare open().read().
        feed = feedparser.parse(cache_filepath.read_text(encoding="utf8"))

        # The reported year/month follows the sort key: submission date when
        # sorting by `submittedDate`, last-update date otherwise.
        use_published = sort_by == "submittedDate"
        for entry in feed.entries:
            self.papers.append(self._entry_to_paper(entry, use_published))

    @classmethod
    def _build_query(
        cls,
        raw: str,
        title: str,
        author: str,
        abstract: str,
        comment: str,
        category: str,
    ) -> str:
        """Return the encoded ``search_query`` value for the given fields."""
        if raw:
            query = raw
        else:
            prefixed_fields = (
                ("ti", title),
                ("au", author),
                ("abs", abstract),
                ("co", comment),
                ("cat", category),
            )
            # Only non-empty fields contribute a clause; clauses are AND-ed.
            query = " AND ".join(
                f"{prefix}:{value.strip()}"
                for prefix, value in prefixed_fields
                if value
            )
        return query.strip().translate(cls._QUERY_ENCODING)

    @classmethod
    def _collapse_whitespace(cls, text: str) -> str:
        """Replace every whitespace run in ``text`` with one space and trim."""
        return cls._WHITESPACE.sub(" ", text).strip()

    @classmethod
    def _entry_to_paper(cls, entry, use_published: bool) -> "Paper":
        """Convert one parsed feed entry into a ``Paper``."""
        authors = ""
        if hasattr(entry, "authors"):
            authors = " , ".join(person.name for person in entry.authors)

        # Entries without links/tags must not abort the whole listing.
        links = getattr(entry, "links", None) or []
        tags = getattr(entry, "tags", None) or []

        paper_url = ""
        doi = ""
        for link in links:
            if link.rel == "alternate":
                paper_url = link.href
            if "doi" in link.href:
                doi = link.href
        if not paper_url and links:
            # No "alternate" link: fall back to the first link available.
            paper_url = links[0].href

        date = entry.published_parsed if use_published else entry.updated_parsed

        return Paper(
            cls._collapse_whitespace(entry.title),
            authors,
            cls._collapse_whitespace(entry.summary),
            paper_url,
            doi,
            " , ".join(tag["term"] for tag in tags),
            str(date.tm_year),
            str(date.tm_mon),
        )
|