File size: 3,357 Bytes
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pathlib
import random
import re
import time

import requests
from tqdm import trange

from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import dump_json, load_json


class DblpPaperList(SearchAPI):
    """DBLP paper list.

    Downloads search results from the DBLP publication API (or reuses a
    cached JSON dump) and converts every hit into a `Paper` appended to
    `self.papers`.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: will use cached file if `True`, otherwise download again
        query: Query string, basically the title
            you wanna search in a search box.
            Special logical grammars refer to the reference.
        max_results: Maximal returned papers
        request_time_inteval: Seconds to sleep between DBLP API calls
            (parameter name keeps its historical typo for backward
            compatibility with existing callers)

    References:
        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
    """

    API_URL = "https://dblp.org/search/publ/api"
    # DBLP returns at most 1000 hits per request, so results are fetched
    # in pages of this size.
    PAGE_SIZE = 1000
    # Abort a hanging HTTP request after this many seconds instead of
    # blocking forever (requests has no default timeout).
    REQUEST_TIMEOUT = 30

    def __init__(
        self,
        cache_filepath: pathlib.Path,
        use_cache: bool = False,
        query: str = "",
        max_results: int = 1000,
        request_time_inteval: float = 5,
    ) -> None:
        super().__init__()

        if isinstance(cache_filepath, str):
            cache_filepath = pathlib.Path(cache_filepath)
        # Download (and overwrite the cache) unless a cache file exists
        # AND the caller asked to use it.
        if (not cache_filepath.exists()) or (not use_cache):
            searched_results = self._download(
                query, max_results, request_time_inteval
            )
            dump_json(searched_results, cache_filepath)

        for d in load_json(cache_filepath):
            self.papers.append(self._hit_to_paper(d))

    def _download(
        self, query: str, max_results: int, request_time_inteval: float
    ) -> list:
        """Fetch up to `max_results` raw hit dicts from the DBLP API.

        Download is best-effort: on any request/parse failure the pages
        collected so far are returned rather than raising.
        """
        query = query.strip()
        # Tighten whitespace around the logical OR operator, then encode
        # remaining whitespace as '+' as the DBLP query string expects.
        query = re.sub(r"\s+?\|\s+?", "|", query)
        query = re.sub(r"\s+", "+", query)

        searched_results: list = []
        for f in trange(0, max_results, self.PAGE_SIZE, desc="DBLP Downloading"):
            url = (
                f"{self.API_URL}?q={query}&format=json&c=0"
                f"&f={f}&h={self.PAGE_SIZE}"
            )
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                page = response.json()
                # "hit" is absent or empty once the result set is exhausted.
                page_data = page["result"]["hits"]["hit"]
            except (requests.RequestException, KeyError, ValueError):
                # Narrow except: network/HTTP/JSON/schema problems end the
                # download with partial results, while KeyboardInterrupt and
                # genuine programming errors propagate with their original
                # traceback intact.
                break
            if not page_data:
                break
            searched_results.extend(page_data)
            # Randomized politeness delay between consecutive API calls
            # (0.5x to 1.5x the configured interval).
            time.sleep((random.random() + 0.5) * request_time_inteval)
        return searched_results

    @staticmethod
    def _hit_to_paper(d: dict) -> Paper:
        """Convert one raw DBLP hit dict into a `Paper`.

        DBLP does not provide abstract or month data, so the abstract is
        left empty and the month is the sentinel "99".
        """
        info = d["info"]

        # "author" is a single dict for one author, a list otherwise.
        authors = []
        if "authors" in info:
            author = info["authors"]["author"]
            if isinstance(author, dict):
                authors.append(author["text"])
            else:
                authors = [a["text"] for a in author]

        # "venue" may likewise be a single string or a list of strings.
        venues = []
        if "venue" in info:
            if isinstance(info["venue"], str):
                venues.append(info["venue"])
            else:
                venues.extend(info["venue"])

        return Paper(
            info["title"],
            " , ".join(authors),
            "",  # abstract: not available from DBLP
            info.get("ee", info.get("url", "")),
            info.get("doi", ""),
            " , ".join(venues),
            info.get("year", "9999"),
            "99",  # month: not available from DBLP
        )