Spico committed
Commit 0841c28 · 1 Parent(s): 7e2a16d

- add `build_paper_list` and `build_and_search` methods to help build the demo (direct API; see the sketch after this list)
- rename `Paper.authors` to `Paper.author` and fix author searching
- add `**kwargs` to `dump_json`
- support `month` and `year` searching
- install ACL Anthology requirements from the cached local file
- add a script for cleaning cached files
- add Hugging Face Spaces support
- add demo support (Vue.js, FastAPI and uvicorn)
- dockerize the app
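
A minimal sketch of the new direct API, assuming the ACL Anthology cache built by `scripts/get_aclanthology.sh`; the query values below are illustrative, not part of the commit:

```python
# Hedged sketch of the direct API added in this commit.
# Outer lists are OR groups, inner lists are AND terms,
# and year/month take closed [start, end] spans.
from src.interfaces.aclanthology import AclanthologyPaperList

query = {
    "title": [["event extraction"], ["information extraction"]],
    "author": [["Heng Ji"]],
    "year": [["2018", "2022"]],
}
# requires `bash scripts/get_aclanthology.sh` to have built the cache first
papers = AclanthologyPaperList.build_and_search(
    "cache/aclanthology.json", query, max_results=1000
)
print(len(papers))
```

`build_and_search` constructs the paper list and applies `search` in one call; arXiv and dblp work the same way through `ArxivPaperList` and `DblpPaperList`, except those two also take the query when building their caches.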

.github/workflows/hf_spaces.yml ADDED
@@ -0,0 +1,19 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://Spico:$HF_TOKEN@huggingface.co/spaces/Spico/paper-hero main
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ FROM python:3.10-slim
+
+ # Install cron and git
+ RUN apt-get update
+ RUN apt-get -y install cron git
+
+ # prepare scripts
+ WORKDIR /app/
+ COPY ./requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir -r /app/requirements.txt
+ COPY ./scripts/ /app/scripts/
+ RUN bash scripts/get_aclanthology.sh
+ COPY ./src/ /app/src/
+ COPY ./index.html /app/index.html
+ COPY ./server.py /app/server.py
+
+ # Add the cron jobs
+ RUN crontab -l | { cat; echo "*/10 * * * * bash /app/scripts/clean_tmp.sh"; } | crontab -
+ RUN crontab -l | { cat; echo "0 0 * * * bash /app/scripts/get_aclanthology.sh"; } | crontab -
+ # Run the command on container startup
+ CMD cron
+
+ # start service
+ EXPOSE 7860
+ CMD ["python", "-u", "server.py"]
README.md CHANGED
@@ -1,3 +1,15 @@
+ ---
+ title: Paper Hero
+ emoji: 💪
+ colorFrom: indigo
+ colorTo: yellow
+ sdk: docker
+ app_port: 7860
+ pinned: true
+ license: apache-2.0
+ ---
+
+
  # 💪 Paper Hero

  A toolkit to help search for papers from aclanthology, arXiv and dblp.
@@ -60,3 +72,5 @@ if __name__ == "__main__":
  - [x] aclanthology
  - [x] arXiv
  - [x] dblp
+ - [x] add frontend support for building a demo
+ - [x] year and month searching
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
+ version: "3"
+ services:
+   paper_hero_api:
+     build: .
+     container_name: paper_hero
+     ports:
+       - 127.0.0.1:7860:7860
+     volumes:
+       - .:/app
+       - phero_tmp:/tmp
+ volumes:
+   phero_tmp:
index.html ADDED
@@ -0,0 +1,356 @@
+ <!DOCTYPE html>
+ <html lang="en">
+
+ <head>
+   <meta charset="UTF-8">
+   <meta http-equiv="X-UA-Compatible" content="IE=edge">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>paper-hero</title>
+   <link rel="stylesheet" href="https://unpkg.com/boltcss/bolt.min.css">
+   <script type="importmap">
+     {
+       "imports": {
+         "vue": "https://unpkg.com/vue@3/dist/vue.esm-browser.js"
+       }
+     }
+   </script>
+   <style>
+     body {
+       max-width: 800px;
+       margin: 40px auto;
+       padding: 0 20px;
+     }
+
+     .form-group {
+       display: flex;
+       flex-direction: row;
+       justify-content: flex-start;
+       align-items: center;
+     }
+
+     label {
+       margin-right: 1rem;
+     }
+
+     button {
+       margin: 0.2rem 0.2rem;
+     }
+
+     button:hover {
+       background-color: #dbdbdb;
+     }
+
+     footer {
+       text-align: center;
+       margin-top: 2rem;
+     }
+
+     .button-group {
+       margin-top: 3rem;
+     }
+
+     .search-button {
+       background-color: #ffc83d;
+       color: #d67d00;
+       font-weight: bold;
+     }
+
+     .search-button:hover {
+       background-color: #ffc83dc0;
+     }
+
+     .download-button {
+       background-color: #98ca56;
+       color: white;
+       font-weight: bold;
+     }
+
+     .download-button:hover {
+       background-color: #98ca56d1;
+     }
+
+     .output-title {
+       margin-top: 2rem;
+       margin-bottom: 0;
+       display: block;
+       background-color: #98ca56;
+       color: white;
+       font-weight: bold;
+       font-size: large;
+       padding: 6px 15px;
+       border-top-left-radius: 6px;
+       border-top-right-radius: 6px;
+     }
+
+     .output-box {
+       margin-top: 0;
+       padding: 6px 15px;
+       background-color: white;
+       border: 2px solid #98ca56;
+       border-bottom-left-radius: 6px;
+       border-bottom-right-radius: 6px;
+     }
+   </style>
+ </head>
+
+ <body>
+   <header>
+     <h1>💪 Paper Hero</h1>
+     <p>
+       Paper Hero is a toolkit to help search for papers from aclanthology, arXiv and dblp.
+     </p>
+     <p>GitHub Address: <a href="https://github.com/Spico197/paper-hero" target="_blank">Spico197/paper-hero</a></p>
+   </header>
+
+   <main>
+     <div id="app">
+       <div class="form-group">
+         <label for="method"><strong>Source</strong></label>
+         <select id="method" v-model="method">
+           <option value="" disabled>Please select a source</option>
+           <option value="aclanthology">ACL Anthology</option>
+           <option value="arxiv">ArXiv</option>
+           <option value="dblp">DBLP</option>
+         </select>
+       </div>
+
+       <div>
+         <label for="max-res"><strong>Max Results</strong></label>
+         <input id="max-res" type="number" v-model="maxResults">
+       </div>
+
+       <div class="form-group">
+         <label for="add-field"><strong>New Field</strong></label>
+         <select id="add-field" v-model="addField">
+           <option value="" disabled>Please select a field</option>
+           <option :value="field" v-for="field in restFields">{{ field }}</option>
+         </select>
+         <button @click.prevent="addNewField">Add</button>
+       </div>
+
+       <hr>
+
+       <div>
+         <p><strong>Fields</strong></p>
+         <p>
+           Add <code>&&</code> to represent <code>AND</code> logic, e.g. <code>span-based && event extraction</code>
+           means <em>span-based</em> and <em>event extraction</em> both appear in a field.
+         </p>
+         <p>
+           For <code>year</code> and <code>month</code> fields, the query should
+           follow the <code>start && end</code> format,
+           e.g. year <code>2006 && 2013</code> means searching for papers published
+           between <code>2006</code> and <code>2013</code>.
+         </p>
+         <div v-for="(groups, field) in query">
+           <label :for="field"><strong>{{ field }}</strong></label>
+           <div v-for="(group, index) in groups">
+             <input class="field-input" type="text" v-model="query[field][index]" placeholder="text1 && text2 && text3"
+               size="50">
+             <button @click.prevent="rmAnd(field, index)">X</button>
+           </div>
+           <button @click.prevent="addOr(field)">OR</button>
+         </div>
+       </div>
+
+       <div v-if="timerHandler">
+         <p>⏱️ {{ searchSecondsTwoDecimal }}</p>
+       </div>
+
+       <div v-if="output">
+         <p class="output-title">Output Info</p>
+         <p class="output-box">
+           {{ output }}
+           <br>
+           You are ready to download the results by clicking the download button below.
+           <br>
+           Like this tool? ⭐ me on <a href="https://github.com/Spico197/paper-hero" target="_blank">GitHub</a> !
+         </p>
+       </div>
+
+       <div class="button-group">
+         <button @click.prevent="resetQuery">Reset</button>
+         <button class="search-button" @click.prevent="search">Search</button>
+         <a :href="downloadHref" :download="`${method}.json`" v-if="downloadHref">
+           <button class="search-button download-button">Download</button>
+         </a>
+       </div>
+
+     </div>
+   </main>
+
+   <footer>
+     <hr>
+     Made by <a href="https://spico197.github.io" target="_blank">Tong Zhu</a> w/ 💖
+   </footer>
+
+   <script type="module">
+     import { createApp, ref, computed, toRaw, watch } from 'vue'
+
+     createApp(
+       {
+         setup() {
+           const method = ref("aclanthology")
+           const query = ref({ title: [[]] })
+           const maxResults = ref(2000)
+           const addField = ref("")
+           const allFields = ["title", "author", "abstract", "venue", "year", "month"]
+           const downloadUrl = ref('')
+           const downloadToken = ref('')
+           const downloadHref = ref('')
+           const output = ref('')
+           const timerHandler = ref(0)
+           const searchSeconds = ref(0.0)
+           const searchSecondsTwoDecimal = computed(() => {
+             return `${searchSeconds.value.toFixed(1)}s`
+           })
+           const restFields = computed(() => {
+             let rest = []
+             for (const field of allFields) {
+               if (!(field in query.value)) {
+                 rest.push(field)
+               }
+             }
+             return rest
+           })
+
+           function addNewField() {
+             if (addField.value) {
+               query.value[addField.value] = [[]]
+               addField.value = ""
+             }
+           }
+
+           function rmAnd(field, index) {
+             if (query.value[field].length == 1) {
+               delete query.value[field]
+             } else {
+               query.value[field].splice(index, 1)
+             }
+           }
+
+           function addOr(field) {
+             query.value[field].push([])
+           }
+
+           function resetOutput() {
+             output.value = ""
+             downloadUrl.value = ""
+             downloadToken.value = ""
+             URL.revokeObjectURL(downloadHref.value)
+             downloadHref.value = ""
+             searchSeconds.value = 0.0
+             timerHandler.value = 0
+             searchSecondsTwoDecimal.value = ""
+           }
+
+           function resetQuery() {
+             query.value = { title: [[]] }
+             resetOutput()
+           }
+
+           function startTimer() {
+             console.log("start")
+             timerHandler.value = setInterval(() => {
+               searchSeconds.value += 0.1
+             }, 100)
+           }
+
+           function endTimer() {
+             console.log("end")
+             if (timerHandler.value > 0) {
+               console.log("endi")
+               clearInterval(timerHandler.value)
+             }
+           }
+
+           function search() {
+             resetOutput()
+             startTimer()
+             let q = {}
+             for (const prop in query.value) {
+               q[prop] = []
+               for (let i = 0; i < query.value[prop].length; i++) {
+                 if (query.value[prop][i].length > 0) {
+                   let andString = toRaw(query.value[prop][i])
+                   let andStrings = andString.split('&&')
+                   for (let j = 0; j < andStrings.length; j++) {
+                     andStrings[j] = andStrings[j].trim()
+                   }
+                   q[prop].push(andStrings)
+                 }
+               }
+               if (q[prop].length < 1) {
+                 delete q[prop]
+               }
+             }
+             const postData = JSON.stringify({
+               "method": method.value,
+               "query": q,
+               "max_results": maxResults.value,
+               "return_content": false,
+             })
+             fetch(
+               "/api/",
+               {
+                 method: "POST",
+                 headers: {
+                   'Content-Type': 'application/json',
+                 },
+                 body: postData,
+               }
+             )
+               .then((response) => response.json())
+               .then((json) => {
+                 if (json["ok"] === false) {
+                   alert(json["msg"])
+                 } else {
+                   downloadUrl.value = json["url"]
+                   downloadToken.value = json["token"]
+                   output.value = `${json["msg"]}, #matched paper: ${json["paper"]}`
+                 }
+               })
+               .catch((err) => { alert(err) })
+               .finally(() => endTimer())
+           }
+
+           watch(downloadUrl, (newUrl, oldUrl) => {
+             if (downloadToken.value) {
+               fetch(
+                 `/download/?u=${downloadUrl.value}&t=${downloadToken.value}`,
+                 {
+                   method: "GET",
+                 }
+               )
+                 .then((response) => response.blob())
+                 .then((data) => {
+                   downloadHref.value = URL.createObjectURL(data)
+                 })
+             }
+           })
+
+           return {
+             method,
+             query,
+             restFields,
+             addField,
+             addNewField,
+             search,
+             rmAnd,
+             addOr,
+             resetQuery,
+             maxResults,
+             output,
+             downloadUrl,
+             downloadHref,
+             searchSeconds,
+             timerHandler,
+             searchSecondsTwoDecimal,
+           }
+         }
+       }
+     ).mount("#app")
+   </script>
+ </body>
+
+ </html>
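
The payload the page posts to `/api/` can also be sent outside the browser. A hedged sketch with `requests`, assuming the `server.py` app below is running on `localhost:7860`:

```python
# Mirrors the frontend's fetch("/api/", ...) call; host, port, and the
# query are assumptions for illustration.
import requests

payload = {
    "method": "aclanthology",
    "query": {"title": [["event extraction"]]},  # OR groups of AND terms
    "max_results": 100,
    "return_content": True,  # inline results instead of a download token
}
resp = requests.post("http://127.0.0.1:7860/api/", json=payload, timeout=600)
data = resp.json()
print(data["ok"], data["paper"])
```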
requirements.txt CHANGED
@@ -1,3 +1,5 @@
  tqdm>=4.64.1
  requests>=2.28.1
- feedparser>=6.0.10
+ feedparser>=6.0.10
+ fastapi>=0.88.0
+ uvicorn>=0.20.0
run.py CHANGED
@@ -9,6 +9,7 @@ from src.utils import (
  if __name__ == "__main__":
      # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
      acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+     # `ee_query` is an example, and you don't have to fill all the fields
      ee_query = {
          "title": [
              ["information extraction"],
@@ -30,6 +31,19 @@ if __name__ == "__main__":
              ["tacl"],
              ["cl"],
          ],
+         "author": [
+             ["Heng Ji"],
+             ["Dan Roth"],
+         ],
+         "year": [
+             # multiple time spans with closed interval: ["2006", "2013"] means 2006-2013
+             ["2006", "2013"],
+             ["2018", "2022"],
+         ],
+         "month": [
+             # the same as the `year` field
+             ["4", "11"],
+         ]
      }
      ee_papers = acl_paper_list.search(ee_query)
      dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
scripts/clean_tmp.sh ADDED
@@ -0,0 +1,3 @@
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/arxiv\.cache\..*?\.xml' -delete
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/dblp\.cache\..*?\.json' -delete
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/(aclanthology|arxiv|dblp)\.(search|download)\..*?\.json' -delete
scripts/get_aclanthology.sh CHANGED
@@ -1,3 +1,5 @@
+ set -ex
+
  mkdir cache
  cd cache
  if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
@@ -9,7 +11,7 @@ else
  fi
  cd acl-anthology/bin

- pip install -r https://raw.githubusercontent.com/acl-org/acl-anthology/master/bin/requirements.txt
+ pip install --no-cache-dir -r ./requirements.txt

  python -c '
  import json
server.py ADDED
@@ -0,0 +1,153 @@
+ import logging
+ import os
+ import uuid
+ import tempfile
+ import pathlib
+
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.responses import FileResponse
+ from pydantic import BaseModel
+
+ from src.interfaces.aclanthology import AclanthologyPaperList
+ from src.interfaces.arxiv import ArxivPaperList
+ from src.interfaces.dblp import DblpPaperList
+ from src.utils import dump_json, load_json
+
+
+ class SearchQuery(BaseModel):
+     method: str
+     query: dict
+     max_results: int = 1000
+     return_content: bool = False
+
+
+ REMOVE_CACHE = False
+ ACL_CACHE_FILEPATH = "./cache/aclanthology.json"
+ app = FastAPI()
+ logger = logging.getLogger("uvicorn.default")
+
+
+ def get_uid():
+     return uuid.uuid4().urn.split(":")[-1]
+
+
+ @app.get("/")
+ async def api():
+     return FileResponse("./index.html", media_type="text/html")
+
+
+ @app.post("/api/")
+ async def api(q: SearchQuery):  # noqa: F811
+     ret = {
+         "ok": False,
+         "cand": 0,
+         "paper": 0,
+         "url": "",
+         "token": "",
+         "msg": "",
+         "content": [],
+     }
+     if q.method not in ["aclanthology", "arxiv", "dblp"]:
+         ret["msg"] = f"{q.method} method not supported"
+         return ret
+
+     papers = []
+     cache_filepath = ""
+     if q.method == "aclanthology":
+         cache_filepath = ACL_CACHE_FILEPATH
+         plist = AclanthologyPaperList.build_paper_list(ACL_CACHE_FILEPATH)
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+     elif q.method == "arxiv":
+         _, cache_filepath = tempfile.mkstemp(
+             prefix="arxiv.cache.", suffix=".xml", text=True
+         )
+         plist = ArxivPaperList.build_paper_list(
+             cache_filepath, q.query, max_results=q.max_results
+         )
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+     elif q.method == "dblp":
+         _, cache_filepath = tempfile.mkstemp(
+             prefix="dblp.cache.", suffix=".json", text=True
+         )
+         plist = DblpPaperList.build_paper_list(
+             cache_filepath, q.query, max_results=q.max_results
+         )
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+
+     if papers:
+         papers = [p.as_dict() for p in papers]
+         ret["paper"] = len(papers)
+         if q.return_content:
+             ret["content"] = papers
+         else:
+             _, result_filepath = tempfile.mkstemp(
+                 prefix=f"{q.method}.search.", suffix=".json", text=True
+             )
+             ret["url"] = result_filepath
+             ret["token"] = get_uid()
+             cache = {
+                 "token": ret["token"],
+                 "url": ret["url"],
+                 "content": papers,
+             }
+             dump_json(cache, result_filepath)
+
+     if REMOVE_CACHE and q.method != "aclanthology":
+         os.remove(cache_filepath)
+
+     logger.info(
+         (
+             f"m: {q.method}, q: {q.query}, cands: {len(plist.papers)},"
+             f" max: {q.max_results}, #papers: {len(papers)}, cache: {cache_filepath}"
+             f" ret.url: {ret.get('url', '')}"
+         )
+     )
+
+     return ret
+
+
+ @app.get("/download/")
+ async def download(u: str, t: str):  # noqa: F811
+     logger.info(f"{u=}, {t=}")
+     results_filepath = pathlib.Path(u)
+     token = t
+     if results_filepath.exists():
+         data = load_json(results_filepath)
+         if data["token"] == token:
+             filename = results_filepath.name
+             prefix, _, middle, suffix = filename.split(".")
+             _, download_filepath = tempfile.mkstemp(
+                 prefix=f"{prefix}.download.", suffix=".json"
+             )
+             dump_json(data["content"], download_filepath, indent=2)
+             logger.info(f"Download: from {u} to {download_filepath}")
+             return FileResponse(download_filepath, filename=f"{prefix}.json")
+     return {"ok": False, "msg": "file not exist or token mismatch"}
+
+
+ if __name__ == "__main__":
+     log_config = uvicorn.config.LOGGING_CONFIG
+     log_config["formatters"]["access"]["fmt"] = (
+         "%(asctime)s | " + log_config["formatters"]["access"]["fmt"]
+     )
+     log_config["formatters"]["default"]["fmt"] = (
+         "%(asctime)s | " + log_config["formatters"]["default"]["fmt"]
+     )
+     uvicorn.run(
+         "server:app",
+         host="0.0.0.0",
+         port=7860,
+         log_level="debug",
+         log_config=log_config,
+         reload=False,
+     )
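
The two-step flow above (search, then exchange the returned `url` and `token` at `/download/`) can be exercised end to end; a minimal sketch, assuming a local server on port 7860:

```python
# Hedged sketch of the search-then-download round trip; the host, port,
# and query are illustrative assumptions.
import requests

base = "http://127.0.0.1:7860"
search = requests.post(
    f"{base}/api/",
    json={
        "method": "dblp",
        "query": {"title": [["event extraction"]]},
        "max_results": 50,
        "return_content": False,
    },
    timeout=600,
).json()
if search["ok"] and search["url"]:
    # `u` is the server-side temp file, `t` must match the stored token
    blob = requests.get(
        f"{base}/download/", params={"u": search["url"], "t": search["token"]}
    )
    with open("dblp.json", "wb") as fout:
        fout.write(blob.content)
```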
src/engine.py CHANGED
@@ -2,35 +2,52 @@ from src.interfaces import Paper


  class SearchAPI:
-     # fmt: off
-     SEARCH_PRIORITY = ["doi", "url", "year", "month", "venue", "authors", "title", "abstract"]
-     # fmt: on
+     SEARCH_PRIORITY = ["year", "month", "venue", "author", "title", "abstract"]

      def __init__(self) -> None:
          self.papers: list[Paper] = []

      def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
          """Exhausted search papers by matching query"""
+         def _in_string(statement, string):
+             stmt_in_string = False
+             if " " in statement and statement.lower() in string.lower():
+                 stmt_in_string = True
+             else:
+                 tokens = self.tokenize(string.lower())
+                 if statement.lower() in tokens:
+                     stmt_in_string = True
+             return stmt_in_string
+
          papers = self.papers
          for field in self.SEARCH_PRIORITY:
              if field in query:
                  req = query[field]
+                 time_spans = []
+                 if field in ["year", "month"]:
+                     for span in req:
+                         assert len(span) == 2
+                         assert all(num.isdigit() for num in span)
+                         time_spans.append((int(span[0]), int(span[1])))
+
                  paper_indices = []
                  for i, p in enumerate(papers):
-                     for or_conditions in req:
-                         matched = True
-                         for and_cond_string in or_conditions:
-                             if " " in and_cond_string:
-                                 if not and_cond_string.lower() in p[field].lower():
-                                     matched = False
-                                     break
-                             else:
-                                 p_field = self.tokenize(p[field].lower())
-                                 if not and_cond_string.lower() in p_field:
-                                     matched = False
-                                     break
-                         if matched:
-                             paper_indices.append(i)
+                     matched = False
+                     if time_spans:
+                         if any(s <= p[field] <= e for s, e in time_spans):
+                             matched = True
+                     else:
+                         if any(
+                             all(
+                                 _in_string(stmt, p[field])
+                                 for stmt in and_statements
+                             )
+                             for and_statements in req
+                         ):
+                             matched = True
+
+                     if matched:
+                         paper_indices.append(i)
                  papers = [papers[i] for i in paper_indices]

          return papers
@@ -68,3 +85,11 @@ class SearchAPI:

      def tokenize(self, string: str) -> list[str]:
          return string.lower().split()
+
+     @classmethod
+     def build_paper_list(cls, *args, **kwargs):
+         raise NotImplementedError
+
+     @classmethod
+     def build_and_search(cls, *args, **kwargs) -> list[Paper]:
+         raise NotImplementedError
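
To make the rewritten matching concrete: a multi-word AND term matches as a substring, a single token must appear in the tokenized field, and the outer lists are OR groups. A small illustrative check (the positional field order is assumed from `Paper.as_tuple`, and the paper data is made up):

```python
# Illustrative only: exercises the new exhausted_search matching rules.
from src.engine import SearchAPI
from src.interfaces import Paper

api = SearchAPI()
api.papers = [
    # assumed positional order from Paper.as_tuple(): title, author,
    # abstract, url, doi, venue, year, month
    Paper("Span-based joint event extraction", "A. Author", "", "", "", "ACL", 2020, 7),
]
# outer list = OR groups, inner list = AND terms
query = {"title": [["span-based", "event extraction"], ["NER"]]}
matched = api.exhausted_search(query)
print(len(matched))  # 1: the first AND group matches the title
```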
src/interfaces/__init__.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
  @dataclass
  class Paper:
      title: str
-     authors: str  # People Name1, People Name2: split by `, `
+     author: str  # People Name1, People Name2: split by `, `
      abstract: str
      url: str
      doi: str
@@ -15,21 +15,25 @@ class Paper:
      def as_dict(self):
          return {
              "title": self.title,
-             "author": self.authors,
+             "author": self.author,
              "abstract": self.abstract,
              "url": self.url,
              "doi": self.doi,
              "venue": self.venue,
+             "year": self.year,
+             "month": self.month,
          }

      def as_tuple(self) -> tuple:
          return (
              self.title,
-             self.authors,
+             self.author,
              self.abstract,
              self.url,
              self.doi,
              self.venue,
+             self.year,
+             self.month,
          )

      def __getitem__(self, attr_key: str):
src/interfaces/aclanthology.py CHANGED
@@ -46,3 +46,14 @@ class AclanthologyPaperList(SearchAPI):
              full_name = f"{name['first']} {name['last']}"

          return full_name
+
+     @classmethod
+     def build_paper_list(cls, cache_filepath: str):
+         return cls(cache_filepath)
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = -1
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath)
+         return obj.search(query)[:max_results]
src/interfaces/arxiv.py CHANGED
@@ -143,3 +143,47 @@ class ArxivPaperList(SearchAPI):
                  str(date.tm_mon),
              )
              self.papers.append(paper)
+
+     @staticmethod
+     def build_logic_string(req: list[list[str]]) -> str:
+         if not req:
+             return ""
+
+         tmp_strings = []
+         for and_strs in req:
+             tmp_strings.append(f"({' AND '.join(and_strs)})")
+         logic_string = " OR ".join(tmp_strings)
+         return logic_string
+
+     @classmethod
+     def build_paper_list(
+         cls, cache_filepath: str, query: dict, max_results: int = 5000
+     ):
+         title = query.get("title", [])
+         ti_string = cls.build_logic_string(title)
+         author = query.get("author", [])
+         au_string = cls.build_logic_string(author)
+         abstract = query.get("abstract", [])
+         abs_string = cls.build_logic_string(abstract)
+         venue = query.get("venue", [])
+         # only subject category is used when caching
+         if venue:
+             cat_string = venue[0]
+         else:
+             cat_string = ""
+         return cls(
+             cache_filepath,
+             use_cache=False,
+             title=ti_string,
+             author=au_string,
+             abstract=abs_string,
+             category=cat_string,
+             max_results=max_results,
+         )
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = -1
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+         return obj.search(query)[:max_results]
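
`build_logic_string` translates the nested-list query format into arXiv's boolean query syntax, for example:

```python
from src.interfaces.arxiv import ArxivPaperList

req = [["span-based", "event extraction"], ["NER"]]
print(ArxivPaperList.build_logic_string(req))
# -> (span-based AND event extraction) OR (NER)
```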
src/interfaces/dblp.py CHANGED
@@ -2,6 +2,7 @@ import pathlib
  import random
  import re
  import time
+ import logging

  import requests
  from tqdm import trange
@@ -11,6 +12,9 @@ from src.interfaces import Paper
  from src.utils import dump_json, load_json


+ logger = logging.getLogger("uvicorn.default")
+
+
  class DblpPaperList(SearchAPI):
      """DBLP paper list

@@ -34,8 +38,8 @@ class DblpPaperList(SearchAPI):
          cache_filepath: pathlib.Path,
          use_cache: bool = False,
          query: str = "",
-         max_results: int = 1000,
-         request_time_inteval: float = 5,
+         max_results: int = 5000,
+         request_time_inteval: float = 3,
      ) -> None:
          super().__init__()

@@ -62,7 +66,8 @@ class DblpPaperList(SearchAPI):
                  break
              except KeyboardInterrupt:
                  raise KeyboardInterrupt
-             except Exception:
+             except Exception as err:
+                 logger.info(err)
                  break
              time.sleep((random.random() + 0.5) * request_time_inteval)
          dump_json(searched_results, cache_filepath)
@@ -95,3 +100,29 @@ class DblpPaperList(SearchAPI):
              "99",
          )
          self.papers.append(paper)
+
+     @classmethod
+     def build_paper_list(
+         cls, cache_filepath: str, query: dict, max_results: int = 1000
+     ):
+         title = query.get("title", [])
+         abstract = query.get("abstract", [])
+
+         cls_q = ""
+         for t in title:
+             cls_q += " ".join(t)
+         for a in abstract:
+             cls_q += " ".join(a)
+         return cls(
+             cache_filepath,
+             use_cache=False,
+             query=cls_q,
+             max_results=max_results,
+         )
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = 1000
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+         return obj.search(query)[:max_results]
src/utils.py CHANGED
@@ -117,9 +117,9 @@ def load_json(filepath: pathlib.Path) -> dict | list:
      return data


- def dump_json(data: list | dict, filepath: str | pathlib.Path):
+ def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
      with open(filepath, "wt", encoding="utf8") as fout:
-         json.dump(data, fout, ensure_ascii=False)
+         json.dump(data, fout, ensure_ascii=False, **kwargs)


  def load_jsonlines(filepath, **kwargs):
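
With `**kwargs` forwarded to `json.dump`, callers can now control serialization; the `/download/` endpoint uses this to pretty-print results:

```python
from src.utils import dump_json

data = {"papers": []}
dump_json(data, "compact.json")           # compact, as before
dump_json(data, "pretty.json", indent=2)  # what server.py's download() does
```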