Spico committed
Commit 0841c28 · 1 Parent(s): 7e2a16d

- add `build_paper_list` and `build_and_search` methods to help build the demo (direct API; see the sketch after this list)
- rename `Paper.authors` to `Paper.author` and fix author searching
- add `**kwargs` to `dump_json`
- support `month` and `year` searching
- install ACL Anthology requirements from the cached local file
- add a script for cleaning cached files
- add Hugging Face Spaces support
- add demo support (Vue.js, FastAPI and uvicorn)
- dockerize the app
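
A minimal sketch of the new direct API, assuming the ACL Anthology cache built by `scripts/get_aclanthology.sh`; the query values below are illustrative, not part of the commit:

```python
# Hedged sketch of the direct API added in this commit.
# Outer lists are OR groups, inner lists are AND terms,
# and year/month take closed [start, end] spans.
from src.interfaces.aclanthology import AclanthologyPaperList

query = {
    "title": [["event extraction"], ["information extraction"]],
    "author": [["Heng Ji"]],
    "year": [["2018", "2022"]],
}
# requires `bash scripts/get_aclanthology.sh` to have built the cache first
papers = AclanthologyPaperList.build_and_search(
    "cache/aclanthology.json", query, max_results=1000
)
print(len(papers))
```

`build_and_search` constructs the paper list and applies `search` in one call; arXiv and dblp work the same way through `ArxivPaperList` and `DblpPaperList`, except those two also take the query when building their caches.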

.github/workflows/hf_spaces.yml ADDED
@@ -0,0 +1,19 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://Spico:$HF_TOKEN@huggingface.co/spaces/Spico/paper-hero main
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ FROM python:3.10-slim
+
+ # Install cron and git
+ RUN apt-get update
+ RUN apt-get -y install cron git
+
+ # prepare scripts
+ WORKDIR /app/
+ COPY ./requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir -r /app/requirements.txt
+ COPY ./scripts/ /app/scripts/
+ RUN bash scripts/get_aclanthology.sh
+ COPY ./src/ /app/src/
+ COPY ./index.html /app/index.html
+ COPY ./server.py /app/server.py
+
+ # Add the cron jobs
+ RUN crontab -l | { cat; echo "*/10 * * * * bash /app/scripts/clean_tmp.sh"; } | crontab -
+ RUN crontab -l | { cat; echo "0 0 * * * bash /app/scripts/get_aclanthology.sh"; } | crontab -
+ # Run the command on container startup
+ CMD cron
+
+ # start service
+ EXPOSE 7860
+ CMD ["python", "-u", "server.py"]
README.md CHANGED
@@ -1,3 +1,15 @@
+ ---
+ title: Paper Hero
+ emoji: 💪
+ colorFrom: indigo
+ colorTo: yellow
+ sdk: docker
+ app_port: 7860
+ pinned: true
+ license: apache-2.0
+ ---
+
+
  # 💪 Paper Hero

  A toolkit to help search for papers from aclanthology, arXiv and dblp.
@@ -60,3 +72,5 @@ if __name__ == "__main__":
  - [x] aclanthology
  - [x] arXiv
  - [x] dblp
+ - [x] add frontend support for building a demo
+ - [x] year and month searching
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
+ version: "3"
+ services:
+   paper_hero_api:
+     build: .
+     container_name: paper_hero
+     ports:
+       - 127.0.0.1:7860:7860
+     volumes:
+       - .:/app
+       - phero_tmp:/tmp
+ volumes:
+   phero_tmp:
index.html ADDED
@@ -0,0 +1,356 @@
+ <!DOCTYPE html>
+ <html lang="en">
+
+ <head>
+   <meta charset="UTF-8">
+   <meta http-equiv="X-UA-Compatible" content="IE=edge">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>paper-hero</title>
+   <link rel="stylesheet" href="https://unpkg.com/boltcss/bolt.min.css">
+   <script type="importmap">
+     {
+       "imports": {
+         "vue": "https://unpkg.com/vue@3/dist/vue.esm-browser.js"
+       }
+     }
+   </script>
+   <style>
+     body {
+       max-width: 800px;
+       margin: 40px auto;
+       padding: 0 20px;
+     }
+
+     .form-group {
+       display: flex;
+       flex-direction: row;
+       justify-content: flex-start;
+       align-items: center;
+     }
+
+     label {
+       margin-right: 1rem;
+     }
+
+     button {
+       margin: 0.2rem 0.2rem;
+     }
+
+     button:hover {
+       background-color: #dbdbdb;
+     }
+
+     footer {
+       text-align: center;
+       margin-top: 2rem;
+     }
+
+     .button-group {
+       margin-top: 3rem;
+     }
+
+     .search-button {
+       background-color: #ffc83d;
+       color: #d67d00;
+       font-weight: bold;
+     }
+
+     .search-button:hover {
+       background-color: #ffc83dc0;
+     }
+
+     .download-button {
+       background-color: #98ca56;
+       color: white;
+       font-weight: bold;
+     }
+
+     .download-button:hover {
+       background-color: #98ca56d1;
+     }
+
+     .output-title {
+       margin-top: 2rem;
+       margin-bottom: 0;
+       display: block;
+       background-color: #98ca56;
+       color: white;
+       font-weight: bold;
+       font-size: large;
+       padding: 6px 15px;
+       border-top-left-radius: 6px;
+       border-top-right-radius: 6px;
+     }
+
+     .output-box {
+       margin-top: 0;
+       padding: 6px 15px;
+       background-color: white;
+       border: 2px solid #98ca56;
+       border-bottom-left-radius: 6px;
+       border-bottom-right-radius: 6px;
+     }
+   </style>
+ </head>
+
+ <body>
+   <header>
+     <h1>💪 Paper Hero</h1>
+     <p>
+       Paper Hero is a toolkit to help search for papers from aclanthology, arXiv and dblp.
+     </p>
+     <p>GitHub Address: <a href="https://github.com/Spico197/paper-hero" target="_blank">Spico197/paper-hero</a></p>
+   </header>
+
+   <main>
+     <div id="app">
+       <div class="form-group">
+         <label for="method"><strong>Source</strong></label>
+         <select id="method" v-model="method">
+           <option value="" disabled>Please select a source</option>
+           <option value="aclanthology">ACL Anthology</option>
+           <option value="arxiv">ArXiv</option>
+           <option value="dblp">DBLP</option>
+         </select>
+       </div>
+
+       <div>
+         <label for="max-res"><strong>Max Results</strong></label>
+         <input id="max-res" type="number" v-model="maxResults">
+       </div>
+
+       <div class="form-group">
+         <label for="add-field"><strong>New Field</strong></label>
+         <select id="add-field" v-model="addField">
+           <option value="" disabled>Please select a field</option>
+           <option :value="field" v-for="field in restFields">{{ field }}</option>
+         </select>
+         <button @click.prevent="addNewField">Add</button>
+       </div>
+
+       <hr>
+
+       <div>
+         <p><strong>Fields</strong></p>
+         <p>
+           Add <code>&&</code> to represent <code>AND</code> logic, e.g. <code>span-based && event extraction</code>
+           means <em>span-based</em> and <em>event extraction</em> both appear in a field.
+         </p>
+         <p>
+           For <code>year</code> and <code>month</code> fields, the query should
+           follow the <code>start && end</code> format,
+           e.g. year <code>2006 && 2013</code> means searching for papers published
+           between <code>2006</code> and <code>2013</code>.
+         </p>
+         <div v-for="(groups, field) in query">
+           <label :for="field"><strong>{{ field }}</strong></label>
+           <div v-for="(group, index) in groups">
+             <input class="field-input" type="text" v-model="query[field][index]" placeholder="text1 && text2 && text3"
+               size="50">
+             <button @click.prevent="rmAnd(field, index)">X</button>
+           </div>
+           <button @click.prevent="addOr(field)">OR</button>
+         </div>
+       </div>
+
+       <div v-if="timerHandler">
+         <p>⏱️ {{ searchSecondsTwoDecimal }}</p>
+       </div>
+
+       <div v-if="output">
+         <p class="output-title">Output Info</p>
+         <p class="output-box">
+           {{ output }}
+           <br>
+           You are ready to download the results by clicking the download button below.
+           <br>
+           Like this tool? ⭐ me on <a href="https://github.com/Spico197/paper-hero" target="_blank">GitHub</a> !
+         </p>
+       </div>
+
+       <div class="button-group">
+         <button @click.prevent="resetQuery">Reset</button>
+         <button class="search-button" @click.prevent="search">Search</button>
+         <a :href="downloadHref" :download="`${method}.json`" v-if="downloadHref">
+           <button class="search-button download-button">Download</button>
+         </a>
+       </div>
+
+     </div>
+   </main>
+
+   <footer>
+     <hr>
+     Made by <a href="https://spico197.github.io" target="_blank">Tong Zhu</a> w/ 💖
+   </footer>
+
+   <script type="module">
+     import { createApp, ref, computed, toRaw, watch } from 'vue'
+
+     createApp(
+       {
+         setup() {
+           const method = ref("aclanthology")
+           const query = ref({ title: [[]] })
+           const maxResults = ref(2000)
+           const addField = ref("")
+           const allFields = ["title", "author", "abstract", "venue", "year", "month"]
+           const downloadUrl = ref('')
+           const downloadToken = ref('')
+           const downloadHref = ref('')
+           const output = ref('')
+           const timerHandler = ref(0)
+           const searchSeconds = ref(0.0)
+           const searchSecondsTwoDecimal = computed(() => {
+             return `${searchSeconds.value.toFixed(1)}s`
+           })
+           const restFields = computed(() => {
+             let rest = []
+             for (const field of allFields) {
+               if (!(field in query.value)) {
+                 rest.push(field)
+               }
+             }
+             return rest
+           })
+
+           function addNewField() {
+             if (addField.value) {
+               query.value[addField.value] = [[]]
+               addField.value = ""
+             }
+           }
+
+           function rmAnd(field, index) {
+             if (query.value[field].length == 1) {
+               delete query.value[field]
+             } else {
+               query.value[field].splice(index, 1)
+             }
+           }
+
+           function addOr(field) {
+             query.value[field].push([])
+           }
+
+           function resetOutput() {
+             output.value = ""
+             downloadUrl.value = ""
+             downloadToken.value = ""
+             URL.revokeObjectURL(downloadHref.value)
+             downloadHref.value = ""
+             searchSeconds.value = 0.0
+             timerHandler.value = 0
+             searchSecondsTwoDecimal.value = ""
+           }
+
+           function resetQuery() {
+             query.value = { title: [[]] }
+             resetOutput()
+           }
+
+           function startTimer() {
+             console.log("start")
+             timerHandler.value = setInterval(() => {
+               searchSeconds.value += 0.1
+             }, 100)
+           }
+
+           function endTimer() {
+             console.log("end")
+             if (timerHandler.value > 0) {
+               console.log("endi")
+               clearInterval(timerHandler.value)
+             }
+           }
+
+           function search() {
+             resetOutput()
+             startTimer()
+             let q = {}
+             for (const prop in query.value) {
+               q[prop] = []
+               for (let i = 0; i < query.value[prop].length; i++) {
+                 if (query.value[prop][i].length > 0) {
+                   let andString = toRaw(query.value[prop][i])
+                   let andStrings = andString.split('&&')
+                   for (let j = 0; j < andStrings.length; j++) {
+                     andStrings[j] = andStrings[j].trim()
+                   }
+                   q[prop].push(andStrings)
+                 }
+               }
+               if (q[prop].length < 1) {
+                 delete q[prop]
+               }
+             }
+             const postData = JSON.stringify({
+               "method": method.value,
+               "query": q,
+               "max_results": maxResults.value,
+               "return_content": false,
+             })
+             fetch(
+               "/api/",
+               {
+                 method: "POST",
+                 headers: {
+                   'Content-Type': 'application/json',
+                 },
+                 body: postData,
+               }
+             )
+               .then((response) => response.json())
+               .then((json) => {
+                 if (json["ok"] === false) {
+                   alert(json["msg"])
+                 } else {
+                   downloadUrl.value = json["url"]
+                   downloadToken.value = json["token"]
+                   output.value = `${json["msg"]}, #matched paper: ${json["paper"]}`
+                 }
+               })
+               .catch((err) => { alert(err) })
+               .finally(() => endTimer())
+           }
+
+           watch(downloadUrl, (newUrl, oldUrl) => {
+             if (downloadToken.value) {
+               fetch(
+                 `/download/?u=${downloadUrl.value}&t=${downloadToken.value}`,
+                 {
+                   method: "GET",
+                 }
+               )
+                 .then((response) => response.blob())
+                 .then((data) => {
+                   downloadHref.value = URL.createObjectURL(data)
+                 })
+             }
+           })
+
+           return {
+             method,
+             query,
+             restFields,
+             addField,
+             addNewField,
+             search,
+             rmAnd,
+             addOr,
+             resetQuery,
+             maxResults,
+             output,
+             downloadUrl,
+             downloadHref,
+             searchSeconds,
+             timerHandler,
+             searchSecondsTwoDecimal,
+           }
+         }
+       }
+     ).mount("#app")
+   </script>
+ </body>
+
+ </html>
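
The payload the page posts to `/api/` can also be sent outside the browser. A hedged sketch with `requests`, assuming the `server.py` app below is running on `localhost:7860`:

```python
# Mirrors the frontend's fetch("/api/", ...) call; host, port, and the
# query are assumptions for illustration.
import requests

payload = {
    "method": "aclanthology",
    "query": {"title": [["event extraction"]]},  # OR groups of AND terms
    "max_results": 100,
    "return_content": True,  # inline results instead of a download token
}
resp = requests.post("http://127.0.0.1:7860/api/", json=payload, timeout=600)
data = resp.json()
print(data["ok"], data["paper"])
```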
requirements.txt CHANGED
@@ -1,3 +1,5 @@
  tqdm>=4.64.1
  requests>=2.28.1
- feedparser>=6.0.10
+ feedparser>=6.0.10
+ fastapi>=0.88.0
+ uvicorn>=0.20.0
run.py CHANGED
@@ -9,6 +9,7 @@ from src.utils import (
  if __name__ == "__main__":
      # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
      acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+     # `ee_query` is an example, and you don't have to fill all the fields
      ee_query = {
          "title": [
              ["information extraction"],
@@ -30,6 +31,19 @@ if __name__ == "__main__":
              ["tacl"],
              ["cl"],
          ],
+         "author": [
+             ["Heng Ji"],
+             ["Dan Roth"],
+         ],
+         "year": [
+             # multiple time spans with closed interval: ["2006", "2013"] means 2006-2013
+             ["2006", "2013"],
+             ["2018", "2022"],
+         ],
+         "month": [
+             # the same as the `year` field
+             ["4", "11"],
+         ]
      }
      ee_papers = acl_paper_list.search(ee_query)
      dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
scripts/clean_tmp.sh ADDED
@@ -0,0 +1,3 @@
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/arxiv\.cache\..*?\.xml' -delete
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/dblp\.cache\..*?\.json' -delete
+ find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/(aclanthology|arxiv|dblp)\.(search|download)\..*?\.json' -delete
scripts/get_aclanthology.sh CHANGED
@@ -1,3 +1,5 @@
+ set -ex
+
  mkdir cache
  cd cache
  if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
@@ -9,7 +11,7 @@ else
  fi
  cd acl-anthology/bin

- pip install -r https://raw.githubusercontent.com/acl-org/acl-anthology/master/bin/requirements.txt
+ pip install --no-cache-dir -r ./requirements.txt

  python -c '
  import json
server.py ADDED
@@ -0,0 +1,153 @@
+ import logging
+ import os
+ import uuid
+ import tempfile
+ import pathlib
+
+ import uvicorn
+ from fastapi import FastAPI
+ from fastapi.responses import FileResponse
+ from pydantic import BaseModel
+
+ from src.interfaces.aclanthology import AclanthologyPaperList
+ from src.interfaces.arxiv import ArxivPaperList
+ from src.interfaces.dblp import DblpPaperList
+ from src.utils import dump_json, load_json
+
+
+ class SearchQuery(BaseModel):
+     method: str
+     query: dict
+     max_results: int = 1000
+     return_content: bool = False
+
+
+ REMOVE_CACHE = False
+ ACL_CACHE_FILEPATH = "./cache/aclanthology.json"
+ app = FastAPI()
+ logger = logging.getLogger("uvicorn.default")
+
+
+ def get_uid():
+     return uuid.uuid4().urn.split(":")[-1]
+
+
+ @app.get("/")
+ async def api():
+     return FileResponse("./index.html", media_type="text/html")
+
+
+ @app.post("/api/")
+ async def api(q: SearchQuery):  # noqa: F811
+     ret = {
+         "ok": False,
+         "cand": 0,
+         "paper": 0,
+         "url": "",
+         "token": "",
+         "msg": "",
+         "content": [],
+     }
+     if q.method not in ["aclanthology", "arxiv", "dblp"]:
+         ret["msg"] = f"{q.method} method not supported"
+         return ret
+
+     papers = []
+     cache_filepath = ""
+     if q.method == "aclanthology":
+         cache_filepath = ACL_CACHE_FILEPATH
+         plist = AclanthologyPaperList.build_paper_list(ACL_CACHE_FILEPATH)
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+     elif q.method == "arxiv":
+         _, cache_filepath = tempfile.mkstemp(
+             prefix="arxiv.cache.", suffix=".xml", text=True
+         )
+         plist = ArxivPaperList.build_paper_list(
+             cache_filepath, q.query, max_results=q.max_results
+         )
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+     elif q.method == "dblp":
+         _, cache_filepath = tempfile.mkstemp(
+             prefix="dblp.cache.", suffix=".json", text=True
+         )
+         plist = DblpPaperList.build_paper_list(
+             cache_filepath, q.query, max_results=q.max_results
+         )
+         papers = plist.search(q.query)[: q.max_results]
+         ret["ok"] = True
+         ret["msg"] = f"#candidates: {len(plist.papers)}"
+         ret["cand"] = len(plist.papers)
+
+     if papers:
+         papers = [p.as_dict() for p in papers]
+         ret["paper"] = len(papers)
+         if q.return_content:
+             ret["content"] = papers
+         else:
+             _, result_filepath = tempfile.mkstemp(
+                 prefix=f"{q.method}.search.", suffix=".json", text=True
+             )
+             ret["url"] = result_filepath
+             ret["token"] = get_uid()
+             cache = {
+                 "token": ret["token"],
+                 "url": ret["url"],
+                 "content": papers,
+             }
+             dump_json(cache, result_filepath)
+
+     if REMOVE_CACHE and q.method != "aclanthology":
+         os.remove(cache_filepath)
+
+     logger.info(
+         (
+             f"m: {q.method}, q: {q.query}, cands: {len(plist.papers)},"
+             f" max: {q.max_results}, #papers: {len(papers)}, cache: {cache_filepath}"
+             f" ret.url: {ret.get('url', '')}"
+         )
+     )
+
+     return ret
+
+
+ @app.get("/download/")
+ async def download(u: str, t: str):  # noqa: F811
+     logger.info(f"{u=}, {t=}")
+     results_filepath = pathlib.Path(u)
+     token = t
+     if results_filepath.exists():
+         data = load_json(results_filepath)
+         if data["token"] == token:
+             filename = results_filepath.name
+             prefix, _, middle, suffix = filename.split(".")
+             _, download_filepath = tempfile.mkstemp(
+                 prefix=f"{prefix}.download.", suffix=".json"
+             )
+             dump_json(data["content"], download_filepath, indent=2)
+             logger.info(f"Download: from {u} to {download_filepath}")
+             return FileResponse(download_filepath, filename=f"{prefix}.json")
+     return {"ok": False, "msg": "file not exist or token mismatch"}
+
+
+ if __name__ == "__main__":
+     log_config = uvicorn.config.LOGGING_CONFIG
+     log_config["formatters"]["access"]["fmt"] = (
+         "%(asctime)s | " + log_config["formatters"]["access"]["fmt"]
+     )
+     log_config["formatters"]["default"]["fmt"] = (
+         "%(asctime)s | " + log_config["formatters"]["default"]["fmt"]
+     )
+     uvicorn.run(
+         "server:app",
+         host="0.0.0.0",
+         port=7860,
+         log_level="debug",
+         log_config=log_config,
+         reload=False,
+     )
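
The two-step flow above (search, then exchange the returned `url` and `token` at `/download/`) can be exercised end to end; a minimal sketch, assuming a local server on port 7860:

```python
# Hedged sketch of the search-then-download round trip; the host, port,
# and query are illustrative assumptions.
import requests

base = "http://127.0.0.1:7860"
search = requests.post(
    f"{base}/api/",
    json={
        "method": "dblp",
        "query": {"title": [["event extraction"]]},
        "max_results": 50,
        "return_content": False,
    },
    timeout=600,
).json()
if search["ok"] and search["url"]:
    # `u` is the server-side temp file, `t` must match the stored token
    blob = requests.get(
        f"{base}/download/", params={"u": search["url"], "t": search["token"]}
    )
    with open("dblp.json", "wb") as fout:
        fout.write(blob.content)
```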
src/engine.py CHANGED
@@ -2,35 +2,52 @@ from src.interfaces import Paper


  class SearchAPI:
-     # fmt: off
-     SEARCH_PRIORITY = ["doi", "url", "year", "month", "venue", "authors", "title", "abstract"]
-     # fmt: on
+     SEARCH_PRIORITY = ["year", "month", "venue", "author", "title", "abstract"]

      def __init__(self) -> None:
          self.papers: list[Paper] = []

      def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
          """Exhausted search papers by matching query"""
+         def _in_string(statement, string):
+             stmt_in_string = False
+             if " " in statement and statement.lower() in string.lower():
+                 stmt_in_string = True
+             else:
+                 tokens = self.tokenize(string.lower())
+                 if statement.lower() in tokens:
+                     stmt_in_string = True
+             return stmt_in_string
+
          papers = self.papers
          for field in self.SEARCH_PRIORITY:
              if field in query:
                  req = query[field]
+                 time_spans = []
+                 if field in ["year", "month"]:
+                     for span in req:
+                         assert len(span) == 2
+                         assert all(num.isdigit() for num in span)
+                         time_spans.append((int(span[0]), int(span[1])))
+
                  paper_indices = []
                  for i, p in enumerate(papers):
-                     for or_conditions in req:
-                         matched = True
-                         for and_cond_string in or_conditions:
-                             if " " in and_cond_string:
-                                 if not and_cond_string.lower() in p[field].lower():
-                                     matched = False
-                                     break
-                             else:
-                                 p_field = self.tokenize(p[field].lower())
-                                 if not and_cond_string.lower() in p_field:
-                                     matched = False
-                                     break
-                         if matched:
-                             paper_indices.append(i)
+                     matched = False
+                     if time_spans:
+                         if any(s <= p[field] <= e for s, e in time_spans):
+                             matched = True
+                     else:
+                         if any(
+                             all(
+                                 _in_string(stmt, p[field])
+                                 for stmt in and_statements
+                             )
+                             for and_statements in req
+                         ):
+                             matched = True
+
+                     if matched:
+                         paper_indices.append(i)
                  papers = [papers[i] for i in paper_indices]

          return papers
@@ -68,3 +85,11 @@ class SearchAPI:

      def tokenize(self, string: str) -> list[str]:
          return string.lower().split()
+
+     @classmethod
+     def build_paper_list(cls, *args, **kwargs):
+         raise NotImplementedError
+
+     @classmethod
+     def build_and_search(cls, *args, **kwargs) -> list[Paper]:
+         raise NotImplementedError
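
To make the rewritten matching concrete: a multi-word AND term matches as a substring, a single token must appear in the tokenized field, and the outer lists are OR groups. A small illustrative check (the positional field order is assumed from `Paper.as_tuple`, and the paper data is made up):

```python
# Illustrative only: exercises the new exhausted_search matching rules.
from src.engine import SearchAPI
from src.interfaces import Paper

api = SearchAPI()
api.papers = [
    # assumed positional order from Paper.as_tuple(): title, author,
    # abstract, url, doi, venue, year, month
    Paper("Span-based joint event extraction", "A. Author", "", "", "", "ACL", 2020, 7),
]
# outer list = OR groups, inner list = AND terms
query = {"title": [["span-based", "event extraction"], ["NER"]]}
matched = api.exhausted_search(query)
print(len(matched))  # 1: the first AND group matches the title
```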
src/interfaces/__init__.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
  @dataclass
  class Paper:
      title: str
-     authors: str  # People Name1, People Name2: split by `, `
+     author: str  # People Name1, People Name2: split by `, `
      abstract: str
      url: str
      doi: str
@@ -15,21 +15,25 @@ class Paper:
      def as_dict(self):
          return {
              "title": self.title,
-             "author": self.authors,
+             "author": self.author,
              "abstract": self.abstract,
              "url": self.url,
              "doi": self.doi,
              "venue": self.venue,
+             "year": self.year,
+             "month": self.month,
          }

      def as_tuple(self) -> tuple:
          return (
              self.title,
-             self.authors,
+             self.author,
              self.abstract,
              self.url,
              self.doi,
              self.venue,
+             self.year,
+             self.month,
          )

      def __getitem__(self, attr_key: str):
src/interfaces/aclanthology.py CHANGED
@@ -46,3 +46,14 @@ class AclanthologyPaperList(SearchAPI):
              full_name = f"{name['first']} {name['last']}"

          return full_name
+
+     @classmethod
+     def build_paper_list(cls, cache_filepath: str):
+         return cls(cache_filepath)
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = -1
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath)
+         return obj.search(query)[:max_results]
src/interfaces/arxiv.py CHANGED
@@ -143,3 +143,47 @@ class ArxivPaperList(SearchAPI):
                  str(date.tm_mon),
              )
              self.papers.append(paper)
+
+     @staticmethod
+     def build_logic_string(req: list[list[str]]) -> str:
+         if not req:
+             return ""
+
+         tmp_strings = []
+         for and_strs in req:
+             tmp_strings.append(f"({' AND '.join(and_strs)})")
+         logic_string = " OR ".join(tmp_strings)
+         return logic_string
+
+     @classmethod
+     def build_paper_list(
+         cls, cache_filepath: str, query: dict, max_results: int = 5000
+     ):
+         title = query.get("title", [])
+         ti_string = cls.build_logic_string(title)
+         author = query.get("author", [])
+         au_string = cls.build_logic_string(author)
+         abstract = query.get("abstract", [])
+         abs_string = cls.build_logic_string(abstract)
+         venue = query.get("venue", [])
+         # only subject category is used when caching
+         if venue:
+             cat_string = venue[0]
+         else:
+             cat_string = ""
+         return cls(
+             cache_filepath,
+             use_cache=False,
+             title=ti_string,
+             author=au_string,
+             abstract=abs_string,
+             category=cat_string,
+             max_results=max_results,
+         )
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = -1
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+         return obj.search(query)[:max_results]
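
`build_logic_string` translates the nested-list query format into arXiv's boolean query syntax, for example:

```python
from src.interfaces.arxiv import ArxivPaperList

req = [["span-based", "event extraction"], ["NER"]]
print(ArxivPaperList.build_logic_string(req))
# -> (span-based AND event extraction) OR (NER)
```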
src/interfaces/dblp.py CHANGED
@@ -2,6 +2,7 @@ import pathlib
  import random
  import re
  import time
+ import logging

  import requests
  from tqdm import trange
@@ -11,6 +12,9 @@ from src.interfaces import Paper
  from src.utils import dump_json, load_json


+ logger = logging.getLogger("uvicorn.default")
+
+
  class DblpPaperList(SearchAPI):
      """DBLP paper list

@@ -34,8 +38,8 @@ class DblpPaperList(SearchAPI):
          cache_filepath: pathlib.Path,
          use_cache: bool = False,
          query: str = "",
-         max_results: int = 1000,
-         request_time_inteval: float = 5,
+         max_results: int = 5000,
+         request_time_inteval: float = 3,
      ) -> None:
          super().__init__()

@@ -62,7 +66,8 @@ class DblpPaperList(SearchAPI):
                  break
              except KeyboardInterrupt:
                  raise KeyboardInterrupt
-             except Exception:
+             except Exception as err:
+                 logger.info(err)
                  break
              time.sleep((random.random() + 0.5) * request_time_inteval)
          dump_json(searched_results, cache_filepath)
@@ -95,3 +100,29 @@ class DblpPaperList(SearchAPI):
              "99",
          )
          self.papers.append(paper)
+
+     @classmethod
+     def build_paper_list(
+         cls, cache_filepath: str, query: dict, max_results: int = 1000
+     ):
+         title = query.get("title", [])
+         abstract = query.get("abstract", [])
+
+         cls_q = ""
+         for t in title:
+             cls_q += " ".join(t)
+         for a in abstract:
+             cls_q += " ".join(a)
+         return cls(
+             cache_filepath,
+             use_cache=False,
+             query=cls_q,
+             max_results=max_results,
+         )
+
+     @classmethod
+     def build_and_search(
+         cls, cache_filepath: str, query: dict, max_results: int = 1000
+     ) -> list[Paper]:
+         obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+         return obj.search(query)[:max_results]
src/utils.py CHANGED
@@ -117,9 +117,9 @@ def load_json(filepath: pathlib.Path) -> dict | list:
      return data


- def dump_json(data: list | dict, filepath: str | pathlib.Path):
+ def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
      with open(filepath, "wt", encoding="utf8") as fout:
-         json.dump(data, fout, ensure_ascii=False)
+         json.dump(data, fout, ensure_ascii=False, **kwargs)


  def load_jsonlines(filepath, **kwargs):
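
With `**kwargs` forwarded to `json.dump`, callers can now control serialization; the `/download/` endpoint uses this to pretty-print results:

```python
from src.utils import dump_json

data = {"papers": []}
dump_json(data, "compact.json")           # compact, as before
dump_json(data, "pretty.json", indent=2)  # what server.py's download() does
```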