- add `build_paper_list` and `build_and_search` methods to help build demo (direct API)
- change `Paper.authors` into `Paper.author`, fix author searching
- add `**kwargs` in `dump_json`
- support `month` and `year` searching
- use cached acl requirements
- add cached files cleaning script
- add huggingface spaces support
- demo support (vuejs, fastapi and uvicorn)
- dockerize app
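
The new direct API can be exercised without the web demo; a minimal sketch (assuming the repo root is on `PYTHONPATH` and the ACL Anthology cache has been built — see the diffs below for the actual signatures):

```python
# Sketch of the direct API added in this commit. The query format is the
# same nested OR-of-AND lists used in run.py; `year` spans are closed
# intervals given as [start, end].
from src.interfaces.aclanthology import AclanthologyPaperList

query = {
    "title": [["event extraction"]],
    "year": [["2018", "2022"]],
}
papers = AclanthologyPaperList.build_and_search(
    "cache/aclanthology.json", query, max_results=100
)
```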
- .github/workflows/hf_spaces.yml +19 -0
- Dockerfile +25 -0
- README.md +14 -0
- docker-compose.yml +12 -0
- index.html +356 -0
- requirements.txt +3 -1
- run.py +14 -0
- scripts/clean_tmp.sh +3 -0
- scripts/get_aclanthology.sh +3 -1
- server.py +153 -0
- src/engine.py +42 -17
- src/interfaces/__init__.py +7 -3
- src/interfaces/aclanthology.py +11 -0
- src/interfaces/arxiv.py +44 -0
- src/interfaces/dblp.py +34 -3
- src/utils.py +2 -2
.github/workflows/hf_spaces.yml
ADDED
@@ -0,0 +1,19 @@
+name: Sync to Hugging Face hub
+on:
+  push:
+    branches: [main]
+
+  # to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  sync-to-hub:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Push to hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: git push https://Spico:$HF_TOKEN@huggingface.co/spaces/Spico/paper-hero main
Dockerfile
ADDED
@@ -0,0 +1,25 @@
+FROM python:3.10-slim
+
+# Install cron and git
+RUN apt-get update
+RUN apt-get -y install cron git
+
+# prepare scripts
+WORKDIR /app/
+COPY ./requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY ./scripts/ /app/scripts/
+RUN bash scripts/get_aclanthology.sh
+COPY ./src/ /app/src/
+COPY ./index.html /app/index.html
+COPY ./server.py /app/server.py
+
+# Add the cron jobs
+RUN crontab -l | { cat; echo "*/10 * * * * bash /app/scripts/clean_tmp.sh"; } | crontab -
+RUN crontab -l | { cat; echo "0 0 * * * bash /app/scripts/get_aclanthology.sh"; } | crontab -
+
+# start cron and the service on container startup
+# (only the last CMD in a Dockerfile takes effect, so cron and the
+#  server are launched together rather than via two separate CMDs)
+EXPOSE 7860
+CMD cron && python -u server.py
README.md
CHANGED
@@ -1,3 +1,15 @@
+---
+title: Paper Hero
+emoji: 💪
+colorFrom: indigo
+colorTo: yellow
+sdk: docker
+app_port: 7860
+pinned: true
+license: apache-2.0
+---
+
+
 # 💪 Paper Hero
 
 A toolkit to help search for papers from aclanthology, arXiv and dblp.
@@ -60,3 +72,5 @@ if __name__ == "__main__":
 - [x] aclanthology
 - [x] arXiv
 - [x] dblp
+- [x] add frontend support for building a demo
+- [x] year and month searching
docker-compose.yml
ADDED
@@ -0,0 +1,12 @@
+version: "3"
+services:
+  paper_hero_api:
+    build: .
+    container_name: paper_hero
+    ports:
+      - 127.0.0.1:7860:7860
+    volumes:
+      - .:/app
+      - phero_tmp:/tmp
+volumes:
+  phero_tmp:
index.html
ADDED
@@ -0,0 +1,356 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+  <meta charset="UTF-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>paper-hero</title>
+  <link rel="stylesheet" href="https://unpkg.com/boltcss/bolt.min.css">
+  <script type="importmap">
+    {
+      "imports": {
+        "vue": "https://unpkg.com/vue@3/dist/vue.esm-browser.js"
+      }
+    }
+  </script>
+  <style>
+    body {
+      max-width: 800px;
+      margin: 40px auto;
+      padding: 0 20px;
+    }
+
+    .form-group {
+      display: flex;
+      flex-direction: row;
+      justify-content: flex-start;
+      align-items: center;
+    }
+
+    label {
+      margin-right: 1rem;
+    }
+
+    button {
+      margin: 0.2rem 0.2rem;
+    }
+
+    button:hover {
+      background-color: #dbdbdb;
+    }
+
+    footer {
+      text-align: center;
+      margin-top: 2rem;
+    }
+
+    .button-group {
+      margin-top: 3rem;
+    }
+
+    .search-button {
+      background-color: #ffc83d;
+      color: #d67d00;
+      font-weight: bold;
+    }
+
+    .search-button:hover {
+      background-color: #ffc83dc0;
+    }
+
+    .download-button {
+      background-color: #98ca56;
+      color: white;
+      font-weight: bold;
+    }
+
+    .download-button:hover {
+      background-color: #98ca56d1;
+    }
+
+    .output-title {
+      margin-top: 2rem;
+      margin-bottom: 0;
+      display: block;
+      background-color: #98ca56;
+      color: white;
+      font-weight: bold;
+      font-size: large;
+      padding: 6px 15px;
+      border-top-left-radius: 6px;
+      border-top-right-radius: 6px;
+    }
+
+    .output-box {
+      margin-top: 0;
+      padding: 6px 15px;
+      background-color: white;
+      border: 2px solid #98ca56;
+      border-bottom-left-radius: 6px;
+      border-bottom-right-radius: 6px;
+    }
+  </style>
+</head>
+
+<body>
+  <header>
+    <h1>💪 Paper Hero</h1>
+    <p>
+      Paper Hero is a toolkit to help search for papers from aclanthology, arXiv and dblp.
+    </p>
+    <p>GitHub Address: <a href="https://github.com/Spico197/paper-hero" target="_blank">Spico197/paper-hero</a></p>
+  </header>
+
+  <main>
+    <div id="app">
+      <div class="form-group">
+        <label for="method"><strong>Source</strong></label>
+        <select id="method" v-model="method">
+          <option value="" disabled>Please select a source</option>
+          <option value="aclanthology">ACL Anthology</option>
+          <option value="arxiv">ArXiv</option>
+          <option value="dblp">DBLP</option>
+        </select>
+      </div>
+
+      <div>
+        <label for="max-res"><strong>Max Results</strong></label>
+        <input id="max-res" type="number" v-model="maxResults">
+      </div>
+
+      <div class="form-group">
+        <label for="add-field"><strong>New Field</strong></label>
+        <select id="add-field" v-model="addField">
+          <option value="" disabled>Please select a field</option>
+          <option :value="field" v-for="field in restFields">{{ field }}</option>
+        </select>
+        <button @click.prevent="addNewField">Add</button>
+      </div>
+
+      <hr>
+
+      <div>
+        <p><strong>Fields</strong></p>
+        <p>
+          Add <code>&&</code> to represent <code>AND</code> logic, e.g. <code>span-based && event extraction</code>
+          means <em>span-based</em> and <em>event extraction</em> both appear in a field.
+        </p>
+        <p>
+          For <code>year</code> and <code>month</code> fields, the query should
+          follow the <code>start && end</code> format,
+          e.g. year <code>2006 && 2013</code> means searching for papers published
+          between <code>2006</code> and <code>2013</code>.
+        </p>
+        <div v-for="(groups, field) in query">
+          <label :for="field"><strong>{{ field }}</strong></label>
+          <div v-for="(group, index) in groups">
+            <input class="field-input" type="text" v-model="query[field][index]" placeholder="text1 && text2 && text3"
+              size="50">
+            <button @click.prevent="rmAnd(field, index)">X</button>
+          </div>
+          <button @click.prevent="addOr(field)">OR</button>
+        </div>
+      </div>
+
+      <div v-if="timerHandler">
+        <p>⏱️ {{ searchSecondsTwoDecimal }}</p>
+      </div>
+
+      <div v-if="output">
+        <p class="output-title">Output Info</p>
+        <p class="output-box">
+          {{ output }}
+          <br>
+          You are ready to download the results by clicking the download button below.
+          <br>
+          Like this tool? ⭐ me on <a href="https://github.com/Spico197/paper-hero" target="_blank">GitHub</a> !
+        </p>
+      </div>
+
+      <div class="button-group">
+        <button @click.prevent="resetQuery">Reset</button>
+        <button class="search-button" @click.prevent="search">Search</button>
+        <a :href="downloadHref" :download="`${method}.json`" v-if="downloadHref">
+          <button class="search-button download-button">Download</button>
+        </a>
+      </div>
+
+    </div>
+  </main>
+
+  <footer>
+    <hr>
+    Made by <a href="https://spico197.github.io" target="_blank">Tong Zhu</a> w/ 💖
+  </footer>
+
+  <script type="module">
+    import { createApp, ref, computed, toRaw, watch } from 'vue'
+
+    createApp(
+      {
+        setup() {
+          const method = ref("aclanthology")
+          const query = ref({ title: [[]] })
+          const maxResults = ref(2000)
+          const addField = ref("")
+          const allFields = ["title", "author", "abstract", "venue", "year", "month"]
+          const downloadUrl = ref('')
+          const downloadToken = ref('')
+          const downloadHref = ref('')
+          const output = ref('')
+          const timerHandler = ref(0)
+          const searchSeconds = ref(0.0)
+          const searchSecondsTwoDecimal = computed(() => {
+            return `${searchSeconds.value.toFixed(1)}s`
+          })
+          const restFields = computed(() => {
+            let rest = []
+            for (const field of allFields) {
+              if (!(field in query.value)) {
+                rest.push(field)
+              }
+            }
+            return rest
+          })
+
+          function addNewField() {
+            if (addField.value) {
+              query.value[addField.value] = [[]]
+              addField.value = ""
+            }
+          }
+
+          function rmAnd(field, index) {
+            if (query.value[field].length == 1) {
+              delete query.value[field]
+            } else {
+              query.value[field].splice(index, 1)
+            }
+          }
+
+          function addOr(field) {
+            query.value[field].push([])
+          }
+
+          function resetOutput() {
+            output.value = ""
+            downloadUrl.value = ""
+            downloadToken.value = ""
+            URL.revokeObjectURL(downloadHref.value)
+            downloadHref.value = ""
+            searchSeconds.value = 0.0
+            timerHandler.value = 0
+            // searchSecondsTwoDecimal is computed from searchSeconds, so it resets with it
+          }
+
+          function resetQuery() {
+            query.value = { title: [[]] }
+            resetOutput()
+          }
+
+          function startTimer() {
+            console.log("start")
+            timerHandler.value = setInterval(() => {
+              searchSeconds.value += 0.1
+            }, 100)
+          }
+
+          function endTimer() {
+            console.log("end")
+            if (timerHandler.value > 0) {
+              console.log("endi")
+              clearInterval(timerHandler.value)
+            }
+          }
+
+          function search() {
+            resetOutput()
+            startTimer()
+            let q = {}
+            for (const prop in query.value) {
+              q[prop] = []
+              for (let i = 0; i < query.value[prop].length; i++) {
+                if (query.value[prop][i].length > 0) {
+                  let andString = toRaw(query.value[prop][i])
+                  let andStrings = andString.split('&&')
+                  for (let j = 0; j < andStrings.length; j++) {
+                    andStrings[j] = andStrings[j].trim()
+                  }
+                  q[prop].push(andStrings)
+                }
+              }
+              if (q[prop].length < 1) {
+                delete q[prop]
+              }
+            }
+            const postData = JSON.stringify({
+              "method": method.value,
+              "query": q,
+              "max_results": maxResults.value,
+              "return_content": false,
+            })
+            fetch(
+              "/api/",
+              {
+                method: "POST",
+                headers: {
+                  'Content-Type': 'application/json',
+                },
+                body: postData,
+              }
+            )
+              .then((response) => response.json())
+              .then((json) => {
+                if (json["ok"] === false) {
+                  alert(json["msg"])
+                } else {
+                  downloadUrl.value = json["url"]
+                  downloadToken.value = json["token"]
+                  output.value = `${json["msg"]}, #matched paper: ${json["paper"]}`
+                }
+              })
+              .catch((err) => { alert(err) })
+              .finally(() => endTimer())
+          }
+
+          watch(downloadUrl, (newUrl, oldUrl) => {
+            if (downloadToken.value) {
+              fetch(
+                `/download/?u=${downloadUrl.value}&t=${downloadToken.value}`,
+                {
+                  method: "GET",
+                }
+              )
+                .then((response) => response.blob())
+                .then((data) => {
+                  downloadHref.value = URL.createObjectURL(data)
+                })
+            }
+          })
+
+          return {
+            method,
+            query,
+            restFields,
+            addField,
+            addNewField,
+            search,
+            rmAnd,
+            addOr,
+            resetQuery,
+            maxResults,
+            output,
+            downloadUrl,
+            downloadHref,
+            searchSeconds,
+            timerHandler,
+            searchSecondsTwoDecimal,
+          }
+        }
+      }
+    ).mount("#app")
+  </script>
+</body>
+
+</html>
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
 tqdm>=4.64.1
 requests>=2.28.1
 feedparser>=6.0.10
+fastapi>=0.88.0
+uvicorn>=0.20.0
run.py
CHANGED
@@ -9,6 +9,7 @@ from src.utils import (
 if __name__ == "__main__":
     # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
     acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+    # `ee_query` is an example, and you don't have to fill all the fields
     ee_query = {
         "title": [
             ["information extraction"],
@@ -30,6 +31,19 @@ if __name__ == "__main__":
             ["tacl"],
             ["cl"],
         ],
+        "author": [
+            ["Heng Ji"],
+            ["Dan Roth"],
+        ],
+        "year": [
+            # multiple time spans with closed interval: ["2006", "2013"] means 2006-2013
+            ["2006", "2013"],
+            ["2018", "2022"],
+        ],
+        "month": [
+            # the same as the `year` field
+            ["4", "11"],
+        ]
     }
     ee_papers = acl_paper_list.search(ee_query)
     dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
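
Each hit returned by `search` is a `Paper` dataclass, and `as_dict()` now carries the new `author`, `year`, and `month` fields; a small consumption sketch (the print formatting is illustrative):

```python
# `author` is a ", "-joined name string; `year` and `month` are strings.
for p in ee_papers[:3]:
    d = p.as_dict()
    print(d["year"], d["month"], d["author"], "-", d["title"])
```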
scripts/clean_tmp.sh
ADDED
@@ -0,0 +1,3 @@
+find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/arxiv\.cache\..*?\.xml' -delete
+find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/dblp\.cache\..*?\.json' -delete
+find /tmp -maxdepth 1 -type f -mmin +10 -type f -regextype posix-extended -regex '\/tmp\/(aclanthology|arxiv|dblp)\.(search|download)\..*?\.json' -delete
scripts/get_aclanthology.sh
CHANGED
@@ -1,3 +1,5 @@
+set -ex
+
 mkdir cache
 cd cache
 if ! [ -f acl-anthology/bin/anthology/anthology.py ]; then
@@ -9,7 +11,7 @@ else
 fi
 cd acl-anthology/bin
 
-pip install -r requirements.txt
+pip install --no-cache-dir -r ./requirements.txt
 
 python -c '
 import json
server.py
ADDED
@@ -0,0 +1,153 @@
+import logging
+import os
+import uuid
+import tempfile
+import pathlib
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+
+from src.interfaces.aclanthology import AclanthologyPaperList
+from src.interfaces.arxiv import ArxivPaperList
+from src.interfaces.dblp import DblpPaperList
+from src.utils import dump_json, load_json
+
+
+class SearchQuery(BaseModel):
+    method: str
+    query: dict
+    max_results: int = 1000
+    return_content: bool = False
+
+
+REMOVE_CACHE = False
+ACL_CACHE_FILEPATH = "./cache/aclanthology.json"
+app = FastAPI()
+logger = logging.getLogger("uvicorn.default")
+
+
+def get_uid():
+    return uuid.uuid4().urn.split(":")[-1]
+
+
+@app.get("/")
+async def api():
+    return FileResponse("./index.html", media_type="text/html")
+
+
+@app.post("/api/")
+async def api(q: SearchQuery):  # noqa: F811
+    ret = {
+        "ok": False,
+        "cand": 0,
+        "paper": 0,
+        "url": "",
+        "token": "",
+        "msg": "",
+        "content": [],
+    }
+    if q.method not in ["aclanthology", "arxiv", "dblp"]:
+        ret["msg"] = f"{q.method} method not supported"
+        return ret
+
+    papers = []
+    cache_filepath = ""
+    if q.method == "aclanthology":
+        cache_filepath = ACL_CACHE_FILEPATH
+        plist = AclanthologyPaperList.build_paper_list(ACL_CACHE_FILEPATH)
+        papers = plist.search(q.query)[: q.max_results]
+        ret["ok"] = True
+        ret["msg"] = f"#candidates: {len(plist.papers)}"
+        ret["cand"] = len(plist.papers)
+    elif q.method == "arxiv":
+        _, cache_filepath = tempfile.mkstemp(
+            prefix="arxiv.cache.", suffix=".xml", text=True
+        )
+        plist = ArxivPaperList.build_paper_list(
+            cache_filepath, q.query, max_results=q.max_results
+        )
+        papers = plist.search(q.query)[: q.max_results]
+        ret["ok"] = True
+        ret["msg"] = f"#candidates: {len(plist.papers)}"
+        ret["cand"] = len(plist.papers)
+    elif q.method == "dblp":
+        _, cache_filepath = tempfile.mkstemp(
+            prefix="dblp.cache.", suffix=".json", text=True
+        )
+        plist = DblpPaperList.build_paper_list(
+            cache_filepath, q.query, max_results=q.max_results
+        )
+        papers = plist.search(q.query)[: q.max_results]
+        ret["ok"] = True
+        ret["msg"] = f"#candidates: {len(plist.papers)}"
+        ret["cand"] = len(plist.papers)
+
+    if papers:
+        papers = [p.as_dict() for p in papers]
+        ret["paper"] = len(papers)
+        if q.return_content:
+            ret["content"] = papers
+        else:
+            _, result_filepath = tempfile.mkstemp(
+                prefix=f"{q.method}.search.", suffix=".json", text=True
+            )
+            ret["url"] = result_filepath
+            ret["token"] = get_uid()
+            cache = {
+                "token": ret["token"],
+                "url": ret["url"],
+                "content": papers,
+            }
+            dump_json(cache, result_filepath)
+
+    if REMOVE_CACHE and q.method != "aclanthology":
+        os.remove(cache_filepath)
+
+    logger.info(
+        (
+            f"m: {q.method}, q: {q.query}, cands: {len(plist.papers)},"
+            f" max: {q.max_results}, #papers: {len(papers)}, cache: {cache_filepath}"
+            f" ret.url: {ret.get('url', '')}"
+        )
+    )
+
+    return ret
+
+
+@app.get("/download/")
+async def download(u: str, t: str):  # noqa: F811
+    logger.info(f"{u=}, {t=}")
+    results_filepath = pathlib.Path(u)
+    token = t
+    if results_filepath.exists():
+        data = load_json(results_filepath)
+        if data["token"] == token:
+            filename = results_filepath.name
+            prefix, _, middle, suffix = filename.split(".")
+            _, download_filepath = tempfile.mkstemp(
+                prefix=f"{prefix}.download.", suffix=".json"
+            )
+            dump_json(data["content"], download_filepath, indent=2)
+            logger.info(f"Download: from {u} to {download_filepath}")
+            return FileResponse(download_filepath, filename=f"{prefix}.json")
+    return {"ok": False, "msg": "file not exist or token mismatch"}
+
+
+if __name__ == "__main__":
+    log_config = uvicorn.config.LOGGING_CONFIG
+    log_config["formatters"]["access"]["fmt"] = (
+        "%(asctime)s | " + log_config["formatters"]["access"]["fmt"]
+    )
+    log_config["formatters"]["default"]["fmt"] = (
+        "%(asctime)s | " + log_config["formatters"]["default"]["fmt"]
+    )
+    uvicorn.run(
+        "server:app",
+        host="0.0.0.0",
+        port=7860,
+        log_level="debug",
+        log_config=log_config,
+        reload=False,
+    )
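
A minimal client sketch for the new endpoint (assumes the server is reachable on localhost:7860, the port exposed in the Dockerfile; the payload mirrors the `SearchQuery` model above):

```python
import requests  # already a project dependency

resp = requests.post(
    "http://localhost:7860/api/",
    json={
        "method": "dblp",
        "query": {"title": [["event extraction"]]},
        "max_results": 50,
        "return_content": True,  # inline the papers instead of a download token
    },
    timeout=120,
)
body = resp.json()
print(body["ok"], body["cand"], body["paper"])
```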
src/engine.py
CHANGED
@@ -2,35 +2,52 @@ from src.interfaces import Paper
 
 
 class SearchAPI:
-
-    SEARCH_PRIORITY = ["doi", "url", "year", "month", "venue", "authors", "title", "abstract"]
-    # fmt: on
+    SEARCH_PRIORITY = ["year", "month", "venue", "author", "title", "abstract"]
 
     def __init__(self) -> None:
         self.papers: list[Paper] = []
 
     def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
         """Exhausted search papers by matching query"""
+        def _in_string(statement, string):
+            stmt_in_string = False
+            if " " in statement and statement.lower() in string.lower():
+                stmt_in_string = True
+            else:
+                tokens = self.tokenize(string.lower())
+                if statement.lower() in tokens:
+                    stmt_in_string = True
+            return stmt_in_string
+
         papers = self.papers
         for field in self.SEARCH_PRIORITY:
            if field in query:
                 req = query[field]
+                time_spans = []
+                if field in ["year", "month"]:
+                    for span in req:
+                        assert len(span) == 2
+                        assert all(num.isdigit() for num in span)
+                        time_spans.append((int(span[0]), int(span[1])))
+
                 paper_indices = []
                 for i, p in enumerate(papers):
-                    ...
+                    matched = False
+                    if time_spans:
+                        if any(s <= p[field] <= e for s, e in time_spans):
+                            matched = True
+                    else:
+                        if any(
+                            all(
+                                _in_string(stmt, p[field])
+                                for stmt in and_statements
+                            )
+                            for and_statements in req
+                        ):
+                            matched = True
+
+                    if matched:
+                        paper_indices.append(i)
                 papers = [papers[i] for i in paper_indices]
 
         return papers
@@ -68,3 +85,11 @@ class SearchAPI:
 
     def tokenize(self, string: str) -> list[str]:
         return string.lower().split()
+
+    @classmethod
+    def build_paper_list(cls, *args, **kwargs):
+        raise NotImplementedError
+
+    @classmethod
+    def build_and_search(cls, *args, **kwargs) -> list[Paper]:
+        raise NotImplementedError
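
Spelled out, the query structure `exhausted_search` consumes (a data-only sketch; the values follow the `run.py` example above):

```python
# Outer lists are OR-ed, inner lists are AND-ed; for `year`/`month`,
# each inner list is instead a closed [start, end] time span.
query = {
    "title": [
        ["span-based", "event extraction"],  # both must match a title
        ["document-level"],                  # OR this one
    ],
    "year": [["2006", "2013"], ["2018", "2022"]],
}
```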
src/interfaces/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 @dataclass
 class Paper:
     title: str
-    authors: str
+    author: str  # People Name1, People Name2: split by `, `
     abstract: str
     url: str
     doi: str
@@ -15,21 +15,25 @@ class Paper:
     def as_dict(self):
         return {
             "title": self.title,
-            "author": self.authors,
+            "author": self.author,
             "abstract": self.abstract,
             "url": self.url,
             "doi": self.doi,
             "venue": self.venue,
+            "year": self.year,
+            "month": self.month,
         }
 
     def as_tuple(self) -> tuple:
         return (
             self.title,
-            self.authors,
+            self.author,
             self.abstract,
             self.url,
             self.doi,
             self.venue,
+            self.year,
+            self.month,
         )
 
     def __getitem__(self, attr_key: str):
src/interfaces/aclanthology.py
CHANGED
@@ -46,3 +46,14 @@ class AclanthologyPaperList(SearchAPI):
             full_name = f"{name['first']} {name['last']}"
 
         return full_name
+
+    @classmethod
+    def build_paper_list(cls, cache_filepath: str):
+        return cls(cache_filepath)
+
+    @classmethod
+    def build_and_search(
+        cls, cache_filepath: str, query: dict, max_results: int = -1
+    ) -> list[Paper]:
+        obj = cls.build_paper_list(cache_filepath)
+        return obj.search(query)[:max_results]
src/interfaces/arxiv.py
CHANGED
@@ -143,3 +143,47 @@ class ArxivPaperList(SearchAPI):
             str(date.tm_mon),
         )
         self.papers.append(paper)
+
+    @staticmethod
+    def build_logic_string(req: list[list[str]]) -> str:
+        if not req:
+            return ""
+
+        tmp_strings = []
+        for and_strs in req:
+            tmp_strings.append(f"({' AND '.join(and_strs)})")
+        logic_string = " OR ".join(tmp_strings)
+        return logic_string
+
+    @classmethod
+    def build_paper_list(
+        cls, cache_filepath: str, query: dict, max_results: int = 5000
+    ):
+        title = query.get("title", [])
+        ti_string = cls.build_logic_string(title)
+        author = query.get("author", [])
+        au_string = cls.build_logic_string(author)
+        abstract = query.get("abstract", [])
+        abs_string = cls.build_logic_string(abstract)
+        venue = query.get("venue", [])
+        # only subject category is used when caching
+        if venue:
+            cat_string = venue[0]
+        else:
+            cat_string = ""
+        return cls(
+            cache_filepath,
+            use_cache=False,
+            title=ti_string,
+            author=au_string,
+            abstract=abs_string,
+            category=cat_string,
+            max_results=max_results,
+        )
+
+    @classmethod
+    def build_and_search(
+        cls, cache_filepath: str, query: dict, max_results: int = -1
+    ) -> list[Paper]:
+        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+        return obj.search(query)[:max_results]
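
What `build_logic_string` renders for a nested requirement (a quick sketch):

```python
from src.interfaces.arxiv import ArxivPaperList

req = [["span-based", "event extraction"], ["document-level"]]
print(ArxivPaperList.build_logic_string(req))
# -> (span-based AND event extraction) OR (document-level)
```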
src/interfaces/dblp.py
CHANGED
@@ -2,6 +2,7 @@ import pathlib
 import random
 import re
 import time
+import logging
 
 import requests
 from tqdm import trange
@@ -11,6 +12,9 @@ from src.interfaces import Paper
 from src.utils import dump_json, load_json
 
 
+logger = logging.getLogger("uvicorn.default")
+
+
 class DblpPaperList(SearchAPI):
     """DBLP paper list
 
@@ -34,8 +38,8 @@ class DblpPaperList(SearchAPI):
         cache_filepath: pathlib.Path,
         use_cache: bool = False,
         query: str = "",
-        max_results: int =
-        request_time_inteval: float =
+        max_results: int = 5000,
+        request_time_inteval: float = 3,
     ) -> None:
         super().__init__()
 
@@ -62,7 +66,8 @@ class DblpPaperList(SearchAPI):
                 break
             except KeyboardInterrupt:
                 raise KeyboardInterrupt
-            except Exception:
+            except Exception as err:
+                logger.info(err)
                 break
             time.sleep((random.random() + 0.5) * request_time_inteval)
         dump_json(searched_results, cache_filepath)
@@ -95,3 +100,29 @@ class DblpPaperList(SearchAPI):
             "99",
         )
         self.papers.append(paper)
+
+    @classmethod
+    def build_paper_list(
+        cls, cache_filepath: str, query: dict, max_results: int = 1000
+    ):
+        title = query.get("title", [])
+        abstract = query.get("abstract", [])
+
+        cls_q = ""
+        for t in title:
+            cls_q += " ".join(t)
+        for a in abstract:
+            cls_q += " ".join(a)
+        return cls(
+            cache_filepath,
+            use_cache=False,
+            query=cls_q,
+            max_results=max_results,
+        )
+
+    @classmethod
+    def build_and_search(
+        cls, cache_filepath: str, query: dict, max_results: int = 1000
+    ) -> list[Paper]:
+        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
+        return obj.search(query)[:max_results]
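
`build_paper_list` flattens the title and abstract groups into a single DBLP keyword string; note the groups are concatenated without a separator, so single-group queries behave most predictably. A sketch (the cache path is hypothetical):

```python
from src.interfaces.dblp import DblpPaperList

query = {"title": [["event", "extraction"]]}
# internally the keyword string sent to DBLP is "event extraction"
plist = DblpPaperList.build_paper_list("/tmp/dblp.cache.demo.json", query)
```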
src/utils.py
CHANGED
@@ -117,9 +117,9 @@ def load_json(filepath: pathlib.Path) -> dict | list:
     return data
 
 
-def dump_json(data: list | dict, filepath: str | pathlib.Path):
+def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
     with open(filepath, "wt", encoding="utf8") as fout:
-        json.dump(data, fout, ensure_ascii=False)
+        json.dump(data, fout, ensure_ascii=False, **kwargs)
 
 
 def load_jsonlines(filepath, **kwargs):
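
With the `**kwargs` pass-through, callers can forward any `json.dump` option; `server.py` uses this for pretty-printed downloads:

```python
from src.utils import dump_json

dump_json({"content": []}, "/tmp/example.json", indent=2)  # path is illustrative
```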