add dblp
Changed files:

- README.md +3 -3
- results/dblp-ee-paper-list.md +13 -0
- run.py +78 -41
- scripts/get_dblp.sh +3 -0
- src/interfaces/arxiv.py +16 -13
- src/interfaces/dblp.py +97 -0
README.md
CHANGED
```diff
@@ -20,7 +20,7 @@ from src.interfaces.aclanthology import AclanthologyPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
     paper_list = AclanthologyPaperList("cache/aclanthology.json")
     ee_query = {
         "title": [
@@ -51,5 +51,5 @@ if __name__ == "__main__":
 ## 🗺️ Roadmap
 
 - [x] aclanthology
-- [ ] arXiv
-- [ ] dblp
+- [x] arXiv
+- [x] dblp
```
results/dblp-ee-paper-list.md
ADDED
```diff
@@ -0,0 +1,13 @@
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [IJCAI, 2022] [Efficient Document-level Event Extraction via Pseudo-Trigger-aware Pruned Complete Graph.](https://doi.org/10.24963/ijcai.2022/632)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2018] [Scale Up Event Extraction Learning via Automatic Training Data Generation.](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16119)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI FALL SYMPOSIA, 2014] [Risk Event and Probability Extraction for Modeling Medical Risks.](http://www.aaai.org/ocs/index.php/FSS/FSS14/paper/view/9198)
+- [ ] [SIGIR, 2014] [An event extraction model based on timeline and user analysis in Latent Dirichlet allocation.](https://doi.org/10.1145/2600428.2609541)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [AAAI, 2008] [Combining Global Relevance Information with Local Contextual Clues for Event-Oriented Information Extraction.](http://www.aaai.org/Library/AAAI/2008/aaai08-321.php)
```
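Several titles appear twice above, most likely because the DBLP search returned more than one record for the same paper; the file is written verbatim by `dump_paper_list_to_markdown_checklist`. That helper lives in `src/utils` and is not part of this diff, so the sketch below of how one checklist line could be rendered is only an assumption; the `Paper` attribute names are inferred from the positional `Paper(...)` call in `src/interfaces/dblp.py`.

```python
# Hypothetical sketch of rendering one checklist line; the real helper
# (dump_paper_list_to_markdown_checklist in src/utils) is not shown in
# this commit, and the Paper attribute names are assumptions.
def to_checklist_line(paper) -> str:
    # e.g. "- [ ] [AAAI, 2008] [Combining Global Relevance ...](http://...)"
    return f"- [ ] [{paper.venue.upper()}, {paper.year}] [{paper.title}]({paper.url})"
```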
run.py
CHANGED
```diff
@@ -1,50 +1,52 @@
 from src.interfaces.aclanthology import AclanthologyPaperList
 from src.interfaces.arxiv import ArxivPaperList
+from src.interfaces.dblp import DblpPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    …(24 removed lines: the old aclanthology query block, truncated in this view)
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
+    acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+    ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    ee_papers = acl_paper_list.search(ee_query)
+    dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
 
-    …(16 removed lines: the old document-level query block, truncated in this view)
+    doc_query = {
+        "title": [
+            ["document-level"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    doc_papers = acl_paper_list.search(doc_query)
+    dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
 
+    # arxiv papers
     arxiv_paper_list = ArxivPaperList(
         "cache/ee-arxiv.xml",
         use_cache=True,
@@ -66,4 +68,39 @@ if __name__ == "__main__":
         ],
     }
     arxiv_ee_papers = arxiv_paper_list.search(arxiv_ee_query)
-    dump_paper_list_to_markdown_checklist(…
+    dump_paper_list_to_markdown_checklist(
+        arxiv_ee_papers, "results/arxiv-ee-paper-list.md"
+    )
+
+    # dblp papers
+    dblp_paper_list = DblpPaperList(
+        "./cache/dblp.json",
+        use_cache=True,
+        query="Event Extraction",
+    )
+    dblp_ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["aaai"],
+            ["ijcai"],
+            ["icml"],
+            ["iclr"],
+            ["nips"],
+            ["neurips"],
+            ["sigir"],
+            ["cvpr"],
+            ["iccv"],
+        ],
+    }
+    dblp_ee_papers = dblp_paper_list.search(dblp_ee_query)
+    dump_paper_list_to_markdown_checklist(
+        dblp_ee_papers, "results/dblp-ee-paper-list.md"
+    )
```
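The nested-list queries read as OR-of-ANDs: each inner list appears to be a group of keywords that must all occur, the outer list offers alternative groups, and every queried field (`title`, `venue`) must match. `SearchAPI.search` in `src.engine` is not part of this diff, so the matcher below is only a sketch of the assumed semantics, not the repository's implementation.

```python
# Assumed semantics of the query dicts above (src.engine.SearchAPI.search
# is not shown in this commit): inner lists are AND-ed keyword groups,
# the outer list OR-s the groups, and all queried fields must match.
def matches(field_text: str, alternatives: list[list[str]]) -> bool:
    text = field_text.lower()
    return any(all(kw in text for kw in group) for group in alternatives)

def search(papers, query):
    # attribute names like `title` and `venue` are assumptions here
    return [
        p for p in papers
        if all(matches(getattr(p, field), alts) for field, alts in query.items())
    ]
```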
scripts/get_dblp.sh
ADDED
```diff
@@ -0,0 +1,3 @@
+mkdir cache
+cd cache
+wget -cO ./dblp.xml.gz https://dblp.uni-trier.de/xml/dblp.xml.gz
```
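Here `wget -c` resumes a partially downloaded file and `-O` sets the output path. Note that this script fetches the full DBLP XML dump, whereas `DblpPaperList` below builds its own `cache/dblp.json` from the search API. For environments without wget, a rough Python equivalent (without the resume behavior of `-c`) might look like:

```python
# Rough Python stand-in for scripts/get_dblp.sh; no resume support.
import pathlib
import urllib.request

cache = pathlib.Path("cache")
cache.mkdir(exist_ok=True)
urllib.request.urlretrieve(
    "https://dblp.uni-trier.de/xml/dblp.xml.gz", cache / "dblp.xml.gz"
)
```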
src/interfaces/arxiv.py
CHANGED
```diff
@@ -1,10 +1,10 @@
-import re
 import pathlib
+import re
 
 import feedparser
 
-from src.interfaces import Paper
 from src.engine import SearchAPI
+from src.interfaces import Paper
 from src.utils import download
 
 
@@ -12,15 +12,15 @@ class ArxivPaperList(SearchAPI):
     """arXiv API
 
     Inputs:
-        cache_filepath:
+        cache_filepath: Filepath to save the cached file
         use_cache: will use cached file if `True`
-        raw:
-        title:
-        author:
-        abstract:
-        comment:
+        raw: Raw API query, e.g. `cat:cs.CL AND ti:event`. If set, the other fields are ignored
+        title: Title string to search for
+        author: Author string
+        abstract: Abstract string
+        comment: Comment string
         category: arXiv category, e.g. "cs.CL"
-        max_results:
+        max_results: Maximum number of papers to return
         sort_by: `submittedDate` (default) or `lastUpdatedDate`
         sort_order: `descending` (default) or `ascending`
 
@@ -51,6 +51,7 @@ class ArxivPaperList(SearchAPI):
     References:
         https://arxiv.org/help/api/user-manual#title_id_published_updated
     """
+
     API_URL = "https://export.arxiv.org/api/query?search_query="
 
     def __init__(
@@ -102,7 +103,7 @@ class ArxivPaperList(SearchAPI):
         query = query.strip().replace(" ", "+")
         query = query.replace("(", "%28")
         query = query.replace(")", "%29")
-        query = query.replace("…
+        query = query.replace('"', "%22")
 
         url = f"{self.API_URL}{query}&start=0&max_results={max_results}&sortBy={sort_by}&sortOrder={sort_order}"
         download(url, cache_filepath)
@@ -112,7 +113,7 @@ class ArxivPaperList(SearchAPI):
         for entry in feed.entries:
             author = ""
             if hasattr(entry, "authors"):
-                author = …
+                author = " , ".join(author.name for author in entry.authors)
             url = ""
             doi = ""
             for link in entry.links:
@@ -128,14 +129,16 @@ class ArxivPaperList(SearchAPI):
             date = entry.updated_parsed
 
             title = re.sub(r"[\s\n]+", " ", entry.title, flags=re.MULTILINE).strip()
-            abstract = re.sub(…
+            abstract = re.sub(
+                r"[\s\n]+", " ", entry.summary, flags=re.MULTILINE
+            ).strip()
             paper = Paper(
                 title,
                 author,
                 abstract,
                 url,
                 doi,
-                " , ".join([t[…
+                " , ".join([t["term"] for t in entry.tags]),
                 str(date.tm_year),
                 str(date.tm_mon),
             )
```
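For reference, a minimal standalone sketch of the API round-trip that `ArxivPaperList` wraps, reusing the escaping and whitespace cleanup shown above; the query string itself is an arbitrary example, not taken from the commit.

```python
# Minimal sketch of the arXiv API round-trip ArxivPaperList wraps;
# the query below is an arbitrary example.
import re

import feedparser

query = 'cat:cs.CL AND ti:"event extraction"'
query = query.strip().replace(" ", "+")
query = query.replace("(", "%28").replace(")", "%29").replace('"', "%22")
url = (
    "https://export.arxiv.org/api/query?search_query="
    f"{query}&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
)
feed = feedparser.parse(url)  # feedparser can fetch URLs directly
for entry in feed.entries:
    print(re.sub(r"[\s\n]+", " ", entry.title).strip())
```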
src/interfaces/dblp.py
ADDED
```diff
@@ -0,0 +1,97 @@
+import pathlib
+import random
+import re
+import time
+
+import requests
+from tqdm import trange
+
+from src.engine import SearchAPI
+from src.interfaces import Paper
+from src.utils import dump_json, load_json
+
+
+class DblpPaperList(SearchAPI):
+    """DBLP paper list
+
+    Inputs:
+        cache_filepath: Filepath to save the cached file
+        use_cache: will use cached file if `True`, otherwise download again
+        query: Query string, essentially the title keywords
+            you would type into a search box.
+            See the reference below for the special logical query grammar.
+        max_results: Maximum number of papers to return
+        request_time_interval: Seconds to sleep between DBLP API calls
+
+    References:
+        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
+    """
+
+    API_URL = "https://dblp.org/search/publ/api"
+
+    def __init__(
+        self,
+        cache_filepath: pathlib.Path,
+        use_cache: bool = False,
+        query: str = "",
+        max_results: int = 1000,
+        request_time_interval: float = 5,
+    ) -> None:
+        super().__init__()
+
+        if isinstance(cache_filepath, str):
+            cache_filepath = pathlib.Path(cache_filepath)
+        if (not cache_filepath.exists()) or (not use_cache):
+            query = query.strip()
+            query = re.sub(r"\s+?\|\s+?", "|", query)
+            query = re.sub(r"\s+", "+", query)
+
+            searched_results = []
+            # the API returns at most 1000 hits per request
+            h = 1000
+            for f in trange(0, max_results, h, desc="DBLP Downloading"):
+                url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
+                try:
+                    response = requests.get(url)
+                    response.raise_for_status()
+                    page = response.json()
+                    page_data = page["result"]["hits"]["hit"]
+                    if page_data:
+                        searched_results.extend(page_data)
+                    else:
+                        break
+                except KeyboardInterrupt:
+                    raise KeyboardInterrupt
+                except Exception:
+                    break
+                time.sleep((random.random() + 0.5) * request_time_interval)
+            dump_json(searched_results, cache_filepath)
+
+        data = load_json(cache_filepath)
+        for d in data:
+            # dblp does not provide abstract and month data
+            authors = []
+            if "authors" in d["info"]:
+                if isinstance(d["info"]["authors"]["author"], dict):
+                    authors.append(d["info"]["authors"]["author"]["text"])
+                else:
+                    authors = [a["text"] for a in d["info"]["authors"]["author"]]
+
+            venues = []
+            if "venue" in d["info"]:
+                if isinstance(d["info"]["venue"], str):
+                    venues.append(d["info"]["venue"])
+                else:
+                    for venue in d["info"]["venue"]:
+                        venues.append(venue)
+            paper = Paper(
+                d["info"]["title"],
+                " , ".join(authors),
+                "",
+                d["info"].get("ee", d["info"].get("url", "")),
+                d["info"].get("doi", ""),
+                " , ".join(venues),
+                d["info"].get("year", "9999"),
+                "99",
+            )
+            self.papers.append(paper)
```
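The sentinel values `"9999"` and `"99"` presumably make papers with no known date sort last. For reference, a minimal standalone request against the same endpoint, showing the JSON shape the constructor consumes; the query value is an arbitrary example.

```python
# Minimal sketch of one DBLP search API call, mirroring the URL built
# in DblpPaperList.__init__; the query value is an arbitrary example.
import requests

resp = requests.get(
    "https://dblp.org/search/publ/api",
    params={"q": "event extraction", "format": "json", "h": 5, "f": 0, "c": 0},
    timeout=30,
)
resp.raise_for_status()
for hit in resp.json()["result"]["hits"]["hit"]:
    info = hit["info"]  # may lack "year" or "venue" for some records
    print(info.get("year", "?"), info.get("venue", "?"), info["title"])
```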