Spico committed
Commit 348017a · 1 Parent(s): fe3c056
README.md CHANGED
@@ -20,7 +20,7 @@ from src.interfaces.aclanthology import AclanthologyPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
     paper_list = AclanthologyPaperList("cache/aclanthology.json")
     ee_query = {
         "title": [
@@ -51,5 +51,5 @@ if __name__ == "__main__":
 ## 🗺️ Roadmap
 
 - [x] aclanthology
-- [ ] arXiv
-- [ ] dblp
+- [x] arXiv
+- [x] dblp
results/dblp-ee-paper-list.md ADDED
@@ -0,0 +1,13 @@
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [IJCAI, 2022] [Efficient Document-level Event Extraction via Pseudo-Trigger-aware Pruned Complete Graph.](https://doi.org/10.24963/ijcai.2022/632)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2018] [Scale Up Event Extraction Learning via Automatic Training Data Generation.](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16119)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI FALL SYMPOSIA, 2014] [Risk Event and Probability Extraction for Modeling Medical Risks.](http://www.aaai.org/ocs/index.php/FSS/FSS14/paper/view/9198)
+- [ ] [SIGIR, 2014] [An event extraction model based on timeline and user analysis in Latent Dirichlet allocation.](https://doi.org/10.1145/2600428.2609541)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [AAAI, 2008] [Combining Global Relevance Information with Local Contextual Clues for Event-Oriented Information Extraction.](http://www.aaai.org/Library/AAAI/2008/aaai08-321.php)
run.py CHANGED
@@ -1,50 +1,52 @@
 from src.interfaces.aclanthology import AclanthologyPaperList
 from src.interfaces.arxiv import ArxivPaperList
+from src.interfaces.dblp import DblpPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    # # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data
-    # acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
-    # ee_query = {
-    #     "title": [
-    #         ["information extraction"],
-    #         ["event", "extraction"],
-    #         ["event", "argument", "extraction"],
-    #         ["event", "detection"],
-    #         ["event", "classification"],
-    #         ["event", "tracking"],
-    #         ["event", "relation", "extraction"],
-    #     ],
-    #     "venue": [
-    #         ["acl"],
-    #         ["emnlp"],
-    #         ["naacl"],
-    #         ["coling"],
-    #         ["findings"],
-    #         ["tacl"],
-    #         ["cl"],
-    #     ],
-    # }
-    # ee_papers = acl_paper_list.search(ee_query)
-    # dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
+    acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+    ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    ee_papers = acl_paper_list.search(ee_query)
+    dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
 
-    # doc_query = {
-    #     "title": [
-    #         ["document-level"],
-    #     ],
-    #     "venue": [
-    #         ["acl"],
-    #         ["emnlp"],
-    #         ["naacl"],
-    #         ["coling"],
-    #         ["findings"],
-    #         ["tacl"],
-    #         ["cl"],
-    #     ],
-    # }
-    # doc_papers = acl_paper_list.search(doc_query)
-    # dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
+    doc_query = {
+        "title": [
+            ["document-level"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    doc_papers = acl_paper_list.search(doc_query)
+    dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
 
+    # arxiv papers
     arxiv_paper_list = ArxivPaperList(
         "cache/ee-arxiv.xml",
         use_cache=True,
@@ -66,4 +68,39 @@
         ],
     }
     arxiv_ee_papers = arxiv_paper_list.search(arxiv_ee_query)
-    dump_paper_list_to_markdown_checklist(arxiv_ee_papers, "results/arxiv-ee-paper-list.md")
+    dump_paper_list_to_markdown_checklist(
+        arxiv_ee_papers, "results/arxiv-ee-paper-list.md"
+    )
+
+    # dblp papers
+    dblp_paper_list = DblpPaperList(
+        "./cache/dblp.json",
+        use_cache=True,
+        query="Event Extraction",
+    )
+    dblp_ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["aaai"],
+            ["ijcai"],
+            ["icml"],
+            ["iclr"],
+            ["nips"],
+            ["neurips"],
+            ["sigir"],
+            ["cvpr"],
+            ["iccv"],
+        ],
+    }
+    dblp_ee_papers = dblp_paper_list.search(dblp_ee_query)
+    dump_paper_list_to_markdown_checklist(
+        dblp_ee_papers, "results/dblp-ee-paper-list.md"
+    )
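A note on the query format used throughout `run.py`: the dictionaries appear to encode a disjunction over keyword groups, where every keyword in an inner list must appear in the field for that group to match. A minimal sketch of that assumed matching rule follows; the `matches` helper is hypothetical and is not the repository's actual `SearchAPI.search` implementation:

```python
# Hypothetical matcher illustrating the assumed query semantics:
# each inner list is an AND-group of keywords, the outer list is an
# OR over those groups.
def matches(field_text: str, groups: list[list[str]]) -> bool:
    text = field_text.lower()
    return any(all(kw in text for kw in group) for group in groups)

# Matches via the ["event", "argument", "extraction"] group -> True
print(matches(
    "Document-Level Event Argument Extraction",
    [["information extraction"], ["event", "argument", "extraction"]],
))
```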
scripts/get_dblp.sh ADDED
@@ -0,0 +1,3 @@
+mkdir -p cache
+cd cache
+wget -cO ./dblp.xml.gz https://dblp.uni-trier.de/xml/dblp.xml.gz
src/interfaces/arxiv.py CHANGED
@@ -1,10 +1,10 @@
-import re
 import pathlib
+import re
 
 import feedparser
 
-from src.interfaces import Paper
 from src.engine import SearchAPI
+from src.interfaces import Paper
 from src.utils import download
 
 
@@ -12,15 +12,15 @@ class ArxivPaperList(SearchAPI):
     """arXiv API
 
     Inputs:
-        cache_filepath: filepath to save cached file
+        cache_filepath: Filepath to save cached file
         use_cache: will use cached file if `True`
-        raw: raw api query, e.g. `cat:cs.CL AND ti:event`. If set, others will be disabled
-        title: string of title you wanna search
-        author: author string
-        abstract: abstract string
-        comment: comment string
+        raw: Raw API query, e.g. `cat:cs.CL AND ti:event`. If set, the other fields are ignored
+        title: String of the title you want to search
+        author: Author string
+        abstract: Abstract string
+        comment: Comment string
         category: arXiv category, e.g. "cs.CL"
-        max_results: maximal returned papers
+        max_results: Maximal number of returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
         sort_order: `descending` (default) or `ascending`
 
@@ -51,6 +51,7 @@ class ArxivPaperList(SearchAPI):
     References:
         https://arxiv.org/help/api/user-manual#title_id_published_updated
     """
+
     API_URL = "https://export.arxiv.org/api/query?search_query="
 
     def __init__(
@@ -102,7 +103,7 @@ class ArxivPaperList(SearchAPI):
         query = query.strip().replace(" ", "+")
         query = query.replace("(", "%28")
         query = query.replace(")", "%29")
-        query = query.replace("\"", "%22")
+        query = query.replace('"', "%22")
 
         url = f"{self.API_URL}{query}&start=0&max_results={max_results}&sortBy={sort_by}&sortOrder={sort_order}"
         download(url, cache_filepath)
@@ -112,7 +113,7 @@ class ArxivPaperList(SearchAPI):
         for entry in feed.entries:
             author = ""
             if hasattr(entry, "authors"):
-                author = ' , '.join(author.name for author in entry.authors)
+                author = " , ".join(author.name for author in entry.authors)
             url = ""
             doi = ""
             for link in entry.links:
@@ -128,14 +129,16 @@ class ArxivPaperList(SearchAPI):
                 date = entry.updated_parsed
 
             title = re.sub(r"[\s\n]+", " ", entry.title, flags=re.MULTILINE).strip()
-            abstract = re.sub(r"[\s\n]+", " ", entry.summary, flags=re.MULTILINE).strip()
+            abstract = re.sub(
+                r"[\s\n]+", " ", entry.summary, flags=re.MULTILINE
+            ).strip()
             paper = Paper(
                 title,
                 author,
                 abstract,
                 url,
                 doi,
-                " , ".join([t['term'] for t in entry.tags]),
+                " , ".join([t["term"] for t in entry.tags]),
                 str(date.tm_year),
                 str(date.tm_mon),
             )
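For readers unfamiliar with the underlying service, here is a standalone sketch of the request flow `ArxivPaperList` wraps. The URL parameters (`search_query`, `start`, `max_results`, `sortBy`, `sortOrder`) follow the arXiv API manual linked in the docstring; the concrete query values are illustrative only:

```python
# Fetch a few recent cs.CL papers with "event extraction" in the title
# directly from the arXiv export API and parse the Atom feed.
import feedparser

url = (
    "https://export.arxiv.org/api/query?search_query="
    "cat:cs.CL+AND+ti:%22event+extraction%22"
    "&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
)
feed = feedparser.parse(url)
for entry in feed.entries:
    print(entry.title)
```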
src/interfaces/dblp.py CHANGED
@@ -0,0 +1,97 @@
+import pathlib
+import random
+import re
+import time
+
+import requests
+from tqdm import trange
+
+from src.engine import SearchAPI
+from src.interfaces import Paper
+from src.utils import dump_json, load_json
+
+
+class DblpPaperList(SearchAPI):
+    """DBLP paper list
+
+    Inputs:
+        cache_filepath: Filepath to save cached file
+        use_cache: will use cached file if `True`, otherwise download again
+        query: Query string, basically the title you want to search
+            in a search box. For the special logical query grammar,
+            see the reference below.
+        max_results: Maximal number of returned papers
+        request_time_interval: Seconds to sleep between DBLP API calls
+
+    References:
+        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
+    """
+
+    API_URL = "https://dblp.org/search/publ/api"
+
+    def __init__(
+        self,
+        cache_filepath: pathlib.Path,
+        use_cache: bool = False,
+        query: str = "",
+        max_results: int = 1000,
+        request_time_interval: float = 5,
+    ) -> None:
+        super().__init__()
+
+        if isinstance(cache_filepath, str):
+            cache_filepath = pathlib.Path(cache_filepath)
+        if (not cache_filepath.exists()) or (not use_cache):
+            query = query.strip()
+            query = re.sub(r"\s+?\|\s+?", "|", query)
+            query = re.sub(r"\s+", "+", query)
+
+            searched_results = []
+            # the API returns at most 1000 hits per request
+            h = 1000
+            for f in trange(0, max_results, h, desc="DBLP Downloading"):
+                url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
+                try:
+                    response = requests.get(url, timeout=30)
+                    response.raise_for_status()
+                    page = response.json()
+                    page_data = page["result"]["hits"]["hit"]
+                    if page_data:
+                        searched_results.extend(page_data)
+                    else:
+                        break
+                except KeyboardInterrupt:
+                    raise KeyboardInterrupt
+                except Exception:
+                    break
+                time.sleep((random.random() + 0.5) * request_time_interval)
+            dump_json(searched_results, cache_filepath)
+
+        data = load_json(cache_filepath)
+        for d in data:
+            # dblp does not provide abstract and month data
+            authors = []
+            if "authors" in d["info"]:
+                if isinstance(d["info"]["authors"]["author"], dict):
+                    authors.append(d["info"]["authors"]["author"]["text"])
+                else:
+                    authors = [a["text"] for a in d["info"]["authors"]["author"]]
+
+            venues = []
+            if "venue" in d["info"]:
+                if isinstance(d["info"]["venue"], str):
+                    venues.append(d["info"]["venue"])
+                else:
+                    for venue in d["info"]["venue"]:
+                        venues.append(venue)
+            paper = Paper(
+                d["info"]["title"],
+                " , ".join(authors),
+                "",
+                d["info"].get("ee", d["info"].get("url", "")),
+                d["info"].get("doi", ""),
+                " , ".join(venues),
+                d["info"].get("year", "9999"),
+                "99",
+            )
+            self.papers.append(paper)
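The parsing loop above reads a specific JSON shape from the DBLP search API. Here is an illustrative hit in the form the code expects; the field names mirror the accesses in the loop, the values are made up, and (as the `isinstance` checks handle) `authors.author` may also be a single dict rather than a list:

```python
# One "hit" from result.hits.hit[] as the constructor consumes it
# (illustrative values, not a real DBLP record).
hit = {
    "info": {
        "title": "Some Event Extraction Paper.",
        "authors": {"author": [{"text": "Alice"}, {"text": "Bob"}]},
        "venue": "AAAI",
        "year": "2022",
        "doi": "10.1000/example",
        "ee": "https://example.org/paper",
    }
}
```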