File size: 4,479 Bytes
c149479
fe3c056
348017a
796eb82
 
02e4a72
796eb82
c149479
 
348017a
 
0841c28
348017a
 
 
 
 
 
 
 
 
796eb82
 
348017a
 
 
 
 
 
 
 
 
 
0841c28
 
 
 
 
 
 
 
 
 
 
 
 
348017a
 
 
796eb82
fe3c056
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
796eb82
fe3c056
348017a
fe3c056
 
 
c54c848
 
 
 
796eb82
c54c848
fe3c056
 
 
c149479
 
 
 
 
 
 
 
796eb82
 
c149479
 
fe3c056
c149479
 
fe3c056
348017a
 
 
02e4a72
348017a
 
 
 
 
796eb82
 
348017a
 
 
 
 
 
 
 
 
 
796eb82
 
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02e4a72
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from src.interfaces.aclanthology import AclanthologyPaperList
from src.interfaces.arxiv import ArxivPaperList
from src.interfaces.dblp import DblpPaperList
from src.utils import (
    dump_paper_list_to_jsonlines,
    dump_paper_list_to_markdown_checklist,
)

def _event_title_keywords():
    """Return the title keyword groups shared by all event-extraction queries.

    A fresh nested list is built on every call, so no two queries share
    mutable state.
    NOTE(review): how the inner lists are combined is decided by each paper
    list's `search` method, which is not visible here -- confirm there.
    """
    return [
        ["information extraction"],
        ["event", "extraction"],
        ["event", "argument", "extraction"],
        ["event", "detection"],
        ["event", "classification"],
        ["event", "tracking"],
        ["event", "relation", "extraction"],
        ["event", "prediction"],
        ["script", "learning"],
    ]


def _acl_venue_keywords():
    """Return a fresh list of the venue keyword groups used by the ACL Anthology queries."""
    return [
        ["acl"],
        ["emnlp"],
        ["naacl"],
        ["coling"],
        ["findings"],
        ["tacl"],
        ["cl"],
    ]


def _dump_paper_list(papers, stem: str) -> None:
    """Write `papers` to `results/<stem>.md` and then `results/<stem>.jsonl`.

    Args:
        papers: search result as returned by a paper list's `search` method.
        stem: output file name without directory or extension.
    """
    dump_paper_list_to_markdown_checklist(papers, "results/{}.md".format(stem))
    dump_paper_list_to_jsonlines(papers, "results/{}.jsonl".format(stem))


def _export_aclanthology_papers() -> None:
    """Search the cached ACL Anthology data and dump the two example result sets."""
    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
    acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")

    # `ee_query` is an example, and you don't have to fill all the fields
    ee_query = {
        "title": _event_title_keywords(),
        "venue": _acl_venue_keywords(),
        "author": [
            ["Heng Ji"],
            ["Dan Roth"],
        ],
        "year": [
            # multiple time spans with closed interval: ["2006", "2013"] means 2006-2013
            ["2006", "2013"],
            ["2018", "2022"],
        ],
        "month": [
            # the same as the `year` field
            ["4", "11"],
        ],
    }
    _dump_paper_list(acl_paper_list.search(ee_query), "ee-paper-list")

    # a smaller query that only fills the `title` and `venue` fields
    doc_query = {
        "title": [
            ["document-level"],
        ],
        "venue": _acl_venue_keywords(),
    }
    _dump_paper_list(acl_paper_list.search(doc_query), "doc-paper-list")


def _export_arxiv_papers() -> None:
    """Search the arXiv paper list (cs.CL) and dump the event-extraction results."""
    arxiv_paper_list = ArxivPaperList(
        "cache/ee-arxiv.xml",
        use_cache=True,
        title=(
            "Event Extraction OR Event Argument Extraction OR Event Detection"
            " OR Event Classification OR Event Tracking"
            " OR Event Relation Extraction OR Information Extraction"
            " OR Event Prediction OR Script Learning"
        ),
        category="cs.CL",
    )
    arxiv_ee_query = {
        "title": _event_title_keywords(),
        "venue": [
            ["cs.CL"],
        ],
    }
    _dump_paper_list(arxiv_paper_list.search(arxiv_ee_query), "arxiv-ee-paper-list")


def _export_dblp_papers() -> None:
    """Search the DBLP paper list and dump the event-extraction results for the listed venues."""
    dblp_paper_list = DblpPaperList(
        "./cache/dblp.json",
        use_cache=True,
        query="Event|Information|Argument|Script Extraction|Classification|Tracking|Prediction|Learning",
        max_results=50000,
    )
    dblp_ee_query = {
        "title": _event_title_keywords(),
        "venue": [
            ["aaai"],
            ["ijcai"],
            ["icml"],
            ["iclr"],
            ["nips"],
            ["neurips"],
            ["sigir"],
            ["cvpr"],
            ["iccv"],
        ],
    }
    _dump_paper_list(dblp_paper_list.search(dblp_ee_query), "dblp-ee-paper-list")


def main() -> None:
    """Run the example searches in order: ACL Anthology, then arXiv, then DBLP.

    Each step writes a markdown checklist and a jsonlines file under
    `results/`.
    """
    _export_aclanthology_papers()
    _export_arxiv_papers()
    _export_dblp_papers()


if __name__ == "__main__":
    main()