Spico committed
Commit 348017a · 1 Parent(s): fe3c056
README.md CHANGED
@@ -20,7 +20,7 @@ from src.interfaces.aclanthology import AclanthologyPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
     paper_list = AclanthologyPaperList("cache/aclanthology.json")
     ee_query = {
         "title": [
@@ -51,5 +51,5 @@ if __name__ == "__main__":
 ## 🗺️ Roadmap
 
 - [x] aclanthology
-- [ ] arXiv
-- [ ] dblp
+- [x] arXiv
+- [x] dblp
results/dblp-ee-paper-list.md ADDED
@@ -0,0 +1,13 @@
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [AAAI, 2022] [Learning to Ask for Data-Efficient Event Argument Extraction (Student Abstract).](https://ojs.aaai.org/index.php/AAAI/article/view/21686)
+- [ ] [IJCAI, 2022] [Efficient Document-level Event Extraction via Pseudo-Trigger-aware Pruned Complete Graph.](https://doi.org/10.24963/ijcai.2022/632)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2021] [What the Role is vs. What Plays the Role: Semi-Supervised Event Argument Extraction via Dual Question Answering.](https://ojs.aaai.org/index.php/AAAI/article/view/17720)
+- [ ] [AAAI, 2018] [Scale Up Event Extraction Learning via Automatic Training Data Generation.](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16119)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI, 2016] [Joint Inference over a Lightly Supervised Information Extraction Pipeline: Towards Event Coreference Resolution for Resource-Scarce Languages.](http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12413)
+- [ ] [AAAI FALL SYMPOSIA, 2014] [Risk Event and Probability Extraction for Modeling Medical Risks.](http://www.aaai.org/ocs/index.php/FSS/FSS14/paper/view/9198)
+- [ ] [SIGIR, 2014] [An event extraction model based on timeline and user analysis in Latent Dirichlet allocation.](https://doi.org/10.1145/2600428.2609541)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [IJCAI, 2013] [Joint Modeling of Argument Identification and Role Determination in Chinese Event Extraction with Discourse-Level Information.](http://www.aaai.org/ocs/index.php/IJCAI/IJCAI13/paper/view/6285)
+- [ ] [AAAI, 2008] [Combining Global Relevance Information with Local Contextual Clues for Event-Oriented Information Extraction.](http://www.aaai.org/Library/AAAI/2008/aaai08-321.php)
run.py CHANGED
@@ -1,50 +1,52 @@
 from src.interfaces.aclanthology import AclanthologyPaperList
 from src.interfaces.arxiv import ArxivPaperList
+from src.interfaces.dblp import DblpPaperList
 from src.utils import dump_paper_list_to_markdown_checklist
 
 if __name__ == "__main__":
-    # # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data
-    # acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
-    # ee_query = {
-    #     "title": [
-    #         ["information extraction"],
-    #         ["event", "extraction"],
-    #         ["event", "argument", "extraction"],
-    #         ["event", "detection"],
-    #         ["event", "classification"],
-    #         ["event", "tracking"],
-    #         ["event", "relation", "extraction"],
-    #     ],
-    #     "venue": [
-    #         ["acl"],
-    #         ["emnlp"],
-    #         ["naacl"],
-    #         ["coling"],
-    #         ["findings"],
-    #         ["tacl"],
-    #         ["cl"],
-    #     ],
-    # }
-    # ee_papers = acl_paper_list.search(ee_query)
-    # dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
+    # use `bash scripts/get_aclanthology.sh` to download and prepare anthology data first
+    acl_paper_list = AclanthologyPaperList("cache/aclanthology.json")
+    ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    ee_papers = acl_paper_list.search(ee_query)
+    dump_paper_list_to_markdown_checklist(ee_papers, "results/ee-paper-list.md")
 
-    # doc_query = {
-    #     "title": [
-    #         ["document-level"],
-    #     ],
-    #     "venue": [
-    #         ["acl"],
-    #         ["emnlp"],
-    #         ["naacl"],
-    #         ["coling"],
-    #         ["findings"],
-    #         ["tacl"],
-    #         ["cl"],
-    #     ],
-    # }
-    # doc_papers = acl_paper_list.search(doc_query)
-    # dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
+    doc_query = {
+        "title": [
+            ["document-level"],
+        ],
+        "venue": [
+            ["acl"],
+            ["emnlp"],
+            ["naacl"],
+            ["coling"],
+            ["findings"],
+            ["tacl"],
+            ["cl"],
+        ],
+    }
+    doc_papers = acl_paper_list.search(doc_query)
+    dump_paper_list_to_markdown_checklist(doc_papers, "results/doc-paper-list.md")
 
+    # arxiv papers
     arxiv_paper_list = ArxivPaperList(
         "cache/ee-arxiv.xml",
         use_cache=True,
@@ -66,4 +68,39 @@
         ],
     }
     arxiv_ee_papers = arxiv_paper_list.search(arxiv_ee_query)
-    dump_paper_list_to_markdown_checklist(arxiv_ee_papers, "results/arxiv-ee-paper-list.md")
+    dump_paper_list_to_markdown_checklist(
+        arxiv_ee_papers, "results/arxiv-ee-paper-list.md"
+    )
+
+    # dblp papers
+    dblp_paper_list = DblpPaperList(
+        "./cache/dblp.json",
+        use_cache=True,
+        query="Event Extraction",
+    )
+    dblp_ee_query = {
+        "title": [
+            ["information extraction"],
+            ["event", "extraction"],
+            ["event", "argument", "extraction"],
+            ["event", "detection"],
+            ["event", "classification"],
+            ["event", "tracking"],
+            ["event", "relation", "extraction"],
+        ],
+        "venue": [
+            ["aaai"],
+            ["ijcai"],
+            ["icml"],
+            ["iclr"],
+            ["nips"],
+            ["neurips"],
+            ["sigir"],
+            ["cvpr"],
+            ["iccv"],
+        ],
+    }
+    dblp_ee_papers = dblp_paper_list.search(dblp_ee_query)
+    dump_paper_list_to_markdown_checklist(
+        dblp_ee_papers, "results/dblp-ee-paper-list.md"
+    )
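A note on the query format used throughout `run.py`: the dictionaries appear to encode a disjunction over keyword groups, where every keyword in an inner list must appear in the field for that group to match. A minimal sketch of that assumed matching rule follows; the `matches` helper is hypothetical and is not the repository's actual `SearchAPI.search` implementation:

```python
# Hypothetical matcher illustrating the assumed query semantics:
# each inner list is an AND-group of keywords, the outer list is an
# OR over those groups.
def matches(field_text: str, groups: list[list[str]]) -> bool:
    text = field_text.lower()
    return any(all(kw in text for kw in group) for group in groups)

# Matches via the ["event", "argument", "extraction"] group -> True
print(matches(
    "Document-Level Event Argument Extraction",
    [["information extraction"], ["event", "argument", "extraction"]],
))
```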
scripts/get_dblp.sh ADDED
@@ -0,0 +1,3 @@
+mkdir -p cache
+cd cache
+wget -cO ./dblp.xml.gz https://dblp.uni-trier.de/xml/dblp.xml.gz
src/interfaces/arxiv.py CHANGED
@@ -1,10 +1,10 @@
-import re
 import pathlib
+import re
 
 import feedparser
 
-from src.interfaces import Paper
 from src.engine import SearchAPI
+from src.interfaces import Paper
 from src.utils import download
 
 
@@ -12,15 +12,15 @@ class ArxivPaperList(SearchAPI):
     """arXiv API
 
     Inputs:
-        cache_filepath: filepath to save cached file
+        cache_filepath: Filepath to save cached file
         use_cache: will use cached file if `True`
-        raw: raw api query, e.g. `cat:cs.CL AND ti:event`. If set, others will be disabled
-        title: string of title you wanna search
-        author: author string
-        abstract: abstract string
-        comment: comment string
+        raw: Raw API query, e.g. `cat:cs.CL AND ti:event`. If set, the other fields are ignored
+        title: String of the title you want to search
+        author: Author string
+        abstract: Abstract string
+        comment: Comment string
         category: arXiv category, e.g. "cs.CL"
-        max_results: maximal returned papers
+        max_results: Maximal number of returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
         sort_order: `descending` (default) or `ascending`
 
@@ -51,6 +51,7 @@ class ArxivPaperList(SearchAPI):
     References:
         https://arxiv.org/help/api/user-manual#title_id_published_updated
     """
+
     API_URL = "https://export.arxiv.org/api/query?search_query="
 
     def __init__(
@@ -102,7 +103,7 @@ class ArxivPaperList(SearchAPI):
         query = query.strip().replace(" ", "+")
         query = query.replace("(", "%28")
         query = query.replace(")", "%29")
-        query = query.replace("\"", "%22")
+        query = query.replace('"', "%22")
 
         url = f"{self.API_URL}{query}&start=0&max_results={max_results}&sortBy={sort_by}&sortOrder={sort_order}"
         download(url, cache_filepath)
@@ -112,7 +113,7 @@ class ArxivPaperList(SearchAPI):
         for entry in feed.entries:
             author = ""
             if hasattr(entry, "authors"):
-                author = ' , '.join(author.name for author in entry.authors)
+                author = " , ".join(author.name for author in entry.authors)
             url = ""
             doi = ""
             for link in entry.links:
@@ -128,14 +129,16 @@ class ArxivPaperList(SearchAPI):
                 date = entry.updated_parsed
 
             title = re.sub(r"[\s\n]+", " ", entry.title, flags=re.MULTILINE).strip()
-            abstract = re.sub(r"[\s\n]+", " ", entry.summary, flags=re.MULTILINE).strip()
+            abstract = re.sub(
+                r"[\s\n]+", " ", entry.summary, flags=re.MULTILINE
+            ).strip()
             paper = Paper(
                 title,
                 author,
                 abstract,
                 url,
                 doi,
-                " , ".join([t['term'] for t in entry.tags]),
+                " , ".join([t["term"] for t in entry.tags]),
                 str(date.tm_year),
                 str(date.tm_mon),
             )
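For readers unfamiliar with the underlying service, here is a standalone sketch of the request flow `ArxivPaperList` wraps. The URL parameters (`search_query`, `start`, `max_results`, `sortBy`, `sortOrder`) follow the arXiv API manual linked in the docstring; the concrete query values are illustrative only:

```python
# Fetch a few recent cs.CL papers with "event extraction" in the title
# directly from the arXiv export API and parse the Atom feed.
import feedparser

url = (
    "https://export.arxiv.org/api/query?search_query="
    "cat:cs.CL+AND+ti:%22event+extraction%22"
    "&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending"
)
feed = feedparser.parse(url)
for entry in feed.entries:
    print(entry.title)
```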
src/interfaces/dblp.py CHANGED
@@ -0,0 +1,97 @@
+import pathlib
+import random
+import re
+import time
+
+import requests
+from tqdm import trange
+
+from src.engine import SearchAPI
+from src.interfaces import Paper
+from src.utils import dump_json, load_json
+
+
+class DblpPaperList(SearchAPI):
+    """DBLP paper list
+
+    Inputs:
+        cache_filepath: Filepath to save cached file
+        use_cache: will use cached file if `True`, otherwise download again
+        query: Query string, basically the title you want to search
+            in a search box. For the special logical query grammar,
+            see the reference below.
+        max_results: Maximal number of returned papers
+        request_time_interval: Seconds to sleep between DBLP API calls
+
+    References:
+        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
+    """
+
+    API_URL = "https://dblp.org/search/publ/api"
+
+    def __init__(
+        self,
+        cache_filepath: pathlib.Path,
+        use_cache: bool = False,
+        query: str = "",
+        max_results: int = 1000,
+        request_time_interval: float = 5,
+    ) -> None:
+        super().__init__()
+
+        if isinstance(cache_filepath, str):
+            cache_filepath = pathlib.Path(cache_filepath)
+        if (not cache_filepath.exists()) or (not use_cache):
+            query = query.strip()
+            query = re.sub(r"\s+?\|\s+?", "|", query)
+            query = re.sub(r"\s+", "+", query)
+
+            searched_results = []
+            # the API returns at most 1000 hits per request
+            h = 1000
+            for f in trange(0, max_results, h, desc="DBLP Downloading"):
+                url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
+                try:
+                    response = requests.get(url, timeout=30)
+                    response.raise_for_status()
+                    page = response.json()
+                    page_data = page["result"]["hits"]["hit"]
+                    if page_data:
+                        searched_results.extend(page_data)
+                    else:
+                        break
+                except KeyboardInterrupt:
+                    raise KeyboardInterrupt
+                except Exception:
+                    break
+                time.sleep((random.random() + 0.5) * request_time_interval)
+            dump_json(searched_results, cache_filepath)
+
+        data = load_json(cache_filepath)
+        for d in data:
+            # dblp does not provide abstract and month data
+            authors = []
+            if "authors" in d["info"]:
+                if isinstance(d["info"]["authors"]["author"], dict):
+                    authors.append(d["info"]["authors"]["author"]["text"])
+                else:
+                    authors = [a["text"] for a in d["info"]["authors"]["author"]]
+
+            venues = []
+            if "venue" in d["info"]:
+                if isinstance(d["info"]["venue"], str):
+                    venues.append(d["info"]["venue"])
+                else:
+                    for venue in d["info"]["venue"]:
+                        venues.append(venue)
+            paper = Paper(
+                d["info"]["title"],
+                " , ".join(authors),
+                "",
+                d["info"].get("ee", d["info"].get("url", "")),
+                d["info"].get("doi", ""),
+                " , ".join(venues),
+                d["info"].get("year", "9999"),
+                "99",
+            )
+            self.papers.append(paper)
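The parsing loop above reads a specific JSON shape from the DBLP search API. Here is an illustrative hit in the form the code expects; the field names mirror the accesses in the loop, the values are made up, and (as the `isinstance` checks handle) `authors.author` may also be a single dict rather than a list:

```python
# One "hit" from result.hits.hit[] as the constructor consumes it
# (illustrative values, not a real DBLP record).
hit = {
    "info": {
        "title": "Some Event Extraction Paper.",
        "authors": {"author": [{"text": "Alice"}, {"text": "Bob"}]},
        "venue": "AAAI",
        "year": "2022",
        "doi": "10.1000/example",
        "ee": "https://example.org/paper",
    }
}
```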