import functools
import gzip
import json
import pathlib
import re
import shutil

import requests
from tqdm.auto import tqdm

from src.interfaces import Paper


def download(url: str, filepath: str) -> pathlib.Path:
    """Download the file at `url` to `filepath`, showing a progress bar.

    Returns:
        pathlib.Path of the saved file
    """
    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Only raises for 4xx/5xx, so fall through for other non-200 codes
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get("Content-Length", 0))
    path = pathlib.Path(filepath).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    desc = "(Unknown total file size)" if file_size == 0 else ""
    r.raw.read = functools.partial(
        r.raw.read, decode_content=True
    )  # Decompress if needed
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)
    return path


def parse_bib(
    input_filepath: pathlib.Path, output_filepath: pathlib.Path
) -> list[dict]:
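    """Parse an (optionally gzip-compressed) BibTeX dump into a list of dicts.

    Each entry becomes {"type": ..., "key": ..., "content": {...}}; the full list
    is also written as JSON to `output_filepath` (using the same compression as
    the input, since a single open function is chosen for both files).
    """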
    if input_filepath.suffix == ".gz":
        open_func = gzip.open
    else:
        open_func = open
    data = []
    with open_func(input_filepath, "rt", encoding="utf8") as fin:
        tot_bib_string = fin.read()
    tot_bib_string = re.sub(
        r" and\n\s+", " and ", tot_bib_string, flags=re.MULTILINE
    )
    tot_entries = tot_bib_string.count("@")
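    # Each entry looks like "@type{key,\n  field = value,\n  ...}"; the regex
    # below captures the entry type, citation key, and the block of fields.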
    for bib in tqdm(
        re.finditer(
            r"@(\w+)\{(.+?),\n(.*?)\}$",
            tot_bib_string,
            flags=re.MULTILINE | re.DOTALL,
        ),
        desc="parse bib",
        total=tot_entries,
    ):
        bib_type = bib.group(1)
        bib_key = bib.group(2)
        bib_content = {}
        content_string = bib.group(3).strip()
        for val in re.finditer(
            r"\s*(.*?)\s*=\s*(.+?),$\n", content_string, flags=re.MULTILINE
        ):
            bib_content[val.group(1).strip()] = (
                val.group(2).strip().removeprefix('"').removesuffix('"')
            )
        ins = {"type": bib_type, "key": bib_key, "content": bib_content}
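        # Mirror the venue name (journal or booktitle) into the "volume" field,
        # presumably so downstream code can read the venue from a single key.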
        if bib_type == "article":
            ins["content"]["volume"] = ins["content"]["journal"]
        elif bib_type == "inproceedings":
            ins["content"]["volume"] = ins["content"]["booktitle"]
        data.append(ins)
    with open_func(output_filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False)
    return data


# fmt: off
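# Month names and common three-letter abbreviations mapped to month numbers.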
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12,
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
# fmt: on


def parse_bib_month(month: str) -> int:
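    """Convert a BibTeX month value ("3", "mar", "March", ...) to an integer.

    Returns 99 for values that are neither digits nor recognized month names.
    """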
    if month.isdigit():
        return int(month)
    elif month.lower() in MONTH_MAP:
        return MONTH_MAP[month.lower()]
    else:
        return 99


def load_json(filepath: str | pathlib.Path) -> dict | list:
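    """Load JSON from `filepath`, transparently decompressing `.gz` files."""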
    if isinstance(filepath, str):
        filepath = pathlib.Path(filepath)
    if filepath.suffix == ".gz":
        open_func = gzip.open
    else:
        open_func = open
    with open_func(filepath, "rt", encoding="utf8") as fin:
        data = json.load(fin)
    return data


def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
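    """Write `data` to `filepath` as UTF-8 JSON; extra kwargs go to json.dump."""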
    with open(filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False, **kwargs)


def load_jsonlines(filepath, **kwargs):
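    """Read a JSON Lines file into a list, parsing one object per line."""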
    data = list()
    with open(filepath, "rt", encoding="utf-8") as fin:
        for line in fin:
            line_data = json.loads(line.strip(), **kwargs)
            data.append(line_data)
    return data


def dump_jsonlines(obj, filepath, **kwargs):
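    """Write an iterable of JSON-serializable objects to `filepath` as JSON Lines."""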
    with open(filepath, "wt", encoding="utf-8") as fout:
        for d in obj:
            line_d = json.dumps(d, ensure_ascii=False, **kwargs)
            fout.write(f"{line_d}\n")


def dump_list_to_markdown_checklist(str_list: list[str], filepath: str | pathlib.Path):
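    """Write the strings to `filepath` as unchecked Markdown checklist items."""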
md_string = "" | |
for string in str_list: | |
md_string += f"- [ ] {string}\n" | |
if isinstance(filepath, str): | |
filepath = pathlib.Path(filepath) | |
if not filepath.parent.exists(): | |
filepath.parent.mkdir(parents=True) | |
with open(filepath, "wt", encoding="utf8") as fout: | |
fout.write(f"{md_string}") | |


def dump_paper_list_to_markdown_checklist(papers: list[Paper], filepath: str | pathlib.Path):
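    """Render papers as "[VENUE, year] [title](url)" Markdown checklist items."""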
    string_list = [
        f"[{paper.venue.upper()}, {paper.year}] [{paper.title}]({paper.url})"
        for paper in papers
    ]
    dump_list_to_markdown_checklist(string_list, filepath)


def dump_paper_list_to_jsonlines(papers: list[Paper], filepath: str | pathlib.Path):
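    """Serialize papers to a JSON Lines file via their `as_dict()` representation."""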
    dump = []
    for paper in papers:
        dump.append(paper.as_dict())
    dump_jsonlines(dump, filepath)


if __name__ == "__main__":
    parse_bib(
        pathlib.Path("cache/anthology+abstracts.bib.gz"),
        pathlib.Path("cache/anthology+abstracts.json.gz"),
    )