import functools
import gzip
import json
import pathlib
import re
import shutil

import requests
from tqdm.auto import tqdm

from src.interfaces import Paper
def download(url: str, filepath: str) -> pathlib.Path:
    """Download the file at ``url`` to ``filepath``, streaming with a progress bar.

    Returns:
        Resolved ``pathlib.Path`` of the saved file.
    """
    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Only raises for 4xx/5xx, so cover other non-200 codes below
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get("Content-Length", 0))
    path = pathlib.Path(filepath).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    desc = "(Unknown total file size)" if file_size == 0 else ""
    # Patch the raw stream so gzip/deflate transfer encodings are decompressed on read
    r.raw.read = functools.partial(r.raw.read, decode_content=True)
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)
    return path
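# Illustrative usage; the URL and destination path below are assumptions for the
# sketch, not values this module depends on:
# download(
#     "https://aclanthology.org/anthology+abstracts.bib.gz",
#     "cache/anthology+abstracts.bib.gz",
# )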
def parse_bib(
    input_filepath: pathlib.Path, output_filepath: pathlib.Path
) -> list[dict]:
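    """Parse a BibTeX file (gzip-compressed if it ends in ``.gz``) into a list of dicts.

    Each entry becomes ``{"type": ..., "key": ..., "content": {field: value}}``.
    The result is also dumped as JSON to ``output_filepath``.
    """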
    # The same compression choice (based on the input suffix) is applied to the output
    open_func = gzip.open if input_filepath.suffix == ".gz" else open
    data = []
    with open_func(input_filepath, "rt", encoding="utf8") as fin:
        tot_bib_string = fin.read()
    # Re-join author lists that BibTeX wraps across lines ("... and\n    ...")
    tot_bib_string = re.sub(
        r" and\n\s+", " and ", tot_bib_string, flags=re.MULTILINE
    )
    tot_entries = tot_bib_string.count("@")
    for bib in tqdm(
        re.finditer(
            r"@(\w+)\{(.+?),\n(.*?)\}$",
            tot_bib_string,
            flags=re.MULTILINE | re.DOTALL,
        ),
        desc="parse bib",
        total=tot_entries,
    ):
        bib_type = bib.group(1)
        bib_key = bib.group(2)
        bib_content = {}
        content_string = bib.group(3).strip()
        for val in re.finditer(
            r"\s*(.*?)\s*=\s*(.+?),$\n", content_string, flags=re.MULTILINE
        ):
            # Strip the surrounding quotes from each field value
            bib_content[val.group(1).strip()] = (
                val.group(2).strip().removeprefix('"').removesuffix('"')
            )
        ins = {"type": bib_type, "key": bib_key, "content": bib_content}
        # Mirror the venue name into "volume" so articles and proceedings
        # can be treated uniformly downstream
        if bib_type == "article":
            ins["content"]["volume"] = ins["content"]["journal"]
        elif bib_type == "inproceedings":
            ins["content"]["volume"] = ins["content"]["booktitle"]
        data.append(ins)
    with open_func(output_filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False)
    return data
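# Shape of one parsed entry, given the regexes above (the key and field values
# here are illustrative, not real data):
# {
#     "type": "inproceedings",
#     "key": "some-key",
#     "content": {"title": "...", "booktitle": "...", "volume": "...", ...},
# }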
# fmt: off
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12,
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
# fmt: on
def parse_bib_month(month: str) -> int:
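    """Map a BibTeX month string (name, abbreviation, or digits) to its number.

    Returns 99 as a sentinel for unrecognised months.
    """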
    if month.isdigit():
        return int(month)
    elif month.lower() in MONTH_MAP:
        return MONTH_MAP[month.lower()]
    else:
        return 99
def load_json(filepath: pathlib.Path) -> dict | list:
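    """Load JSON from a plain or gzip-compressed (``.gz``) file."""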
    if isinstance(filepath, str):
        filepath = pathlib.Path(filepath)
    open_func = gzip.open if filepath.suffix == ".gz" else open
    with open_func(filepath, "rt", encoding="utf8") as fin:
        data = json.load(fin)
    return data
def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
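    """Dump ``data`` as JSON; extra ``kwargs`` are forwarded to ``json.dump``."""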
    with open(filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False, **kwargs)
def load_jsonlines(filepath, **kwargs):
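    """Read a JSON Lines file into a list; ``kwargs`` are forwarded to ``json.loads``."""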
    data = []
    with open(filepath, "rt", encoding="utf-8") as fin:
        for line in fin:
            data.append(json.loads(line.strip(), **kwargs))
    return data
def dump_jsonlines(obj, filepath, **kwargs):
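    """Write ``obj`` (an iterable of JSON-serialisable items) as JSON Lines."""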
    with open(filepath, "wt", encoding="utf-8") as fout:
        for d in obj:
            line_d = json.dumps(d, ensure_ascii=False, **kwargs)
            fout.write(f"{line_d}\n")
def dump_list_to_markdown_checklist(str_list: list[str], filepath: str | pathlib.Path):
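    """Write each string as an unchecked item (``- [ ] ...``) of a Markdown checklist."""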
    md_string = "".join(f"- [ ] {string}\n" for string in str_list)
    if isinstance(filepath, str):
        filepath = pathlib.Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, "wt", encoding="utf8") as fout:
        fout.write(md_string)
def dump_paper_list_to_markdown_checklist(papers: list[Paper], filepath: str | pathlib.Path):
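    """Render papers as ``[VENUE, year] [title](url)`` checklist items and write them out."""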
    string_list = [
        f"[{paper.venue.upper()}, {paper.year}] [{paper.title}]({paper.url})"
        for paper in papers
    ]
    dump_list_to_markdown_checklist(string_list, filepath)
def dump_paper_list_to_jsonlines(papers: list[Paper], filepath: str | pathlib.Path):
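    """Serialise papers via ``Paper.as_dict`` and write them as JSON Lines."""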
    dump_jsonlines([paper.as_dict() for paper in papers], filepath)
if __name__ == "__main__":
    parse_bib(
        pathlib.Path("cache/anthology+abstracts.bib.gz"),
        pathlib.Path("cache/anthology+abstracts.json.gz"),
    )