import functools
import gzip
import json
import pathlib
import re
import shutil

import requests
from tqdm.auto import tqdm

from src.interfaces import Paper


def download(url: str, filepath: str) -> pathlib.Path:
    """Download the file at `url` to `filepath`, showing a progress bar.

    Returns:
        pathlib.Path of the saved file
    """
    r = requests.get(url, stream=True, allow_redirects=True)
    if r.status_code != 200:
        r.raise_for_status()  # Only raises for 4xx/5xx, so fall through for other non-200 codes
        raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
    file_size = int(r.headers.get("Content-Length", 0))
    path = pathlib.Path(filepath).expanduser().resolve()
    path.parent.mkdir(parents=True, exist_ok=True)
    desc = "(Unknown total file size)" if file_size == 0 else ""
    r.raw.read = functools.partial(
        r.raw.read, decode_content=True
    )  # Decompress if needed
    with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
        with path.open("wb") as f:
            shutil.copyfileobj(r_raw, f)
    return path


def parse_bib(
    input_filepath: pathlib.Path, output_filepath: pathlib.Path
) -> list[dict]:
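    """Parse an (optionally gzip-compressed) BibTeX dump into a list of dicts.

    Each entry becomes {"type": ..., "key": ..., "content": {...}}; the full list
    is also written as JSON to `output_filepath` (using the same compression as
    the input, since a single open function is chosen for both files).
    """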
    if input_filepath.suffix == ".gz":
        open_func = gzip.open
    else:
        open_func = open
    data = []
    with open_func(input_filepath, "rt", encoding="utf8") as fin:
        tot_bib_string = fin.read()
    tot_bib_string = re.sub(
        r" and\n\s+", " and ", tot_bib_string, flags=re.MULTILINE
    )
    tot_entries = tot_bib_string.count("@")
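    # Each entry looks like "@type{key,\n  field = value,\n  ...}"; the regex
    # below captures the entry type, citation key, and the block of fields.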
    for bib in tqdm(
        re.finditer(
            r"@(\w+)\{(.+?),\n(.*?)\}$",
            tot_bib_string,
            flags=re.MULTILINE | re.DOTALL,
        ),
        desc="parse bib",
        total=tot_entries,
    ):
        bib_type = bib.group(1)
        bib_key = bib.group(2)
        bib_content = {}
        content_string = bib.group(3).strip()
        for val in re.finditer(
            r"\s*(.*?)\s*=\s*(.+?),$\n", content_string, flags=re.MULTILINE
        ):
            bib_content[val.group(1).strip()] = (
                val.group(2).strip().removeprefix('"').removesuffix('"')
            )
        ins = {"type": bib_type, "key": bib_key, "content": bib_content}
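        # Mirror the venue name (journal or booktitle) into the "volume" field,
        # presumably so downstream code can read the venue from a single key.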
        if bib_type == "article":
            ins["content"]["volume"] = ins["content"]["journal"]
        elif bib_type == "inproceedings":
            ins["content"]["volume"] = ins["content"]["booktitle"]
        data.append(ins)
    with open_func(output_filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False)
    return data


# fmt: off
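# Month names and common three-letter abbreviations mapped to month numbers.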
MONTH_MAP = {
    "january": 1, "february": 2, "march": 3, "april": 4, "may": 5, "june": 6, "july": 7, "august": 8, "september": 9, "october": 10, "november": 11, "december": 12,
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6, "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
}
# fmt: on


def parse_bib_month(month: str) -> int:
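    """Convert a BibTeX month value ("3", "mar", "March", ...) to an integer.

    Returns 99 for values that are neither digits nor recognized month names.
    """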
    if month.isdigit():
        return int(month)
    elif month.lower() in MONTH_MAP:
        return MONTH_MAP[month.lower()]
    else:
        return 99


def load_json(filepath: str | pathlib.Path) -> dict | list:
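    """Load JSON from `filepath`, transparently decompressing `.gz` files."""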
    if isinstance(filepath, str):
        filepath = pathlib.Path(filepath)
    if filepath.suffix == ".gz":
        open_func = gzip.open
    else:
        open_func = open
    with open_func(filepath, "rt", encoding="utf8") as fin:
        data = json.load(fin)
    return data


def dump_json(data: list | dict, filepath: str | pathlib.Path, **kwargs):
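    """Write `data` to `filepath` as UTF-8 JSON; extra kwargs go to json.dump."""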
    with open(filepath, "wt", encoding="utf8") as fout:
        json.dump(data, fout, ensure_ascii=False, **kwargs)


def load_jsonlines(filepath, **kwargs):
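    """Read a JSON Lines file into a list, parsing one object per line."""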
    data = list()
    with open(filepath, "rt", encoding="utf-8") as fin:
        for line in fin:
            line_data = json.loads(line.strip(), **kwargs)
            data.append(line_data)
    return data


def dump_jsonlines(obj, filepath, **kwargs):
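    """Write an iterable of JSON-serializable objects to `filepath` as JSON Lines."""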
    with open(filepath, "wt", encoding="utf-8") as fout:
        for d in obj:
            line_d = json.dumps(d, ensure_ascii=False, **kwargs)
            fout.write(f"{line_d}\n")


def dump_list_to_markdown_checklist(str_list: list[str], filepath: str | pathlib.Path):
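    """Write the strings to `filepath` as unchecked Markdown checklist items."""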
md_string = "" | |
for string in str_list: | |
md_string += f"- [ ] {string}\n" | |
if isinstance(filepath, str): | |
filepath = pathlib.Path(filepath) | |
if not filepath.parent.exists(): | |
filepath.parent.mkdir(parents=True) | |
with open(filepath, "wt", encoding="utf8") as fout: | |
fout.write(f"{md_string}") | |


def dump_paper_list_to_markdown_checklist(papers: list[Paper], filepath: str | pathlib.Path):
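    """Render papers as "[VENUE, year] [title](url)" Markdown checklist items."""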
    string_list = [
        f"[{paper.venue.upper()}, {paper.year}] [{paper.title}]({paper.url})"
        for paper in papers
    ]
    dump_list_to_markdown_checklist(string_list, filepath)


def dump_paper_list_to_jsonlines(papers: list[Paper], filepath: str | pathlib.Path):
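    """Serialize papers to a JSON Lines file via their `as_dict()` representation."""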
    dump = []
    for paper in papers:
        dump.append(paper.as_dict())
    dump_jsonlines(dump, filepath)


if __name__ == "__main__":
    parse_bib(
        pathlib.Path("cache/anthology+abstracts.bib.gz"),
        pathlib.Path("cache/anthology+abstracts.json.gz"),
    )