daily-papers / papers.py
hysts's picture
hysts HF staff
Add filter by date
4a02364
raw
history blame
3.46 kB
import dataclasses
import datetime
import operator
import pathlib
import pandas as pd
import requests
import tqdm.auto
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of one paper.

    On construction ``published_at`` is rewritten through
    :meth:`convert_timestamp`, so the stored value is in
    ``YYYY/MM/DD HH:MM:SS`` form whenever the input could be parsed.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        """Replace ``published_at`` with its reformatted representation."""
        reformatted = PaperInfo.convert_timestamp(self.published_at)
        # The dataclass is frozen, so ordinary attribute assignment raises;
        # object.__setattr__ bypasses that guard during initialisation.
        object.__setattr__(self, "published_at", reformatted)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat an ISO-like timestamp for display.

        Args:
            timestamp: Text expected to look like ``2023-05-04T12:34:56.000Z``.

        Returns:
            The timestamp as ``YYYY/MM/DD HH:MM:SS``; if the input does not
            match the expected layout it is returned unchanged.
        """
        source_layout = "%Y-%m-%dT%H:%M:%S.%fZ"
        display_layout = "%Y/%m/%d %H:%M:%S"
        try:
            moment = datetime.datetime.strptime(timestamp, source_layout)
            return moment.strftime(display_layout)
        except ValueError:
            # Unrecognised layout: keep whatever text was supplied.
            return timestamp
def get_df(path: pathlib.Path | str) -> pd.DataFrame:
    """Load a CSV of papers and enrich each row via the Hugging Face papers API.

    Every CSV column is read as a string (missing cells become ``""``) and is
    forwarded to ``PaperInfo`` as a keyword argument, so the CSV columns must
    be exactly the ``PaperInfo`` fields that are not filled in here
    (``title``, ``paper_page``, ``upvotes`` and ``published_at`` come from
    the API response or are derived from ``arxiv_id``).

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not respond in time.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    paper_info = []
    # A single session lets the HTTPS connection be reused across papers
    # instead of being re-established for every request.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            # Without a timeout a stalled connection would block forever.
            response = session.get(
                f"https://huggingface.co/api/papers/{row.arxiv_id}",
                timeout=30,
            )
            # Fail with the HTTP error itself rather than a later KeyError
            # on a payload that lacks the expected keys.
            response.raise_for_status()
            res = response.json()
            paper_info.append(
                PaperInfo(
                    **row,
                    title=res["title"],
                    paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                    upvotes=res["upvotes"],
                    published_at=res["publishedAt"],
                )
            )
    # Pin the columns so an empty input still yields the expected schema.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
new_rows = []
for _, row in df.iterrows():
new_row = dict(row) | {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
"github": self.get_github_link(row.github),
}
new_rows.append(new_row)
return pd.DataFrame(new_rows, columns=df.columns)
class PaperList:
    """Holds a papers DataFrame together with its prettified display form."""

    # Each entry is [column name, datatype label]; the order here is the
    # column order of every prettified frame this class returns.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store *df* as ``df_raw`` and build ``df_prettified`` from it."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names listed in ``COLUMN_INFO``, in order."""
        return [entry[0] for entry in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels listed in ``COLUMN_INFO``, in order."""
        return [entry[1] for entry in self.COLUMN_INFO]

    def filter_by_date(self, start_date: datetime.datetime, end_date: datetime.datetime) -> pd.DataFrame:
        """Return the prettified rows whose ``date`` lies in ``[start_date, end_date]``.

        Both bounds are inclusive. ``df_raw`` itself is not modified. Dates of
        the selected rows are rendered as ``YYYY-MM-DD`` before prettifying.
        """
        parsed_dates = pd.to_datetime(self.df_raw["date"])
        # between() is inclusive on both ends by default.
        in_range = parsed_dates.between(start_date, end_date)
        selected = self.df_raw[in_range].assign(
            date=parsed_dates[in_range].dt.strftime("%Y-%m-%d")
        )
        return self._prettifier(selected).loc[:, self.column_names]