Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,625 Bytes
08080f2 5884212 08080f2 e2797b8 08080f2 857ce49 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 5884212 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 e2797b8 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 4a02364 57102fb 4a02364 57102fb 4a02364 57102fb 4a02364 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import dataclasses
import datetime
import operator
import re

import datasets
import pandas as pd
import requests
import tqdm.auto
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of a single paper.

    ``published_at`` is normalised on construction: an ISO-8601 UTC timestamp
    such as ``2023-05-04T17:59:42.000Z`` is stored as ``2023/05/04 17:59:42``.
    Values that cannot be parsed are stored unchanged.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        # The dataclass is frozen, so the normalised value has to be written
        # through object.__setattr__ instead of a plain attribute assignment.
        object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat an ISO-8601 ``...Z`` timestamp as ``YYYY/MM/DD HH:MM:SS``.

        Timestamps with and without fractional seconds are both accepted.

        Args:
            timestamp: Timestamp string to convert.

        Returns:
            The reformatted timestamp, or ``timestamp`` unchanged when it
            matches none of the accepted formats.
        """
        # strptime's "%f" requires at least one fractional digit, so a
        # timestamp without fractional seconds needs its own format.
        accepted_formats = ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ")
        for fmt in accepted_formats:
            try:
                parsed = datetime.datetime.strptime(timestamp, fmt)
            except ValueError:
                continue
            return parsed.strftime("%Y/%m/%d %H:%M:%S")
        return timestamp
def get_df(timeout: float = 30.0) -> pd.DataFrame:
    """Build the paper table from the daily-papers dataset and the papers API.

    Loads the ``train`` split of ``hysts-internal/daily-papers``, drops its
    ``title`` column, and for every row fetches ``title``, ``upvotes`` and
    ``publishedAt`` from ``https://huggingface.co/api/papers/<arxiv_id>``.

    Args:
        timeout: Seconds to wait for each API request before giving up, so a
            stalled connection cannot block the whole load indefinitely.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field. The columns are present even when the dataset is empty.
    """
    df = datasets.load_dataset("hysts-internal/daily-papers")["train"].to_pandas()
    df = df.drop(columns=["title"])

    paper_info = []
    # One session for all requests lets the HTTP connection be reused.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            res = session.get(f"https://huggingface.co/api/papers/{row.arxiv_id}", timeout=timeout).json()
            info = PaperInfo(
                # The remaining dataset columns are passed through as
                # PaperInfo fields; "title" was dropped above because it is
                # supplied from the API response instead.
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            paper_info.append(info)

    # Explicit columns keep the schema stable when there are no rows.
    column_names = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=column_names)
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
new_rows = []
for _, row in df.iterrows():
new_row = dict(row) | {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
"github": self.get_github_link(row.github),
}
new_rows.append(new_row)
return pd.DataFrame(new_rows, columns=df.columns)
class PaperList:
    """Keeps the raw paper table plus a prettified view and supports searching it."""

    # [column name, datatype] pairs, in output column order.
    # NOTE(review): the datatype strings look like UI table column types;
    # confirm against the consumer of `column_datatype`.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and precompute its prettified form restricted to ``column_names``."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Column names, i.e. the first item of each ``COLUMN_INFO`` entry."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        """Column datatypes, i.e. the second item of each ``COLUMN_INFO`` entry."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def search(self, start_date: datetime.datetime, end_date: datetime.datetime, title: str) -> pd.DataFrame:
        """Return the prettified papers within a date range whose title matches.

        Args:
            start_date: Inclusive lower bound on the ``date`` column.
            end_date: Inclusive upper bound on the ``date`` column.
            title: Case-insensitive pattern to look for in ``title``. It is
                used as a regular expression when it compiles as one, and as
                a literal substring otherwise, so text such as ``"c++ ("``
                does not raise.

        Returns:
            The matching rows passed through the prettifier and restricted to
            ``column_names``. ``self.df_raw`` is left unmodified.
        """
        # Filter by date
        dates = pd.to_datetime(self.df_raw["date"])
        in_range = (dates >= start_date) & (dates <= end_date)
        # Copy the filtered rows so the column assignment below writes to an
        # independent frame and does not trigger SettingWithCopyWarning.
        df = self.df_raw.loc[in_range].copy()
        df["date"] = dates.loc[in_range].dt.strftime("%Y-%m-%d")

        # Filter by title
        try:
            re.compile(title)
        except re.error:
            # Not a valid pattern: match the text literally.
            use_regex = False
        else:
            use_regex = True
        # na=False makes rows with a missing title count as non-matching
        # rather than putting NaN into the boolean mask.
        matches = df["title"].str.contains(title, case=False, regex=use_regex, na=False)
        df = df[matches]

        df_prettified = self._prettifier(df).loc[:, self.column_names]
        return df_prettified
|