Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,109 Bytes
08080f2 5884212 08080f2 e2797b8 08080f2 5884212 5c4c264 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 e2797b8 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 9fb4b90 08080f2 4a02364 5c4c264 4a02364 57102fb 4a02364 57102fb 5c4c264 57102fb 4a02364 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import dataclasses
import datetime
import operator
import datasets
import pandas as pd
import tqdm.auto
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """A single paper entry from the merged daily-papers datasets.

    Constructed in ``get_df`` by unpacking one row of the merged dataframe
    plus a computed ``paper_page`` URL. Frozen, so instances are immutable
    and hashable.
    """

    # Paper date as stored in the source dataset (string form) -- TODO confirm format.
    date: str
    # arXiv identifier; also used to build the paper_page URL.
    arxiv_id: str
    # GitHub link; presumably empty when the paper has none -- verify against dataset.
    github: str
    # Paper title.
    title: str
    # Hugging Face paper page: https://huggingface.co/papers/<arxiv_id>
    paper_page: str
    # Upvote count from the upvotes dataset.
    upvotes: int
def get_df() -> pd.DataFrame:
    """Load and merge the daily-papers datasets into a single DataFrame.

    Joins the papers dataset with the upvotes dataset on ``arxiv_id`` and
    attaches a computed Hugging Face paper-page URL to every row.

    Returns:
        A DataFrame with one row per paper, columns in ``PaperInfo`` field order.
    """
    papers = datasets.load_dataset("hysts-bot-data/daily-papers")["train"].to_pandas()
    upvotes = datasets.load_dataset("hysts-bot-data/daily-papers-upvotes")["train"].to_pandas()
    merged = pd.merge(left=papers, right=upvotes, on="arxiv_id")

    # Materialize each row through the PaperInfo dataclass so the output
    # columns follow its declared field order.
    records = [
        PaperInfo(
            **row,
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        for _, row in tqdm.auto.tqdm(merged.iterrows(), total=len(merged))
    ]
    return pd.DataFrame([dataclasses.asdict(record) for record in records])
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
new_rows = []
for _, row in df.iterrows():
new_row = dict(row) | {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
"github": self.get_github_link(row.github),
}
new_rows.append(new_row)
return pd.DataFrame(new_rows, columns=df.columns)
class PaperList:
    """Holds the raw papers table and serves prettified / filtered views of it."""

    # (column name, Gradio datatype) pairs, in display order.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store the raw table and precompute its prettified display form."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self):
        """Display column names, in order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        """Datatype tags parallel to ``column_names``."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
    ) -> pd.DataFrame:
        """Return prettified rows in [start_date, end_date] whose title matches the query.

        Args:
            start_date: Inclusive lower bound for the paper date.
            end_date: Inclusive upper bound for the paper date.
            title_search_query: Case-insensitive pattern matched against titles
                (interpreted as a regular expression by ``str.contains``).

        Returns:
            The matching rows, prettified and restricted to the display columns.
        """
        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])
        # Filter by date; `.copy()` so the assignment below does not write into
        # a slice of the original frame (avoids SettingWithCopyWarning).
        df = df[(df["date"] >= start_date) & (df["date"] <= end_date)].copy()
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")
        # Filter by title. `na=False` excludes rows with a missing title instead
        # of putting NaN into the mask, which would raise on boolean indexing.
        df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
        return self._prettifier(df).loc[:, self.column_names]
|