Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import dataclasses | |
import datetime | |
import operator | |
import pathlib | |
import pandas as pd | |
import requests | |
import tqdm.auto | |
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of a single paper.

    Instances are built with keyword arguments and converted back to plain
    dictionaries with ``dataclasses.asdict``; both require the dataclass
    decorator. The record is frozen, so ``__post_init__`` has to go through
    ``object.__setattr__`` to normalize ``published_at``.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    # Stored as 'YYYY/MM/DD HH:MM:SS' when the incoming value parses,
    # otherwise kept exactly as it was passed in.
    published_at: str

    def __post_init__(self):
        """Normalize ``published_at`` once the generated ``__init__`` ran."""
        # A frozen dataclass blocks normal attribute assignment, so bypass
        # its __setattr__ for this one-off normalization.
        object.__setattr__(self, 'published_at',
                           PaperInfo.convert_timestamp(self.published_at))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat an ISO-style timestamp for display.

        Args:
            timestamp: Text in the form ``%Y-%m-%dT%H:%M:%S.%fZ``,
                e.g. ``2023-05-01T12:34:56.000Z``.

        Returns:
            The timestamp rendered as ``%Y/%m/%d %H:%M:%S``, or the input
            unchanged when it does not match the expected format.
        """
        try:
            return datetime.datetime.strptime(
                timestamp,
                '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y/%m/%d %H:%M:%S')
        except ValueError:
            # Best effort: keep whatever text was supplied.
            return timestamp
def get_df(path: pathlib.Path | str, timeout: float = 30.0) -> pd.DataFrame:
    """Read a CSV of papers and enrich every row via the papers API.

    Each CSV row is expanded into ``PaperInfo`` keyword arguments, so the
    CSV columns must be exactly the ``PaperInfo`` fields that are not
    supplied here (``title``, ``paper_page``, ``upvotes`` and
    ``published_at`` come from the API response or are derived).

    Args:
        path: Location of the CSV file; all columns are read as strings and
            missing cells become empty strings.
        timeout: Seconds to wait for each API request before giving up.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field. The columns are present even when the CSV has no rows.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    df = pd.read_csv(path, dtype=str).fillna('')
    paper_info = []
    # One session for the whole run so the HTTP connection is reused.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            arxiv_id = row['arxiv_id']
            response = session.get(
                f'https://huggingface.co/api/papers/{arxiv_id}',
                timeout=timeout)
            # Fail here with a clear HTTP error instead of a KeyError on
            # the missing 'title' key of an error payload.
            response.raise_for_status()
            res = response.json()
            paper_info.append(
                PaperInfo(
                    **row,
                    title=res['title'],
                    paper_page=f'https://huggingface.co/papers/{arxiv_id}',
                    upvotes=res['upvotes'],
                    published_at=res['publishedAt']))
    # Passing the columns explicitly keeps the schema stable for an empty
    # input; for non-empty input the order equals the field order anyway.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info],
                        columns=columns)
class Prettifier: | |
def get_github_link(link: str) -> str: | |
if not link: | |
return '' | |
return Prettifier.create_link('github', link) | |
def create_link(text: str, url: str) -> str: | |
return f'<a href="{url}" target="_blank">{text}</a>' | |
def to_div(text: str | None, category_name: str) -> str: | |
if text is None: | |
text = '' | |
class_name = f'{category_name}-{text.lower()}' | |
return f'<div class="{class_name}">{text}</div>' | |
def __call__(self, df: pd.DataFrame) -> pd.DataFrame: | |
df = df.sort_values('arxiv_id', ascending=False).reset_index(drop=True) | |
new_rows = [] | |
for _, row in df.iterrows(): | |
new_row = dict(row) | { | |
'date': | |
Prettifier.create_link( | |
row.date, | |
f'https://huggingface.co/papers?date={row.date}'), | |
'paper_page': | |
Prettifier.create_link(row.arxiv_id, row.paper_page), | |
'github': | |
self.get_github_link(row.github), | |
} | |
new_rows.append(new_row) | |
return pd.DataFrame(new_rows, columns=df.columns) | |
class PaperList:
    """Hold the raw paper table and its prettified, display-ready view."""

    # [column name, display datatype] pairs; the order defines the order of
    # the columns in ``df_prettified``.
    COLUMN_INFO = [
        ['date', 'markdown'],
        ['paper_page', 'markdown'],
        ['title', 'str'],
        ['github', 'markdown'],
        ['upvotes', 'number'],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and build the prettified table.

        Args:
            df: Raw paper table; it must contain every column named in
                ``COLUMN_INFO`` plus ``arxiv_id``, which the prettifier
                sorts by.
        """
        self.df_raw = df
        self._prettifier = Prettifier()
        # column_names is a property, so .loc receives a plain list of
        # labels rather than a callable.
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Names of the displayed columns, in display order."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self) -> list[str]:
        """Display datatypes, aligned with ``column_names``."""
        return [datatype for _, datatype in self.COLUMN_INFO]