File size: 3,625 Bytes
08080f2
 
 
 
5884212
08080f2
 
 
 
 
 
 
e2797b8
08080f2
 
 
 
 
857ce49
08080f2
 
9fb4b90
08080f2
 
 
 
9fb4b90
08080f2
 
 
 
5884212
 
 
08080f2
 
9fb4b90
08080f2
 
9fb4b90
 
 
 
 
08080f2
 
 
 
 
 
 
 
9fb4b90
 
08080f2
 
 
e2797b8
08080f2
 
 
 
9fb4b90
 
08080f2
 
 
9fb4b90
08080f2
 
 
9fb4b90
 
 
08080f2
 
 
 
 
 
 
9fb4b90
 
 
 
 
08080f2
 
 
 
 
 
 
 
 
 
 
 
 
 
4a02364
57102fb
4a02364
 
57102fb
 
4a02364
 
57102fb
 
 
 
4a02364
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import dataclasses
import datetime
import operator
import re

import datasets
import pandas as pd
import requests
import tqdm.auto


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record describing one paper row.

    ``published_at`` is normalized on construction: a value in the
    ``%Y-%m-%dT%H:%M:%S.%fZ`` layout is rewritten as ``%Y/%m/%d %H:%M:%S``;
    any other string is stored unchanged.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self):
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is the way to store the normalized value.
        normalized = PaperInfo.convert_timestamp(self.published_at)
        object.__setattr__(self, "published_at", normalized)

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat *timestamp* for display, or return it untouched.

        Args:
            timestamp: Text expected in the ``%Y-%m-%dT%H:%M:%S.%fZ`` layout.

        Returns:
            The timestamp rendered as ``%Y/%m/%d %H:%M:%S``, or the input
            string itself when it does not parse (``ValueError``).
        """
        source_layout = "%Y-%m-%dT%H:%M:%S.%fZ"
        display_layout = "%Y/%m/%d %H:%M:%S"
        try:
            parsed = datetime.datetime.strptime(timestamp, source_layout)
            return parsed.strftime(display_layout)
        except ValueError:
            return timestamp


def get_df(timeout: float = 30.0) -> pd.DataFrame:
    """Load the daily-papers dataset and enrich each row via the papers API.

    The ``hysts-internal/daily-papers`` train split is loaded, its ``title``
    column is dropped, and for every row the paper metadata is fetched from
    ``https://huggingface.co/api/papers/<arxiv_id>`` to fill in ``title``,
    ``upvotes`` and ``published_at``.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per paper and one column per ``PaperInfo``
        field (the columns are present even when the dataset is empty).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    df = datasets.load_dataset("hysts-internal/daily-papers")["train"].to_pandas()
    df = df.drop(columns=["title"])
    paper_info = []
    # One session for the whole loop so the HTTP connection is reused.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            response = session.get(
                f"https://huggingface.co/api/papers/{row.arxiv_id}",
                timeout=timeout,
            )
            # Fail loudly on 4xx/5xx instead of hitting a KeyError on the
            # missing "title" key of an error payload.
            response.raise_for_status()
            res = response.json()
            # NOTE(review): **row assumes the remaining dataset columns are
            # exactly the other PaperInfo fields — confirm against the dataset.
            info = PaperInfo(
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            paper_info.append(info)
    # Explicit columns keep the schema stable when there are no rows.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)


class Prettifier:
    @staticmethod
    def get_github_link(link: str) -> str:
        if not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        if text is None:
            text = ""
        class_name = f"{category_name}-{text.lower()}"
        return f'<div class="{class_name}">{text}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
        new_rows = []
        for _, row in df.iterrows():
            new_row = dict(row) | {
                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                "github": self.get_github_link(row.github),
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows, columns=df.columns)


class PaperList:
    """Hold the raw paper table and serve prettified, filtered views of it."""

    # (column name, datatype label) pairs, in display order.
    # NOTE(review): the labels look like UI table datatypes — confirm with the caller.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store *df* and precompute its prettified view limited to the display columns."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        """Datatype labels of the displayed columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    @staticmethod
    def _title_mask(titles: pd.Series, query: str) -> pd.Series:
        """Return a boolean mask of the titles matching *query*, case-insensitively.

        *query* is used as a regular expression when it compiles as one and as
        a literal substring otherwise, so input such as "c++" or "(" filters
        instead of raising. Missing titles never match.
        """
        try:
            re.compile(query)
            as_regex = True
        except re.error:
            as_regex = False
        return titles.str.contains(query, case=False, na=False, regex=as_regex)

    def search(self, start_date: datetime.datetime, end_date: datetime.datetime, title: str) -> pd.DataFrame:
        """Return the prettified rows within a date range whose title matches.

        Args:
            start_date: Earliest date to keep (inclusive).
            end_date: Latest date to keep (inclusive).
            title: Case-insensitive title filter; a falsy value disables the
                title filter.

        Returns:
            The prettified rows restricted to the display columns.
        """
        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])

        # Filter by date. The explicit copy keeps the assignment below from
        # writing into a slice, which pandas reports as SettingWithCopyWarning.
        in_range = (df["date"] >= start_date) & (df["date"] <= end_date)
        df = df.loc[in_range].copy()
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")

        # Filter by title
        if title:
            df = df[self._title_mask(df["title"], title)]

        return self._prettifier(df).loc[:, self.column_names]