File size: 3,109 Bytes
08080f2
 
 
 
5884212
08080f2
 
 
 
 
 
e2797b8
08080f2
 
 
 
 
 
 
5884212
5c4c264
 
 
 
 
08080f2
 
 
 
9fb4b90
 
08080f2
 
 
 
 
 
 
 
9fb4b90
 
08080f2
 
 
e2797b8
08080f2
 
 
 
9fb4b90
 
08080f2
 
 
 
 
 
9fb4b90
 
 
08080f2
 
 
 
 
 
 
9fb4b90
 
 
 
 
08080f2
 
 
 
 
 
 
 
 
 
 
 
 
 
4a02364
5c4c264
 
 
 
 
 
4a02364
 
57102fb
 
4a02364
 
57102fb
 
5c4c264
57102fb
4a02364
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import dataclasses
import datetime
import operator
import re

import datasets
import pandas as pd
import tqdm.auto


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record for a single paper row.

    ``get_df`` builds one instance per merged dataset row and converts it
    back to a dict with ``dataclasses.asdict``; constructing the dataclass
    therefore acts as a check that each row carries exactly these fields.
    The annotations are not enforced at runtime.
    """

    # Date string; it is interpolated into the "papers?date=" URL and parsed
    # with pd.to_datetime elsewhere in this file (presumably YYYY-MM-DD -- TODO confirm).
    date: str
    # Identifier used as the merge key and as the last path segment of paper_page.
    arxiv_id: str
    # Repository URL; a falsy value (e.g. "") is rendered as no link by Prettifier.
    github: str
    # Paper title; PaperList.search filters on this field.
    title: str
    # URL of the form https://huggingface.co/papers/<arxiv_id>, filled in by get_df.
    paper_page: str
    # Upvote count (presumably supplied by the upvotes dataset -- verify against the data).
    upvotes: int


def get_df() -> pd.DataFrame:
    """Load the paper and upvote datasets and return one combined table.

    The "train" splits of ``hysts-bot-data/daily-papers`` and
    ``hysts-bot-data/daily-papers-upvotes`` are joined on ``arxiv_id`` using
    ``pd.merge``'s default join type. Every merged row is passed through
    ``PaperInfo`` (which also receives the derived ``paper_page`` URL), so a
    row with missing or unexpected fields raises ``TypeError``.

    Returns:
        A DataFrame whose columns are the ``PaperInfo`` fields, in field order.
    """
    papers = datasets.load_dataset("hysts-bot-data/daily-papers")["train"].to_pandas()
    upvotes = datasets.load_dataset("hysts-bot-data/daily-papers-upvotes")["train"].to_pandas()
    merged = pd.merge(left=papers, right=upvotes, on="arxiv_id")

    # Round-trip each row through PaperInfo so the schema is checked, while
    # tqdm reports progress over the merged rows.
    records = [
        dataclasses.asdict(
            PaperInfo(
                **row,
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
            )
        )
        for _, row in tqdm.auto.tqdm(merged.iterrows(), total=len(merged))
    ]
    return pd.DataFrame(records)


class Prettifier:
    @staticmethod
    def get_github_link(link: str) -> str:
        if not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        if text is None:
            text = ""
        class_name = f"{category_name}-{text.lower()}"
        return f'<div class="{class_name}">{text}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        new_rows = []
        for _, row in df.iterrows():
            new_row = dict(row) | {
                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                "github": self.get_github_link(row.github),
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows, columns=df.columns)


class PaperList:
    """Searchable, display-ready view over the paper table.

    ``COLUMN_INFO`` lists the ``[column name, datatype label]`` pairs of the
    displayed columns, in display order.
    """

    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Keep the raw table and pre-compute its prettified display form.

        Args:
            df: Raw paper table; it must contain the columns read by
                ``Prettifier`` as well as every column in ``COLUMN_INFO``.
        """
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Names of the displayed columns, in display order."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self) -> list[str]:
        """Datatype labels of the displayed columns, aligned with ``column_names``."""
        return [datatype for _, datatype in self.COLUMN_INFO]

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
    ) -> pd.DataFrame:
        """Return the prettified rows matching a date range and a title query.

        Args:
            start_date: Inclusive lower bound on the paper date.
            end_date: Inclusive upper bound on the paper date.
            title_search_query: Case-insensitive title filter. It is applied
                as a regular expression when it compiles as one; otherwise it
                is matched as a literal substring, so input such as ``"("``
                does not raise.

        Returns:
            The matching rows restricted to ``column_names``. Rows whose
            title is missing never match. ``df_raw`` is left unmodified.
        """
        # Filter by date. Parse into a separate Series so df_raw is untouched.
        dates = pd.to_datetime(self.df_raw["date"])
        in_range = (dates >= start_date) & (dates <= end_date)
        # Copy explicitly: assigning into a filtered slice triggers pandas'
        # SettingWithCopyWarning.
        df = self.df_raw.loc[in_range].copy()
        df["date"] = dates.loc[in_range].dt.strftime("%Y-%m-%d")

        # Filter by title. A query that is not a valid regular expression
        # would make str.contains raise re.error, so match it literally.
        try:
            re.compile(title_search_query)
        except re.error:
            use_regex = False
        else:
            use_regex = True
        # na=False: a missing title would otherwise yield NA in the mask, and
        # boolean indexing with NA raises ValueError.
        title_match = df["title"].str.contains(
            title_search_query,
            case=False,
            regex=use_regex,
            na=False,
        )
        df = df.loc[title_match]

        return self._prettifier(df).loc[:, self.column_names]