#!/usr/bin/env python

import datetime
import operator
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from ragatouille import RAGPretrainedModel

import gradio as gr
from gradio_calendar import Calendar
import datasets

# --- Data Loading and Processing ---

api = HfApi()

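# The abstract search index is a prebuilt ColBERT index published on the Hub as a
# dataset repo; mirroring it into RAGatouille's default index directory lets
# RAGPretrainedModel.from_index() load it straight from disk.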
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
api.snapshot_download(
    repo_id=INDEX_REPO_ID,
    repo_type="dataset",
    local_dir=INDEX_DIR_PATH,
)
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
# Warm-up query: the first search loads the model and index, so run it once at startup
abstract_retriever.search("LLM")


def update_abstract_index() -> None:
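    """Re-download the abstract index from the Hub and reload the retriever."""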
    global abstract_retriever

    api.snapshot_download(
        repo_id=INDEX_REPO_ID,
        repo_type="dataset",
        local_dir=INDEX_DIR_PATH,
    )
    abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
    abstract_retriever.search("LLM")


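# Rebuild the retriever hourly; misfire_grace_time lets a run that was missed
# (e.g. while the process was busy) still fire up to 3 minutes late.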
scheduler_abstract = BackgroundScheduler()
scheduler_abstract.add_job(
    func=update_abstract_index,
    trigger="cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=3 * 60,
)
scheduler_abstract.start()


def get_df() -> pd.DataFrame:
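    """Merge paper metadata with stats, most recent first, and add paper page URLs."""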
    df = pd.merge(
        left=datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    df = df[::-1].reset_index(drop=True)
    df["date"] = df["date"].dt.strftime("%Y-%m-%d")

    paper_info = []
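    # Drop the abstract from the display data and link each paper's Hugging Face page.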
    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
        info = row.copy()
        del info["abstract"]
        info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
        paper_info.append(info)
    return pd.DataFrame(paper_info)


class Prettifier:
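    """Renders raw paper rows as the HTML/markdown cells shown in the results table."""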
    @staticmethod
    def get_github_link(link: str) -> str:
        if not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        if text is None:
            text = ""
        class_name = f"{category_name}-{text.lower()}"
        return f'<div class="{class_name}">{text}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        new_rows = []
        for _, row in df.iterrows():
            new_row = {
                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                "title": row["title"],
                "github": self.get_github_link(row.github),
                "๐Ÿ‘": row["upvotes"],
                "๐Ÿ’ฌ": row["num_comments"],
            }
            new_rows.append(new_row)
        return pd.DataFrame(new_rows)


class PaperList:
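    """Holds the raw paper dataframe plus a prettified display copy, and implements search."""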
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["๐Ÿ‘", "number"],
        ["๐Ÿ’ฌ", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self):
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
    ) -> pd.DataFrame:
        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])

        # Filter by date
        df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")

        # Filter by title
        if title_search_query:
            df = df[df["title"].str.contains(title_search_query, case=False)]

        # Filter by abstract
        if abstract_search_query:
            results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
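            # Results come back ranked best-first; keep only papers that survived the
            # date/title filters above, deduplicating while preserving rank order.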
            remaining_ids = set(df["arxiv_id"])
            found_id_set = set()
            found_ids = []
            for x in results:
                arxiv_id = x["document_id"]
                if arxiv_id not in remaining_ids:
                    continue
                if arxiv_id in found_id_set:
                    continue
                found_id_set.add(arxiv_id)
                found_ids.append(arxiv_id)
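            # Reorder the surviving rows by retrieval relevance instead of date.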
            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()

        df_prettified = self._prettifier(df).loc[:, self.column_names]
        return df_prettified


paper_list = PaperList(get_df())
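# A sketch of a direct search call (hypothetical query values):
#   paper_list.search(
#       start_date=datetime.datetime(2024, 1, 1),
#       end_date=datetime.datetime(2024, 1, 31),
#       title_search_query="diffusion",
#       abstract_search_query="image generation",
#       max_num_to_retrieve=100,
#   )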


def update_paper_list() -> None:
    global paper_list
    paper_list = PaperList(get_df())


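# Reload the paper list on the same hourly schedule as the abstract index.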
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    func=update_paper_list,
    trigger="cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()

# --- Gradio App ---

DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"

FOOT_NOTE = """\
Related useful Spaces:
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
"""


def update_df() -> pd.DataFrame:
    return paper_list.df_prettified


def update_num_papers(df: pd.DataFrame) -> str:
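    """Format the '<matching> / <total>' counter shown above the table."""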
    return f"{len(df)} / {len(paper_list.df_raw)}"


def search(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    search_title: str,
    search_abstract: str,
    max_num_to_retrieve: int,
) -> pd.DataFrame:
    return paper_list.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)


with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        search_title = gr.Textbox(label="Search title")
        with gr.Row():
            with gr.Column(scale=4):
                search_abstract = gr.Textbox(
                    label="Search abstract",
                    info="Results may be imprecise, as an abstract does not contain all of a paper's information.",
                )
            with gr.Column(scale=1):
                max_num_to_retrieve = gr.Slider(
                    label="Max number to retrieve",
                    info="This is used only for search on abstracts.",
                    minimum=1,
                    maximum=len(paper_list.df_raw),
                    step=1,
                    value=100,
                )
        with gr.Row():
            start_date = Calendar(label="Start date", type="date", value="2023-05-05")
            end_date = Calendar(label="End date", type="date", value=datetime.datetime.utcnow().strftime("%Y-%m-%d"))

    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
    df = gr.Dataframe(
        value=paper_list.df_prettified,
        datatype=paper_list.column_datatype,
        type="pandas",
        interactive=False,
        height=1000,
        elem_id="table",
        column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
        wrap=True,
    )

    gr.Markdown(FOOT_NOTE)

    # Search button: run the search, then refresh the paper counter
    search_event = gr.Button("Search")
    search_event.click(
        fn=search,
        inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
        outputs=df,
    ).then(
        fn=update_num_papers,
        inputs=df,
        outputs=num_papers,
        queue=False,
    )

    # Automatically trigger search when inputs change
    for trigger in [start_date, end_date, search_title, search_abstract, max_num_to_retrieve]:
        trigger.change(
            fn=search,
            inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
            outputs=df,
        ).then(
            fn=update_num_papers,
            inputs=df,
            outputs=num_papers,
            queue=False,
        )

    # Load the initial dataframe and number of papers
    demo.load(
        fn=update_df,
        outputs=df,
        queue=False,
    ).then(
        fn=update_num_papers,
        inputs=df,
        outputs=num_papers,
        queue=False,
    )

if __name__ == "__main__":
    demo.queue(api_open=False).launch(show_api=False)