#!/usr/bin/env python
import datetime
import operator
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from ragatouille import RAGPretrainedModel
import gradio as gr
from gradio_calendar import Calendar
import datasets
import requests
# --- Data Loading and Processing ---
api = HfApi()
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"


def _load_abstract_retriever() -> RAGPretrainedModel:
    """Download the current index snapshot and return a ready-to-use retriever.

    The snapshot of ``INDEX_REPO_ID`` is downloaded into ``INDEX_DIR_PATH``,
    the retriever is loaded from that directory, and one throwaway query is
    issued so that the retriever is initialized before it is handed out.
    """
    api.snapshot_download(
        repo_id=INDEX_REPO_ID,
        repo_type="dataset",
        local_dir=INDEX_DIR_PATH,
    )
    retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
    # Run once to initialize the retriever.
    retriever.search("LLM")
    return retriever


abstract_retriever = _load_abstract_retriever()


def update_abstract_index() -> None:
    """Replace the module-level ``abstract_retriever`` with a freshly loaded one.

    The global is rebound only after the new retriever has been downloaded,
    loaded and warmed up, so a failure at any step leaves the previous
    retriever in place and readers never see a half-initialized instance.
    """
    global abstract_retriever
    abstract_retriever = _load_abstract_retriever()
# Background job: reload the abstract index once per hour, on the hour (UTC).
scheduler_abstract = BackgroundScheduler()
scheduler_abstract.add_job(
    update_abstract_index,
    "cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=3 * 60,
)
scheduler_abstract.start()
def get_df() -> pd.DataFrame:
    """Load the paper and stats datasets and return one row per paper.

    The two datasets are inner-joined on ``arxiv_id``, the row order is
    reversed, ``date`` is normalized to ``YYYY-MM-DD`` strings, the
    ``abstract`` column is dropped, and a ``paper_page`` URL column is
    appended as the last column.

    Returns:
        The merged frame with a fresh 0..n-1 index.

    Raises:
        KeyError: if the merged data has no ``abstract`` column.
    """
    papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
    df = pd.merge(left=papers, right=stats, on="arxiv_id")
    df = df[::-1].reset_index(drop=True)
    df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
    # Column-wise operations replace the former per-row copy loop; they give
    # the same columns in the same order and also keep the column layout when
    # the merge result is empty.
    df = df.drop(columns=["abstract"])
    df["paper_page"] = "https://huggingface.co/papers/" + df["arxiv_id"].astype(str)
    return df
class Prettifier:
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'{text}'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'
{text}
'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
new_rows = []
for _, row in df.iterrows():
new_row = {
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
"title": row["title"],
"github": self.get_github_link(row.github),
"👍": row["upvotes"],
"💬": row["num_comments"],
}
new_rows.append(new_row)
return pd.DataFrame(new_rows)
class PaperList:
    """Hold the raw paper table plus its display form, and filter it on demand."""

    # (column name, Gradio dataframe datatype) for each displayed column.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettify(df)

    @property
    def column_names(self):
        """Display column names, in order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        """Display column datatypes, aligned with ``column_names``."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def _prettify(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the display form of ``df`` restricted to the display columns.

        ``reindex`` is used instead of ``.loc`` so that an empty, column-less
        result from the prettifier still gets the expected columns instead of
        raising KeyError.
        """
        return self._prettifier(df).reindex(columns=self.column_names)

    @staticmethod
    def _to_day(value):
        """Coerce a date bound to a naive midnight Timestamp, or None if unset.

        Accepts ``str``, ``datetime.date``, ``datetime.datetime`` and
        ``pd.Timestamp``. Any timezone is dropped (wall-clock kept) and the
        time of day is discarded, since paper dates have day granularity.
        """
        if value is None or (isinstance(value, str) and not value.strip()):
            return None
        ts = pd.Timestamp(value)
        if ts is pd.NaT:
            return None
        if ts.tzinfo is not None:
            ts = ts.tz_localize(None)
        return ts.normalize()

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
    ) -> pd.DataFrame:
        """Filter the raw papers and return the matching rows in display form.

        Args:
            start_date: Inclusive lower bound on the paper date; unset means
                no lower bound.
            end_date: Inclusive upper bound on the paper date; unset means
                no upper bound.
            title_search_query: Case-insensitive literal substring to look for
                in titles; empty disables the title filter.
            abstract_search_query: Query for the abstract retriever; empty
                disables the abstract filter. When given, the rows come back
                in the order the retriever ranked them.
            max_num_to_retrieve: Number of hits requested from the retriever.

        Returns:
            The prettified frame, possibly empty. ``self.df_raw`` is left
            untouched.
        """
        dates = pd.to_datetime(self.df_raw["date"])

        # Filter by date
        keep = pd.Series(True, index=dates.index)
        start = self._to_day(start_date)
        end = self._to_day(end_date)
        if start is not None:
            keep &= dates >= start
        if end is not None:
            keep &= dates <= end
        mask = keep.to_numpy()
        # Copy so the assignment below writes to an owned frame, not a slice.
        df = self.df_raw[mask].copy()
        df["date"] = dates[mask].dt.strftime("%Y-%m-%d")

        # Filter by title. The query is matched literally: user text such as
        # "C++" is not a valid regular expression and would otherwise raise.
        # Missing titles never match.
        if title_search_query:
            matches = df["title"].str.contains(
                title_search_query, case=False, regex=False, na=False
            )
            df = df[matches]

        # Filter by abstract
        if abstract_search_query:
            results = abstract_retriever.search(abstract_search_query, k=int(max_num_to_retrieve))
            remaining_ids = set(df["arxiv_id"])
            # dict.fromkeys de-duplicates while keeping the retriever's ranking.
            found_ids = list(
                dict.fromkeys(
                    hit["document_id"] for hit in results if hit["document_id"] in remaining_ids
                )
            )
            df = (
                df[df["arxiv_id"].isin(found_ids)]
                .set_index("arxiv_id")
                .reindex(index=found_ids)
                .reset_index()
            )

        return self._prettify(df)
# Build the initial paper list when the module is loaded.
paper_list = PaperList(get_df())


def update_paper_list() -> None:
    """Reload the datasets and rebind the module-level ``paper_list``.

    Only the global name is rebound; objects that already hold a reference to
    the previous ``PaperList`` keep using that older instance.
    """
    global paper_list
    refreshed = PaperList(get_df())
    paper_list = refreshed
# Background job: rebuild the paper list once per hour, on the hour (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_list,
    "cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
# --- Gradio App ---
# Markdown heading linking to the Daily Papers page.
# NOTE(review): not referenced anywhere in the visible part of this file.
DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
# Markdown footer listing related Spaces; rendered at the bottom of the UI.
FOOT_NOTE = """\
Related useful Spaces:
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
"""
# --- Sorting and Pagination Management ---
class PaperManager:
    """Sort a ``PaperList`` and serve it one page at a time.

    Sorting rewrites ``paper_list.df_raw`` and ``paper_list.df_prettified``
    in place on the wrapped object; paging slices ``df_prettified``.
    """

    # Name of the temporary column used while sorting by hotness. Chosen so it
    # cannot collide with a real data column such as "score".
    _SCORE_COLUMN = "_hot_score"

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.current_page = 1
        self.total_pages = self._count_pages()
        self.sort_method = "hot"  # Default sort method

    def _count_pages(self) -> int:
        """Return the number of pages needed for ``df_raw`` (never less than 1)."""
        num_papers = len(self.paper_list.df_raw)
        return max((num_papers + self.papers_per_page - 1) // self.papers_per_page, 1)

    @staticmethod
    def _as_utc(moment: datetime.datetime) -> datetime.datetime:
        """Return ``moment`` as an aware datetime, treating naive values as UTC."""
        if moment.tzinfo is None:
            return moment.replace(tzinfo=datetime.timezone.utc)
        return moment

    @staticmethod
    def _parse_published(value, fallback: datetime.datetime) -> datetime.datetime:
        """Parse a paper date into an aware datetime, or return ``fallback``.

        Accepts ISO-8601 strings (a trailing "Z" is read as UTC), ``datetime``
        and ``date`` objects. Values without a timezone are taken to be UTC so
        they can be subtracted from an aware "now".
        """
        if isinstance(value, datetime.datetime):
            parsed = value
        elif isinstance(value, datetime.date):
            parsed = datetime.datetime(value.year, value.month, value.day)
        elif isinstance(value, str):
            try:
                parsed = datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))
            except ValueError:
                # If parsing fails, use current time to minimize the impact on sorting
                return fallback
        else:
            return fallback
        return PaperManager._as_utc(parsed)

    def calculate_score(self, paper, now=None):
        """Calculate the hotness score of a paper from its upvotes and age.

        This mimics the "hotness" algorithm used by platforms like Hacker News:
        ``upvotes / (age_in_hours + 2) ** 1.5``.

        Args:
            paper: Mapping-like row (dict or pandas Series) that provides
                ``get``; ``upvotes`` defaults to 0 and a missing or
                unparseable ``date`` is treated as "now".
            now: Reference time; defaults to the current UTC time. Naive
                values are taken to be UTC.

        Returns:
            The score as a float. Dates in the future are clamped to an age
            of zero so the base of the power never becomes negative.
        """
        if now is None:
            now = datetime.datetime.now(datetime.timezone.utc)
        now = self._as_utc(now)
        upvotes = paper.get("upvotes", 0)
        published_time = self._parse_published(paper.get("date"), now)
        time_diff_hours = (now - published_time).total_seconds() / 3600
        time_diff_hours = max(time_diff_hours, 0.0)
        # The +2 offset avoids division by zero for brand-new papers.
        return upvotes / ((time_diff_hours + 2) ** 1.5)

    def sort_papers(self):
        """Re-sort the wrapped list by ``sort_method`` and go back to page 1."""
        df = self.paper_list.df_raw.copy()
        if self.sort_method == "hot":
            # One shared timestamp keeps the scores of a single sort comparable.
            now = datetime.datetime.now(datetime.timezone.utc)
            scores = [self.calculate_score(record, now=now) for record in df.to_dict("records")]
            df_sorted = (
                df.assign(**{self._SCORE_COLUMN: scores})
                .sort_values(by=self._SCORE_COLUMN, ascending=False, kind="stable")
                .drop(columns=[self._SCORE_COLUMN])
            )
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by="date", ascending=False, kind="stable")
        else:
            df_sorted = df
        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).reindex(
            columns=self.paper_list.column_names
        )
        self.total_pages = self._count_pages()
        self.current_page = 1

    def set_sort_method(self, method):
        """Select "hot" or "new" (anything else falls back to "hot") and re-sort."""
        if method not in ["hot", "new"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self):
        """Return the slice of the prettified frame for the current page."""
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        return self.paper_list.df_prettified.iloc[start:end]

    def next_page(self):
        """Advance one page unless already on the last one; return the page."""
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self):
        """Go back one page unless already on the first one; return the page."""
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self):
        """Re-sort with the current method and return the first page."""
        self.sort_papers()
        return self.get_current_page_papers()
# Initialize PaperManager
paper_manager = PaperManager(paper_list)


def refresh_paper_manager():
    """Point the existing manager at the current ``paper_list`` and re-sort.

    The manager is updated in place rather than replaced: the pagination
    handlers are registered as bound methods of this instance, so rebinding
    the global would leave them operating on a stale manager. Keeping the
    instance also keeps the currently selected sort method.

    Returns:
        The first page of the refreshed, re-sorted list.
    """
    paper_manager.paper_list = paper_list
    # sort_papers() also recomputes the page count and resets to page 1.
    paper_manager.sort_papers()
    return paper_manager.get_current_page_papers()
# --- Gradio Interface Functions ---
def update_num_papers(current_df: pd.DataFrame) -> str:
    """Format "<rows currently shown> / <rows in the manager's raw frame>"."""
    shown = len(current_df)
    total = len(paper_manager.paper_list.df_raw)
    return f"{shown} / {total}"
def perform_search(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    search_title: str,
    search_abstract: str,
    max_num_to_retrieve: int,
    sort_method: str,
) -> pd.DataFrame:
    """Sort the full list, filter it, and return the first page of the result.

    ``PaperList.search`` returns an already-prettified frame. That frame is
    installed as the displayed table only; it must not be stored in
    ``df_raw`` or passed through the prettifier again, because it no longer
    has the raw columns (``arxiv_id``, ``upvotes``, ...) and overwriting
    ``df_raw`` would permanently shrink the data to the last search result.

    Ordering: the raw list is sorted first and the date/title filters keep
    row order, so results follow the chosen sort method. When an abstract
    query is given, results come back in the retriever's ranking instead.

    Args:
        start_date: Inclusive lower date bound.
        end_date: Inclusive upper date bound.
        search_title: Title filter; empty disables it.
        search_abstract: Abstract query; empty disables it.
        max_num_to_retrieve: Number of abstract hits to request.
        sort_method: "Hot" or "New" (case-insensitive); a missing value falls
            back to "hot".

    Returns:
        The first page of the filtered, prettified papers.
    """
    # Update sort method (this re-sorts the full raw list).
    paper_manager.set_sort_method((sort_method or "hot").lower())

    # Perform search
    searched_df = paper_manager.paper_list.search(
        start_date, end_date, search_title, search_abstract, max_num_to_retrieve
    )

    # Show the search result and page over it, leaving the raw data intact.
    paper_manager.paper_list.df_prettified = searched_df
    per_page = paper_manager.papers_per_page
    paper_manager.total_pages = max((len(searched_df) + per_page - 1) // per_page, 1)
    paper_manager.current_page = 1
    return paper_manager.get_current_page_papers()
def change_sort_method(method: str) -> pd.DataFrame:
    """Apply the lower-cased sort method and return the resulting current page."""
    normalized = method.lower()
    paper_manager.set_sort_method(normalized)
    page = paper_manager.get_current_page_papers()
    return page
def get_initial_papers() -> pd.DataFrame:
    """Return the page the manager is currently on; used for the initial load."""
    page = paper_manager.get_current_page_papers()
    return page
# --- CSS Styling ---
# Stylesheet passed to gr.Blocks(css=...). It defines a light theme, a
# narrow-screen override (max-width: 640px) and a dark theme selected via
# prefers-color-scheme.
# NOTE(review): apart from `.container` and `#refresh-button`, the selectors
# (.header-table, .itemlist, .rank, .storylink, .subtext, .no-papers) are not
# attached to any component in the visible part of this file - confirm they
# are still needed.
css = """
/* Existing CSS remains unchanged */
body {
background-color: white;
font-family: Verdana, Geneva, sans-serif;
margin: 0;
padding: 0;
}
a {
color: #0000ff;
text-decoration: none;
}
a:visited {
color: #551A8B;
}
.container {
width: 85%;
margin: auto;
}
table {
width: 100%;
}
.header-table {
width: 100%;
background-color: #ff6600;
padding: 2px 10px;
}
.header-table a {
color: black;
font-weight: bold;
font-size: 14pt;
text-decoration: none;
}
.itemlist .athing {
background-color: #f6f6ef;
}
.rank {
font-size: 14pt;
color: #828282;
padding-right: 5px;
}
.storylink {
font-size: 10pt;
}
.subtext {
font-size: 8pt;
color: #828282;
padding-left: 40px;
}
.subtext a {
color: #828282;
text-decoration: none;
}
#refresh-button {
background: none;
border: none;
color: black;
font-weight: bold;
font-size: 14pt;
cursor: pointer;
}
.no-papers {
text-align: center;
color: #828282;
padding: 1rem;
font-size: 14pt;
}
@media (max-width: 640px) {
.header-table a {
font-size: 12pt;
}
.storylink {
font-size: 9pt;
}
.subtext {
font-size: 7pt;
}
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
body {
background-color: #121212;
color: #e0e0e0;
}
a {
color: #add8e6;
}
a:visited {
color: #9370db;
}
.header-table {
background-color: #ff6600;
}
.header-table a {
color: black;
}
.itemlist .athing {
background-color: #1e1e1e;
}
.rank {
color: #b0b0b0;
}
.subtext {
color: #b0b0b0;
}
.subtext a {
color: #b0b0b0;
}
#refresh-button {
color: #e0e0e0;
}
.no-papers {
color: #b0b0b0;
}
}
"""
# --- Initialize Gradio Blocks ---
# NOTE(review): the nesting of the layout below is reconstructed from the
# statement order and comments; confirm it against the intended UI.


def _go_prev_page() -> pd.DataFrame:
    """Page backwards on whichever manager ``paper_manager`` currently names.

    Looking the global up at call time (instead of registering a bound method
    of the instance that exists while the UI is built) keeps the button
    working if ``paper_manager`` is ever rebound.
    """
    return paper_manager.prev_page()


def _go_next_page() -> pd.DataFrame:
    """Page forwards on whichever manager ``paper_manager`` currently names."""
    return paper_manager.next_page()


demo = gr.Blocks(css=css)
with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)
Once your paper is submitted, it will automatically appear in this demo.
""")
        # Header with Refresh Button
        # NOTE(review): the HTML payload is empty here; confirm whether the
        # header markup was lost.
        with gr.Row():
            gr.HTML("""
""")
        # Sorting Options
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New"],
                value="Hot",
                label="Sort By",
                interactive=True,
            )
        # Search and Filter Inputs
        with gr.Group():
            search_title = gr.Textbox(label="Search Title")
            with gr.Row():
                with gr.Column(scale=4):
                    search_abstract = gr.Textbox(
                        label="Search Abstract",
                        info="The result may not be accurate as the abstract does not contain all the information.",
                    )
                with gr.Column(scale=1):
                    max_num_to_retrieve = gr.Slider(
                        label="Max Number to Retrieve",
                        info="This is used only for search on abstracts.",
                        minimum=1,
                        maximum=1000,  # Adjust as needed
                        step=1,
                        value=100,
                    )
            with gr.Row():
                start_date = Calendar(label="Start Date", type="date", value="2023-05-05")
                # Timezone-aware "now" replaces the deprecated utcnow(); the
                # formatted UTC date is the same.
                end_date = Calendar(
                    label="End Date",
                    type="date",
                    value=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
                )
        search_button = gr.Button("Search")
        # Number of Papers Display
        num_papers = gr.Textbox(
            label="Number of Papers",
            value=update_num_papers(paper_manager.get_current_page_papers()),
            interactive=False,
        )
        # Paper List Display
        df_display = gr.DataFrame(
            value=paper_manager.get_current_page_papers(),
            datatype=paper_manager.paper_list.column_datatype,
            type="pandas",
            interactive=False,
            height=600,
            elem_id="table",
            column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
            wrap=True,
        )
        # Pagination Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")
        # Footer
        gr.Markdown(FOOT_NOTE)
        # Hidden Refresh Button
        refresh_button = gr.Button("Refresh", visible=False, elem_id="refresh-hidden")
        refresh_button.click(refresh_paper_manager, outputs=[df_display])
        # Bind the visible Refresh button to the hidden one using JavaScript
        # NOTE(review): the HTML payload is empty here as well; confirm
        # whether the script was lost.
        gr.HTML("""
""")
    # Event Handlers
    # Each handler first updates the table, then refreshes the paper counter
    # from the table's new value.
    # Search Button Click
    search_button.click(
        fn=perform_search,
        inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve, sort_radio],
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )
    # Sort Radio Change
    sort_radio.change(
        fn=change_sort_method,
        inputs=[sort_radio],
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )
    # Pagination Buttons
    prev_button.click(
        fn=_go_prev_page,
        inputs=None,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )
    next_button.click(
        fn=_go_next_page,
        inputs=None,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )
    # Initial Load
    demo.load(
        fn=get_initial_papers,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )
# --- Launch the App ---
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()