#!/usr/bin/env python
import datetime
import html
from datetime import timezone  # Ensure timezone is imported

import datasets  # Added import for datasets
import gradio as gr
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
# --- Data Loading and Processing ---
# Module-level Hugging Face Hub client.
# NOTE(review): no use of `api` is visible in this file -- confirm it is still needed.
api = HfApi()
def get_df() -> pd.DataFrame:
    """
    Load the papers and stats datasets and merge them into one DataFrame.

    Steps:
      * inner-merge the two datasets on 'arxiv_id';
      * reverse the row order (the original author's comment says this puts
        the latest papers first -- that depends on the dataset's order);
      * normalise 'date' to "YYYY-MM-DD" strings, using today's UTC date for
        values that cannot be parsed;
      * drop the 'abstract' column when present;
      * add a 'paper_page' URL column built from 'arxiv_id'.

    Returns:
        The prepared DataFrame (one row per paper).
    """
    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=("_papers", "_stats"))
    df = df[::-1].reset_index(drop=True)

    # Unparseable dates become NaT, which strftime turns into NaN; those are
    # then replaced by today's date so downstream code always sees a string.
    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
    parsed_dates = pd.to_datetime(df["date"], errors="coerce")
    df["date"] = parsed_dates.dt.strftime("%Y-%m-%d").fillna(today)

    # Column-wise drop instead of copying every row in a Python loop; this
    # also keeps the column set intact when the frame is empty.
    df = df.drop(columns=["abstract"], errors="ignore")

    df["paper_page"] = "https://huggingface.co/papers/" + df["arxiv_id"].astype(str)
    return df
class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.

    Calling an instance with a raw DataFrame returns a new DataFrame whose
    columns are REQUIRED_COLUMNS, with several fields rendered as HTML links.
    """

    # NOTE(review): the last two keys look like mis-decoded emoji. They are
    # left untouched because PaperList.COLUMN_INFO and
    # PaperManager.format_paper use the very same keys.
    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "π", "π¬"]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return a "github" anchor for `link`, or "" when there is no link.

        Non-string values (None, or the NaN pandas uses for missing data)
        count as "no link"; NaN is truthy and would otherwise be rendered.
        """
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor showing `text` that opens `url` in a new tab."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the prettified DataFrame for `df`.

        Args:
            df: raw papers frame; missing fields fall back to the defaults
                used below.

        Returns:
            A DataFrame with the REQUIRED_COLUMNS columns; empty (but with
            those columns) when `df` has no rows.
        """
        new_rows = []
        for _, row in df.iterrows():
            # Clickable date that links to the daily page for that date.
            date_display = Prettifier.create_link(
                row.get("date", ""),
                f"https://huggingface.co/papers?date={row.get('date', '')}",
            )
            new_rows.append(
                {
                    "arxiv_id": row.get("arxiv_id", ""),
                    "date_display": date_display,  # For display
                    # Plain string kept for internal calculations.
                    "date": row.get("date", datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")),
                    "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
                    "title": row.get("title", "No title"),
                    "github": Prettifier.get_github_link(row.get("github", "")),
                    "π": row.get("upvotes", 0),
                    "π¬": row.get("num_comments", 0),
                }
            )
        # With no rows, still expose the expected columns so that column
        # selection by the caller does not raise KeyError.
        if not new_rows:
            return pd.DataFrame(columns=self.REQUIRED_COLUMNS)
        return pd.DataFrame(new_rows)
class PaperList:
    """
    Manages the list of papers.

    Holds the raw DataFrame (`df_raw`) and its prettified counterpart
    (`df_prettified`), restricted to the columns listed in COLUMN_INFO.
    """

    # [column name, datatype label] pairs, in display order.
    COLUMN_INFO = [
        ["arxiv_id", "str"],
        ["date_display", "markdown"],  # For display
        ["date", "str"],  # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["π", "number"],
        ["π¬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store `df` and build its prettified view."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    # Exposed as properties because they are read as plain attributes
    # (e.g. `.loc[:, self.column_names]`); as ordinary methods the indexer
    # would be handed a bound method instead of a list of names.
    @property
    def column_names(self):
        """Column names, in COLUMN_INFO order."""
        return [col[0] for col in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels, in COLUMN_INFO order."""
        return [col[1] for col in self.COLUMN_INFO]

    def get_prettified_df(self) -> pd.DataFrame:
        """
        Returns the prettified DataFrame.
        """
        return self.df_prettified
# --- Sorting and Pagination Management --- | |
class PaperManager:
    """
    Manages sorting, pagination, and HTML rendering for the list of papers.

    The manager reads `paper_list.df_raw` and never modifies it; each sort
    writes the resulting view to `paper_list.df_prettified`, which is what
    the pagination methods render.
    """

    # Look-back window for each "top" time frame. Any other value (including
    # "all time") means "no date filtering".
    _TOP_WINDOWS = {
        "day": datetime.timedelta(days=1),
        "week": datetime.timedelta(weeks=1),
        "month": datetime.timedelta(days=30),
        "year": datetime.timedelta(days=365),
    }

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        """
        Args:
            paper_list: object exposing `df_raw`, `_prettifier`,
                `column_names` and a writable `df_prettified`.
            papers_per_page: number of papers rendered per page.
        """
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"  # Default sort method
        self.top_time_frame = "all time"  # Default time frame for "Top" sorting
        # Also sets 'current_page' and 'total_pages'.
        self.sort_papers()

    def calculate_score(self, row):
        """
        Return the "hotness" of a paper: upvotes / (age_in_hours + 2) ** 1.5.

        `row` only needs a dict-like `.get`; it is read for 'upvotes' and
        'date' ("YYYY-MM-DD"). A missing or unparseable date is treated as
        "now" to minimise its impact on sorting.
        """
        now = datetime.datetime.now(timezone.utc)
        upvotes = row.get("upvotes", 0)
        date_str = row.get("date", now.strftime("%Y-%m-%d"))
        try:
            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            # TypeError covers non-string values such as None or NaN.
            published_time = now
        age_hours = (now - published_time).total_seconds() / 3600
        base = age_hours + 2
        # A date more than two hours in the future would give a non-positive
        # base; score such rows 0 instead of raising or going complex.
        return upvotes / (base ** 1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Rebuild the displayed view according to `sort_method`.

        Starts from the full `paper_list.df_raw` every time, so a time-frame
        filter applied by an earlier "top" sort never leaks into later sorts.
        Updates `paper_list.df_prettified` and `total_pages`, and resets
        `current_page` to 1.
        """
        df = self.paper_list.df_raw
        if self.sort_method == "hot":
            if df.empty:
                df_sorted = df
            else:
                # Use a temporary 'score' column (replacing any existing one).
                scored = df.drop(columns=["score"], errors="ignore").copy()
                scored["score"] = scored.apply(self.calculate_score, axis=1)
                df_sorted = scored.sort_values(by="score", ascending=False).drop(columns=["score"])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by="date", ascending=False)
        elif self.sort_method == "top":
            df_sorted = df
            window = self._TOP_WINDOWS.get(self.top_time_frame)
            if window is not None:
                threshold = datetime.datetime.now(timezone.utc) - window
                parsed = pd.to_datetime(df["date"], errors="coerce", utc=True)
                # Unparseable dates become NaT and compare False, so they
                # are excluded from a bounded time frame.
                df_sorted = df[parsed >= threshold]
            df_sorted = df_sorted.sort_values(by="upvotes", ascending=False)
        else:
            df_sorted = df

        df_sorted = df_sorted.reset_index(drop=True)

        columns = self.paper_list.column_names
        if callable(columns):
            # Tolerate a paper list that exposes column_names as a plain
            # method rather than an attribute/property.
            columns = columns()
        self.paper_list.df_prettified = self.paper_list._prettifier(df_sorted).loc[:, columns]

        # Ceiling division, with at least one (possibly empty) page.
        self.total_pages = max((len(df_sorted) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
        Unknown methods fall back to 'hot'. If 'top' is selected and a
        time frame is given, the time frame is stored (lower-cased) as well.

        Returns:
            True (always).
        """
        if method not in ["hot", "new", "top"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        if method == "top" and time_frame:
            self.top_time_frame = time_frame.lower()
            print(f"Setting top time frame to: {self.top_time_frame}")
        self.sort_papers()
        return True

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        page = self.paper_list.df_prettified.iloc[start:start + self.papers_per_page]
        if page.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"
        # Rank by position on the page; the frame's index labels are already
        # absolute, so adding `start` to them would count the offset twice.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(page.iterrows(), start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
            {papers_html}
        </table>
        """

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        Args:
            row: dict-like with `.get`, read for 'title', 'arxiv_id', 'date'
                and the upvote / comment count columns.
            rank: number shown in front of the title.
        """
        # Titles come from external data; escape them so characters such as
        # '<' or '&' cannot break (or inject into) the markup.
        title = html.escape(str(row.get("title", "No title")))
        paper_id = row.get("arxiv_id", "")
        url = html.escape(f"https://huggingface.co/papers/{paper_id}")
        upvotes = row.get("π", 0)
        comments = row.get("π¬", 0)
        now = datetime.datetime.now(timezone.utc)
        date_str = row.get("date", now.strftime("%Y-%m-%d"))
        try:
            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            published_time = now
        days = (now - published_time).days
        if days > 1:
            time_ago = f"{days} days ago"
        elif days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = "today"
        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                {time_ago} | <a href="#">{comments} comments</a>
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible and returns the page's HTML.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible and returns the page's HTML.
        """
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Re-sorts the papers (which returns to page 1) and returns that page's HTML.
        """
        self.sort_papers()
        return self.get_current_page_papers()
# Initialize PaperList and PaperManager | |
def initialize_paper_manager() -> str:
    """
    Build a fresh PaperList and PaperManager from the current DataFrame and
    return the HTML of the manager's current page.

    The manager created here is local and discarded; the global
    `paper_manager` is not touched.
    """
    manager = PaperManager(PaperList(get_df()))
    # Return the HTML string instead of the manager object.
    return manager.get_current_page_papers()
paper_manager = None  # Global PaperManager instance; assigned by setup_paper_manager()


def setup_paper_manager():
    """
    Sets up the global PaperManager instance from freshly loaded data.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))


# Initialize PaperManager at the start (runs at import time).
setup_paper_manager()
def update_paper_manager() -> str:
    """
    Reload the data into the global PaperManager and return the current page's HTML.

    The existing manager object is kept (only its `paper_list` is replaced),
    so its sort method and time frame carry over; `sort_papers()` then
    re-sorts the new data.
    """
    # The new PaperList is fully built before assignment, so a failure while
    # loading leaves the previous list in place.
    paper_manager.paper_list = PaperList(get_df())
    paper_manager.sort_papers()
    return paper_manager.get_current_page_papers()
# Scheduler for updating paper list every hour
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    func=update_paper_manager,
    trigger="cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
# --- Gradio Interface Functions --- | |
def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
    """
    Changes the sort method on the global PaperManager and returns the
    current page's HTML. The time frame is only forwarded when 'top' is
    selected.

    Args:
        method: sort method label (case-insensitive); an empty/None value
            falls back to "hot".
        time_frame: time frame for "top" sorting.
    """
    method = (method or "hot").lower()
    paper_manager.set_sort_method(method, time_frame if method == "top" else None)
    return paper_manager.get_current_page_papers()
# --- CSS Styling ---
# Stylesheet passed to gr.Blocks(css=...); includes a narrow-screen override
# and a dark-mode variant.
css = """
/* Existing CSS remains unchanged */
body {
    background-color: white;
    font-family: Verdana, Geneva, sans-serif;
    margin: 0;
    padding: 0;
}
a {
    color: #0000ff;
    text-decoration: none;
}
a:visited {
    color: #551A8B;
}
.container {
    width: 85%;
    margin: auto;
}
table {
    width: 100%;
}
.header-table {
    width: 100%;
    background-color: #ff6600;
    padding: 2px 10px;
}
.header-table a {
    color: black;
    font-weight: bold;
    font-size: 14pt;
    text-decoration: none;
}
.itemlist .athing {
    background-color: #f6f6ef;
}
.rank {
    font-size: 14pt;
    color: #828282;
    padding-right: 5px;
}
.storylink {
    font-size: 10pt;
}
.subtext {
    font-size: 8pt;
    color: #828282;
    padding-left: 40px;
}
.subtext a {
    color: #828282;
    text-decoration: none;
}
.no-papers {
    text-align: center;
    color: #828282;
    padding: 1rem;
    font-size: 14pt;
}
@media (max-width: 640px) {
    .header-table a {
        font-size: 12pt;
    }
    .storylink {
        font-size: 9pt;
    }
    .subtext {
        font-size: 7pt;
    }
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
    body {
        background-color: #121212;
        color: #e0e0e0;
    }
    a {
        color: #add8e6;
    }
    a:visited {
        color: #9370db;
    }
    .header-table {
        background-color: #ff6600;
    }
    .header-table a {
        color: black;
    }
    .itemlist .athing {
        background-color: #1e1e1e;
    }
    .rank {
        color: #b0b0b0;
    }
    .subtext {
        color: #b0b0b0;
    }
    .subtext a {
        color: #b0b0b0;
    }
    .no-papers {
        color: #b0b0b0;
    }
}
"""
# --- Initialize Gradio Blocks ---
# NOTE(review): the original nesting was lost with the indentation; the
# layout below (widgets inside the container column, event wiring and footer
# directly under `demo`) is a reconstruction -- confirm against the live app.
demo = gr.Blocks(css=css)
with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)
Once your paper is submitted, it will automatically appear in this demo.
""")
        # Header without Refresh Button
        with gr.Row():
            gr.Markdown("<b>Daily Papers</b>")
        # Sort Options and Time Frame (conditionally visible)
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New", "Top"],
                value="Hot",
                label="Sort By",
                interactive=True,
            )
            time_frame_dropdown = gr.Dropdown(
                choices=["day", "week", "month", "year", "all time"],
                value="all time",
                label="Time Frame for Top",
                visible=False,
                interactive=True,
            )
        # Paper list
        paper_list = gr.HTML()
        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

    # Load papers on app start. The lambda looks up the global at call time.
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list],
    )

    # Button clicks for pagination
    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
    next_button.click(paper_manager.next_page, outputs=[paper_list])

    # Sort option change: show the time frame dropdown only for "Top".
    sort_radio.change(
        fn=lambda method: gr.update(visible=method.lower() == "top"),
        inputs=[sort_radio],
        outputs=[time_frame_dropdown],
    )

    # Sort option change: apply sorting method with time frame if applicable.
    sort_radio.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list],
    )

    # Time frame change: re-apply the sort so that picking a new time frame
    # takes effect immediately instead of waiting for the radio to change.
    time_frame_dropdown.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list],
    )

    # Footer
    gr.Markdown("""
Related useful Spaces:
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
""")

# --- Launch the App ---
if __name__ == "__main__":
    demo.launch()