akhaliq's picture
akhaliq HF staff
Update app.py
6790790 verified
raw
history blame
21 kB
#!/usr/bin/env python
import datetime
import operator
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
import gradio as gr
from gradio_calendar import Calendar
import datasets
import requests
from datetime import timezone # Ensure timezone is imported
# --- Data Loading and Processing ---
# Shared Hugging Face Hub client, created once at import time.
# NOTE(review): no use of `api` is visible in this file — confirm it is still needed.
api = HfApi()
def get_df() -> pd.DataFrame:
    """
    Load the daily-papers and daily-papers-stats datasets and merge them into one table.

    The two datasets are inner-joined on 'arxiv_id', reversed (the code assumes the
    datasets are stored oldest-first, so reversing puts the latest papers first),
    and post-processed:

    * 'date' is normalised to a "%Y-%m-%d" string; values that cannot be parsed
      fall back to today's UTC date.
    * the 'abstract' column is dropped when present.
    * a 'paper_page' column with the Hugging Face paper URL is added.

    Returns:
        The prepared DataFrame, one row per paper.
    """
    # Load datasets
    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

    # Merge datasets on 'arxiv_id'
    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")

    # Reverse the DataFrame to have the latest papers first
    df = df[::-1].reset_index(drop=True)

    # Ensure 'date' is a "%Y-%m-%d" string; unparseable dates become today's UTC date
    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
    parsed_dates = pd.to_datetime(df["date"], errors="coerce")
    df["date"] = parsed_dates.dt.strftime("%Y-%m-%d").fillna(today)

    # Drop 'abstract' in one vectorised step. Rebuilding the frame row by row was
    # slow and lost every column when the merge produced no rows.
    df_prepared = df.drop(columns=["abstract"], errors="ignore")

    # Add 'paper_page' links
    df_prepared["paper_page"] = "https://huggingface.co/papers/" + df_prepared["arxiv_id"].astype(str)
    return df_prepared
class Prettifier:
    """
    Converts raw paper rows into the display-oriented table used by the UI.

    Calling an instance with a raw DataFrame returns a new DataFrame whose
    columns are listed in `_COLUMNS`; link-like fields are rendered as HTML
    anchors.
    """

    # Output columns, in order. Declared up front so that an empty input still
    # yields a frame with these columns (callers select columns by name).
    # The last two labels are the upvote / comment count columns and must stay
    # byte-identical to the labels used by the rest of the file.
    _COLUMNS = [
        "arxiv_id",
        "date_display",
        "date",
        "paper_page",
        "title",
        "github",
        "๐Ÿ‘",
        "๐Ÿ’ฌ",
    ]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return an anchor labelled "github" for `link`, or "" when there is none.

        Missing values coming out of pandas (None / NaN) are treated as "no link".
        """
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor showing `text` that opens `url` in a new tab."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the prettified table for `df`.

        Args:
            df: Raw papers table. Missing fields fall back to defaults
                ("" / "No title" / 0, and today's UTC date for 'date').

        Returns:
            A DataFrame with one row per input row and the columns in `_COLUMNS`.
        """
        today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
        rows = []
        for _, row in df.iterrows():
            date = row.get("date", today)
            arxiv_id = row.get("arxiv_id", "")
            # Values are listed in the same order as _COLUMNS.
            rows.append(
                (
                    arxiv_id,  # arxiv_id
                    Prettifier.create_link(date, f"https://huggingface.co/papers?date={date}"),  # date_display
                    date,  # date (kept as plain text for internal calculations)
                    Prettifier.create_link(arxiv_id, row.get("paper_page", "#")),  # paper_page
                    row.get("title", "No title"),  # title
                    Prettifier.get_github_link(row.get("github", "")),  # github
                    row.get("upvotes", 0),  # upvote count
                    row.get("num_comments", 0),  # comment count
                )
            )
        return pd.DataFrame(rows, columns=self._COLUMNS)
class PaperList:
    """
    Holds the raw papers table together with its prettified (display) version
    and offers a title search over the raw data.
    """

    # (column name, display datatype) pairs for the prettified table.
    COLUMN_INFO = [
        ["arxiv_id", "str"],  # Added arxiv_id
        ["date_display", "markdown"],  # For display
        ["date", "str"],  # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["๐Ÿ‘", "number"],
        ["๐Ÿ’ฌ", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store `df` as the raw table and compute its prettified counterpart."""
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettify(df)

    @property
    def column_names(self):
        """Names of the prettified columns, in display order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self):
        """Display datatypes of the prettified columns, aligned with `column_names`."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def _prettify(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prettify `df` and return exactly the columns in `column_names`.

        `reindex` is used instead of label selection so that a prettifier result
        without columns (e.g. built from zero rows) becomes an empty table with
        the expected columns rather than raising KeyError.
        """
        return self._prettifier(df).reindex(columns=self.column_names)

    def search(
        self,
        title_search_query: str,
        max_num_to_retrieve: int = 1000,  # Set a high default to include all if not specified
    ) -> pd.DataFrame:
        """
        Return the prettified papers whose title contains the query.

        Args:
            title_search_query: Text to look for in titles, matched literally and
                case-insensitively. An empty query matches every paper.
            max_num_to_retrieve: Maximum number of rows to return; a falsy value
                (0 / None) disables the limit.

        Returns:
            The prettified DataFrame of matching papers (possibly empty).
        """
        df = self.df_raw.copy()

        # Filter by title if search query is provided. regex=False keeps user
        # input such as "C++" or "(" from being parsed as a regular expression.
        if title_search_query:
            matches = df["title"].str.contains(title_search_query, case=False, na=False, regex=False)
            df = df[matches]

        # Limit the number of papers to retrieve if max_num_to_retrieve is set
        if max_num_to_retrieve:
            df = df.head(max_num_to_retrieve)

        # Prettify the DataFrame
        return self._prettify(df)
# --- Sorting and Pagination Management ---
class PaperManager:
    """
    Manages sorting, pagination, and search queries for the list of papers.

    The manager keeps the unfiltered table of the attached paper list and, on
    every `sort_papers()` call, derives the filtered and sorted view from it.
    The view is published through `paper_list.df_raw` / `paper_list.df_prettified`,
    so clearing a search or widening the "top" time frame brings papers back.
    """

    # Prettified-table labels for the upvote / comment count columns. They must
    # stay byte-identical to the labels the prettifier emits.
    _UPVOTES_LABEL = "๐Ÿ‘"
    _COMMENTS_LABEL = "๐Ÿ’ฌ"

    # Look-back window for each "top" time frame; None (also used for unknown
    # names) means the papers are not filtered by date.
    _TOP_WINDOWS = {
        "day": datetime.timedelta(days=1),
        "week": datetime.timedelta(weeks=1),
        "month": datetime.timedelta(days=30),
        "year": datetime.timedelta(days=365),
        "all time": None,
    }

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        """
        Args:
            paper_list: Object exposing `df_raw`, `_prettifier` and `column_names`.
            papers_per_page: Number of papers rendered per page.
        """
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"  # Default sort method
        self.current_search_query = ""  # Initialize with no search query
        self.top_time_frame = "all time"  # Default time frame for "Top" sorting
        self._source_list = None  # paper list the cached unfiltered table belongs to
        self._source_df = None  # unfiltered table of that paper list
        self.sort_papers()
        # 'current_page' and 'total_pages' are set in 'sort_papers()'

    def _unfiltered_df(self) -> pd.DataFrame:
        """Return the full, unfiltered table of the current paper list.

        The table is captured the first time a given paper-list object is seen,
        i.e. before `sort_papers()` replaces its `df_raw` with a filtered view.
        Assigning a new object to `self.paper_list` is detected by identity.
        """
        if getattr(self, "_source_list", None) is not self.paper_list:
            self._source_list = self.paper_list
            self._source_df = self.paper_list.df_raw
        return self._source_df

    @staticmethod
    def _published_time(row) -> datetime.datetime:
        """Parse the row's 'date' ("%Y-%m-%d") as a UTC datetime.

        A missing date defaults to today; a value that cannot be parsed yields
        the current time, which minimises its impact on sorting.
        """
        now = datetime.datetime.now(timezone.utc)
        date_str = row.get("date", now.strftime("%Y-%m-%d"))
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (TypeError, ValueError):
            return now

    def calculate_score(self, row):
        """
        Calculate the "hotness" of a paper from its upvotes and age.

        The score is upvotes / (age_in_hours + 2) ** 1.5, mimicking the ranking
        used by platforms like Hacker News.

        Args:
            row: A row of the raw table (upvotes under 'upvotes'). A prettified
                row (upvotes under the display label) is accepted as well.

        Returns:
            The score; 0 when the date lies more than two hours in the future.
        """
        # The raw table stores the count under 'upvotes'; reading only the
        # display label made every score 0 and turned "hot" into a no-op.
        upvotes = row.get("upvotes", row.get(self._UPVOTES_LABEL, 0))
        if pd.isna(upvotes):
            upvotes = 0
        age = datetime.datetime.now(timezone.utc) - self._published_time(row)
        base = age.total_seconds() / 3600 + 2  # age in hours, offset as in the formula
        # A non-positive base would divide by zero or produce a complex number
        return upvotes / (base ** 1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Rebuild the visible paper list from the unfiltered table.

        Applies the current search query (literal, case-insensitive title match),
        then the current sort method, stores the result in the paper list,
        recomputes `total_pages` and resets `current_page` to 1.
        """
        df = self._unfiltered_df()

        # Apply search filter if a search query exists. regex=False keeps input
        # such as "C++" from being interpreted as a regular expression.
        if self.current_search_query:
            matches = df["title"].str.contains(self.current_search_query, case=False, na=False, regex=False)
            df = df[matches]

        if self.sort_method == "hot":
            scores = [self.calculate_score(row) for _, row in df.iterrows()]
            df_sorted = df.assign(score=scores).sort_values(by="score", ascending=False).drop(columns=["score"])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by="date", ascending=False)  # Sort by 'date'
        elif self.sort_method == "top":
            df_sorted = df
            window = self._TOP_WINDOWS.get(self.top_time_frame)
            if window is not None:
                # Keep only papers dated inside the selected time frame
                time_threshold = datetime.datetime.now(timezone.utc) - window
                date_parsed = pd.to_datetime(df["date"], errors="coerce", utc=True)
                df_sorted = df[date_parsed >= time_threshold]
            df_sorted = df_sorted.sort_values(by="upvotes", ascending=False)
        else:
            df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        # reindex (rather than label selection) also copes with a prettifier
        # result that has no columns because there were no rows.
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).reindex(
            columns=self.paper_list.column_names
        )
        self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
        Unknown methods fall back to 'hot'. If 'top' is selected and a time
        frame is given, the time frame is stored (lower-cased) as well.
        """
        if method not in ["hot", "new", "top"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        if method == "top" and time_frame:
            self.top_time_frame = time_frame.lower()
            print(f"Setting top time frame to: {self.top_time_frame}")
        self.sort_papers()
        return True  # Assume success

    def set_search_query(self, query: str):
        """
        Sets the current search query and re-sorts the papers.
        """
        print(f"Setting search query to: {query}")
        self.current_search_query = query
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]
        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"
        # Rank is the 1-based position in the whole list. The index labels
        # already include the page offset, so count positions instead of
        # adding `start` to the label.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return f"""
<table border="0" cellpadding="0" cellspacing="0" class="itemlist">
{papers_html}
</table>
"""

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        Args:
            row: A row of the prettified table.
            rank: 1-based position shown in front of the title.
        """
        import html  # local import: only needed for escaping the title

        # Titles come from external data; escape them before embedding in HTML
        title = html.escape(str(row.get('title', 'No title')))
        paper_id = row.get('arxiv_id', '')
        url = f"https://huggingface.co/papers/{paper_id}"
        upvotes = row.get(self._UPVOTES_LABEL, 0)
        comments = row.get(self._COMMENTS_LABEL, 0)
        time_diff = datetime.datetime.now(timezone.utc) - self._published_time(row)
        time_ago_days = time_diff.days
        time_ago = f"{time_ago_days} days ago" if time_ago_days > 0 else "today"
        return f"""
<tr class="athing">
<td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
<td valign="top" class="title">
<a href="{url}" class="storylink" target="_blank">{title}</a>
</td>
</tr>
<tr>
<td colspan="1"></td>
<td class="subtext">
<span class="score">{upvotes} upvotes</span><br>
{time_ago} | <a href="#">{comments} comments</a>
</td>
</tr>
<tr style="height:5px"></tr>
"""

    def next_page(self) -> str:
        """
        Navigates to the next page if possible and returns the page's HTML.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible and returns the page's HTML.
        """
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Re-applies the current search and sort settings and returns the first page.
        """
        self.sort_papers()
        return self.get_current_page_papers()
# Initialize PaperList and PaperManager
def initialize_paper_manager() -> str:
    """
    Build a throw-away PaperManager from freshly loaded data and render it.

    The module-level `paper_manager` is not touched; only the HTML of the
    manager's current page is returned (an HTML string, not the manager object).
    """
    fresh_manager = PaperManager(PaperList(get_df()))
    return fresh_manager.get_current_page_papers()
# Global PaperManager shared by the UI callbacks and the scheduler job;
# populated by setup_paper_manager() below.
paper_manager = None


def setup_paper_manager():
    """
    Create the global PaperManager from freshly loaded data.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))


# Populate the global manager once at import time
setup_paper_manager()
def update_paper_manager() -> str:
    """
    Reload the data into the existing global PaperManager.

    The manager object itself is kept (only its `paper_list` is replaced), the
    papers are re-sorted, and the HTML of the resulting current page is returned.
    """
    # The global is only mutated, never rebound, so no `global` statement is needed.
    paper_manager.paper_list = PaperList(get_df())
    paper_manager.sort_papers()
    refreshed_html = paper_manager.get_current_page_papers()
    return refreshed_html
# Background job: reload the paper data at minute 0 of every hour (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_manager,
    trigger="cron",
    timezone="UTC",
    minute=0,  # Every hour at minute 0
    misfire_grace_time=60,
)
scheduler_data.start()
# --- Gradio Interface Functions ---
def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
    """
    Apply the selected sort method and return the re-rendered paper list.

    The method name is lower-cased before it is passed on. The time frame is
    forwarded only when the method is 'top'.
    """
    chosen = method.lower()
    extra_args = (time_frame,) if chosen == "top" else ()
    paper_manager.set_sort_method(chosen, *extra_args)
    return paper_manager.get_current_page_papers()
def refresh_papers_ui() -> str:
    """
    Ask the global paper manager to refresh and return the resulting HTML.
    """
    refreshed_html = paper_manager.refresh()
    return refreshed_html
def search_papers_ui(query: str) -> str:
    """
    Set `query` as the title search on the global paper manager and return
    the HTML of the resulting current page.
    """
    manager = paper_manager
    manager.set_search_query(query)
    return manager.get_current_page_papers()
def clear_search_ui() -> str:
    """
    Reset the title search to the empty query and return the HTML of the
    resulting current page.
    """
    manager = paper_manager
    manager.set_search_query("")
    return manager.get_current_page_papers()
# --- CSS Styling ---
# Stylesheet passed to gr.Blocks(css=...): a Hacker-News-like look for the
# header table, the paper list (.itemlist / .athing / .rank / .storylink /
# .subtext), the refresh button and the empty-page notice, plus overrides for
# narrow screens and for dark mode.
css = """
/* Existing CSS remains unchanged */
body {
background-color: white;
font-family: Verdana, Geneva, sans-serif;
margin: 0;
padding: 0;
}
a {
color: #0000ff;
text-decoration: none;
}
a:visited {
color: #551A8B;
}
.container {
width: 85%;
margin: auto;
}
table {
width: 100%;
}
.header-table {
width: 100%;
background-color: #ff6600;
padding: 2px 10px;
}
.header-table a {
color: black;
font-weight: bold;
font-size: 14pt;
text-decoration: none;
}
.itemlist .athing {
background-color: #f6f6ef;
}
.rank {
font-size: 14pt;
color: #828282;
padding-right: 5px;
}
.storylink {
font-size: 10pt;
}
.subtext {
font-size: 8pt;
color: #828282;
padding-left: 40px;
}
.subtext a {
color: #828282;
text-decoration: none;
}
#refresh-button {
background: none;
border: none;
color: black;
font-weight: bold;
font-size: 14pt;
cursor: pointer;
}
.no-papers {
text-align: center;
color: #828282;
padding: 1rem;
font-size: 14pt;
}
@media (max-width: 640px) {
.header-table a {
font-size: 12pt;
}
.storylink {
font-size: 9pt;
}
.subtext {
font-size: 7pt;
}
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
body {
background-color: #121212;
color: #e0e0e0;
}
a {
color: #add8e6;
}
a:visited {
color: #9370db;
}
.header-table {
background-color: #ff6600;
}
.header-table a {
color: black;
}
.itemlist .athing {
background-color: #1e1e1e;
}
.rank {
color: #b0b0b0;
}
.subtext {
color: #b0b0b0;
}
.subtext a {
color: #b0b0b0;
}
#refresh-button {
color: #e0e0e0;
}
.no-papers {
color: #b0b0b0;
}
}
"""
# --- Initialize Gradio Blocks ---
# Builds the page (instructions, header, search, sort controls, paper list,
# pagination, footer), wires the events to the global `paper_manager`, and
# launches the app when run as a script.
demo = gr.Blocks(css=css)
with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)
Once your paper is submitted, it will automatically appear in this demo.
""")

        # Header with Refresh Button
        with gr.Row():
            gr.HTML("""
<table border="0" cellpadding="0" cellspacing="0" class="header-table">
<tr>
<td>
<span class="pagetop">
<b class="hnname"><a href="#">Daily Papers</a></b>
</span>
</td>
<td align="right">
<button id="refresh-button">Refresh</button>
</td>
</tr>
</table>
""")

        # Search Bar and Clear Search Button
        with gr.Row():
            search_box = gr.Textbox(
                label="Search Papers by Title",
                placeholder="Enter keywords to search...",
                lines=1,
                interactive=True
            )
            search_button = gr.Button("Search")
            clear_search_button = gr.Button("Clear Search")

        # Sort Options and Time Frame (shown only while "Top" is selected)
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New", "Top"],
                value="Hot",
                label="Sort By",
                interactive=True
            )
            time_frame_dropdown = gr.Dropdown(
                choices=["day", "week", "month", "year", "all time"],
                value="all time",
                label="Time Frame for Top",
                visible=False,
                interactive=True
            )

        # Paper list
        paper_list = gr.HTML()

        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

    # Load papers on app start
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list]
    )

    # Button clicks for pagination
    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
    next_button.click(paper_manager.next_page, outputs=[paper_list])

    # Refresh functionality
    refresh_button = gr.Button("Refresh", visible=False, elem_id="refresh-hidden")
    refresh_button.click(update_paper_manager, outputs=[paper_list])

    # Bind the visible Refresh button to the hidden one using JavaScript
    # NOTE(review): browsers do not run <script> elements inserted as HTML
    # content, so this binding may never execute — verify against the Gradio
    # version in use and consider a regular gr.Button for refreshing instead.
    gr.HTML("""
<script>
document.getElementById('refresh-button').addEventListener('click', function() {
document.getElementById('refresh-hidden').click();
});
</script>
""")

    # Sort option change: show the time-frame dropdown only for "Top".
    # Done with a component update instead of client-side JavaScript: the old
    # script compared against 'top' while the radio emits "Top" and looked for
    # a [label=...] attribute, so the dropdown could never become visible.
    sort_radio.change(
        fn=lambda method: gr.update(visible=method.lower() == "top"),
        inputs=[sort_radio],
        outputs=[time_frame_dropdown]
    )
    sort_radio.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list]
    )

    # Re-sort when the time frame changes (it only takes effect for "Top")
    time_frame_dropdown.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list]
    )

    # Search functionality
    search_button.click(
        fn=search_papers_ui,
        inputs=[search_box],
        outputs=[paper_list]
    )

    # Clear search functionality
    clear_search_button.click(
        fn=clear_search_ui,
        inputs=None,
        outputs=[paper_list]
    )

    # Footer
    gr.Markdown("""
Related useful Spaces:
- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
""")

# --- Launch the App ---
if __name__ == "__main__":
    demo.launch()