#!/usr/bin/env python
"""Hacker News-style Gradio front end for the Hugging Face Daily Papers list.

The module loads two datasets, merges them on ``arxiv_id``, and serves a
paginated list of papers that can be sorted by "hot" (upvotes decayed by age)
or "new" (date). A background scheduler reloads the data every hour.
"""

import atexit  # To gracefully shut down the scheduler
import datetime
import html
import logging  # For logging purposes
from datetime import timezone

import datasets  # Ensure the datasets library is imported
import gradio as gr
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# --- Logging Configuration ---

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Data Loading and Processing ---

api = HfApi()

DATE_FORMAT = "%Y-%m-%d"


def _today_str() -> str:
    """Return the current UTC date formatted as ``YYYY-MM-DD``."""
    return datetime.datetime.now(timezone.utc).strftime(DATE_FORMAT)


def _safe_int(value, default: int = 0) -> int:
    """Convert *value* to ``int``, returning *default* for None/NaN/garbage."""
    try:
        if pd.isna(value):
            return default
        return int(value)
    except (TypeError, ValueError):
        return default


def get_df() -> pd.DataFrame:
    """
    Loads and merges the papers and stats datasets, preprocesses the data by
    removing unnecessary columns, and adds a 'paper_page' link for each paper.

    Returns:
        The merged DataFrame with the latest papers first, or an empty
        DataFrame (no columns) if loading or processing fails.
    """
    try:
        # Load datasets
        logger.info("Loading 'daily-papers' dataset.")
        df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()

        logger.info("Loading 'daily-papers-stats' dataset.")
        df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

        # Merge datasets on 'arxiv_id'
        logger.info("Merging datasets on 'arxiv_id'.")
        df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=("_papers", "_stats"))

        # Reverse the DataFrame to have the latest papers first
        df = df[::-1].reset_index(drop=True)

        # The existence check must come BEFORE the column is touched; otherwise
        # a missing column raises KeyError and the fallback can never run.
        if "date" in df.columns:
            # Ensure 'date' is in datetime format and handle missing dates
            logger.info("Processing 'date' column.")
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            df["date"] = df["date"].dt.strftime(DATE_FORMAT).fillna(_today_str())
        else:
            logger.error("'date' column is missing from the DataFrame. Filling with current date.")
            df["date"] = _today_str()

        # Prepare the DataFrame by removing 'abstract'
        logger.info("Removing 'abstract' column if present.")
        df = df.drop(columns=["abstract"], errors="ignore")

        # Add 'paper_page' links
        logger.info("Adding 'paper_page' links.")
        df["paper_page"] = df["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")

        logger.info("DataFrame preparation complete.")
        return df
    except Exception as e:
        # Top-level boundary: keep the app alive, but record the traceback.
        logger.exception("Error in get_df: %s", e)
        return pd.DataFrame()  # Return empty DataFrame on error


class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.
    """

    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return an HTML link labelled "github", or "" when no URL is given."""
        # Missing values arrive from pandas as NaN/None; NaN is truthy, so a
        # plain `not link` check would turn it into a link to "nan".
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor showing *text* and pointing at *url*."""
        safe_url = html.escape(str(url), quote=True)
        safe_text = html.escape(str(text))
        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the display DataFrame (columns = REQUIRED_COLUMNS) from *df*."""
        new_rows = []
        for _, row in df.iterrows():
            date_value = row.get("date", "")
            # Handle date_display as a clickable link
            date_display = Prettifier.create_link(
                date_value, f"https://huggingface.co/papers?date={date_value}"
            )

            new_rows.append(
                {
                    "arxiv_id": row.get("arxiv_id", ""),  # Include arxiv_id
                    "date_display": date_display,  # For display
                    "date": row.get("date", _today_str()),  # For internal calculations
                    "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
                    "title": row.get("title", "No title"),
                    "github": Prettifier.get_github_link(row.get("github", "")),
                    "👍": row.get("upvotes", 0),
                    "💬": row.get("num_comments", 0),
                }
            )

        # If no rows, return empty DataFrame with required columns to prevent KeyError
        if not new_rows:
            return pd.DataFrame(columns=self.REQUIRED_COLUMNS)

        return pd.DataFrame(new_rows)


class PaperList:
    """
    Manages the list of papers: the raw DataFrame and its prettified view.
    """

    COLUMN_INFO = [
        ["arxiv_id", "str"],  # Added arxiv_id
        ["date_display", "markdown"],  # For display
        ["date", "str"],  # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names, in display order."""
        return [col[0] for col in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels, parallel to ``column_names``."""
        return [col[1] for col in self.COLUMN_INFO]

    def get_prettified_df(self) -> pd.DataFrame:
        """
        Returns the prettified DataFrame.
        """
        return self.df_prettified


# --- Sorting and Pagination Management ---


class PaperManager:
    """
    Manages sorting and pagination for the list of papers.

    NOTE: the app uses a single module-level instance, so the current page and
    sort method are shared by every browser session.
    """

    def __init__(self, paper_list: PaperList, papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"  # Default sort method
        self.current_page = 1
        self.total_pages = 1
        self.sort_papers()  # Recomputes 'current_page' and 'total_pages'

    @staticmethod
    def _parse_date(date_str) -> datetime.datetime:
        """Parse ``YYYY-MM-DD`` as a UTC datetime; fall back to "now" on failure."""
        try:
            return datetime.datetime.strptime(date_str, DATE_FORMAT).replace(tzinfo=timezone.utc)
        except (ValueError, TypeError):
            # TypeError covers non-string values (None/NaN). Using the current
            # time minimizes the impact of a bad date on sorting.
            return datetime.datetime.now(timezone.utc)

    def calculate_score(self, row):
        """
        Calculate the score of a paper based on upvotes and age.
        This mimics the "hotness" algorithm used by platforms like Hacker News.

        Args:
            row: A row of the raw DataFrame; reads 'upvotes' and 'date'.

        Returns:
            ``upvotes / (age_in_hours + 2) ** 1.5``, or 0 when the base is not
            positive (a date far enough in the future).
        """
        upvotes = _safe_int(row.get("upvotes", 0))
        published_time = self._parse_date(row.get("date", _today_str()))

        time_diff = datetime.datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours

        # A non-positive base would give a complex number or a division by
        # zero, so such rows score 0.
        base = time_diff_hours + 2
        return upvotes / (base**1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Sorts the papers based on the current sort method, rebuilds the
        prettified view, and resets pagination to the first page.
        """
        df = self.paper_list.df_raw.copy()

        if df.empty:
            # get_df() returns a column-less frame on failure; sorting it by
            # 'date' or 'score' would raise KeyError.
            df_sorted = df
        elif self.sort_method == "hot":
            df = df.drop(columns=["score"], errors="ignore")  # Remove existing 'score' column if present
            df["score"] = df.apply(self.calculate_score, axis=1)
            # mergesort is stable, so ties keep their incoming order.
            df_sorted = df.sort_values(by="score", ascending=False, kind="mergesort").drop(columns=["score"])
        elif self.sort_method == "new" and "date" in df.columns:
            df_sorted = df.sort_values(by="date", ascending=False, kind="mergesort")  # Sort by 'date'
        else:
            df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[
            :, self.paper_list.column_names
        ]
        # Ceiling division, with a minimum of one page.
        self.total_pages = max(
            (len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1
        )
        self.current_page = 1
        logger.info("Papers sorted by %s. Total pages: %s", self.sort_method, self.total_pages)

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new') and re-sorts the papers.

        Unknown methods fall back to 'hot'. ``time_frame`` is accepted but
        not used. Always returns True.
        """
        if method not in ["hot", "new"]:
            method = "hot"
        logger.info("Setting sort method to: %s", method)
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]

        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"

        # Rank by position. The index labels of an iloc slice are already
        # absolute, so adding `start` to them would count the offset twice.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return (
            '<table border="0" cellpadding="0" cellspacing="0" class="itemlist">'
            f"{papers_html}"
            "</table>"
        )

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        Args:
            row: A row of the prettified DataFrame.
            rank: 1-based position of the paper in the full sorted list.
        """
        # Titles and ids come from the dataset, so escape them before they
        # are embedded in markup.
        title = html.escape(str(row.get("title", "No title")))
        paper_id = row.get("arxiv_id", "")
        url = html.escape(f"https://huggingface.co/papers/{paper_id}", quote=True)
        upvotes = _safe_int(row.get("👍", 0))
        comments = _safe_int(row.get("💬", 0))

        published_time = self._parse_date(row.get("date", _today_str()))
        time_ago_days = (datetime.datetime.now(timezone.utc) - published_time).days
        if time_ago_days > 0:
            unit = "day" if time_ago_days == 1 else "days"
            time_ago = f"{time_ago_days} {unit} ago"
        else:
            time_ago = "today"

        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                {time_ago} | {comments} comments
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
            logger.info("Navigated to page %s.", self.current_page)
        else:
            logger.info("Already on the last page.")
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible.
        """
        if self.current_page > 1:
            self.current_page -= 1
            logger.info("Navigated to page %s.", self.current_page)
        else:
            logger.info("Already on the first page.")
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Refreshes the current list of papers (re-sorts and returns page 1).
        """
        logger.info("Refreshing papers.")
        self.sort_papers()
        return self.get_current_page_papers()


# Initialize PaperList and PaperManager
def initialize_paper_manager() -> str:
    """
    Initializes the PaperList and PaperManager with the current DataFrame.

    Returns the first page as an HTML string, not the manager object.
    """
    df = get_df()
    if df.empty:
        logger.warning("Initialized with an empty DataFrame.")
    paper_list = PaperList(df)
    manager = PaperManager(paper_list)
    logger.info("PaperManager initialized.")
    return manager.get_current_page_papers()  # Return HTML string instead of the manager object


paper_manager = None  # Initialize globally


def setup_paper_manager():
    """
    Sets up the global PaperManager instance.
    """
    global paper_manager
    df = get_df()
    paper_list = PaperList(df)
    paper_manager = PaperManager(paper_list)
    logger.info("PaperManager setup complete.")


# Initialize PaperManager at the start
setup_paper_manager()


def update_paper_manager() -> str:
    """
    Updates the global PaperManager with the latest DataFrame.

    If the reload yields an empty DataFrame, the existing data is kept.
    Returns the current page as an HTML string.
    """
    global paper_manager
    logger.info("Updating PaperManager with latest data.")
    df = get_df()
    if df.empty:
        logger.warning("DataFrame is empty. Skipping update.")
        return paper_manager.get_current_page_papers()

    paper_manager.paper_list = PaperList(df)
    paper_manager.sort_papers()
    logger.info("PaperManager updated successfully.")
    return paper_manager.get_current_page_papers()


# Scheduler for updating paper list every hour
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    func=update_paper_manager,
    trigger="cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
logger.info("BackgroundScheduler started.")

# Ensure the scheduler shuts down gracefully on exit
atexit.register(lambda: scheduler_data.shutdown())
logger.info("Scheduler shutdown registered.")

# --- Gradio Interface Functions ---


def change_sort_method_ui(method: str) -> str:
    """
    Changes the sort method based on user selection.

    Returns the first page of the re-sorted list as HTML, or an error
    message if the manager reports failure.
    """
    logger.info("Changing sort method to: %s", method)
    success = paper_manager.set_sort_method(method.lower())
    if success:
        return paper_manager.get_current_page_papers()
    return "<div class='no-papers'>Failed to change sort method.</div>"


# --- CSS Styling ---

css = """
/* Hacker News-like CSS */
body {
    background-color: white;
    font-family: Verdana, Geneva, sans-serif;
    margin: 0;
    padding: 0;
}
a {
    color: #0000ff;
    text-decoration: none;
}
a:visited {
    color: #551A8B;
}
.container {
    width: 85%;
    margin: auto;
}
table {
    width: 100%;
}
.header-table {
    width: 100%;
    background-color: #ff6600;
    padding: 2px 10px;
}
.header-table a {
    color: black;
    font-weight: bold;
    font-size: 14pt;
    text-decoration: none;
}
.header-table .sort-buttons button {
    background: none;
    border: none;
    color: #0000ff;
    cursor: pointer;
    font-size: 14pt;
    text-decoration: underline;
    padding: 0 10px;
}
.header-table .sort-buttons button:hover {
    color: #551A8B;
}
.itemlist .athing {
    background-color: #f6f6ef;
}
.rank {
    font-size: 14pt;
    color: #828282;
    padding-right: 5px;
}
.storylink {
    font-size: 10pt;
}
.subtext {
    font-size: 8pt;
    color: #828282;
    padding-left: 40px;
}
.subtext a {
    color: #828282;
    text-decoration: none;
}
.no-papers {
    text-align: center;
    color: #828282;
    padding: 1rem;
    font-size: 14pt;
}
@media (max-width: 640px) {
    .header-table a {
        font-size: 12pt;
    }
    .sort-buttons button {
        font-size: 12pt;
        padding: 0 5px;
    }
    .storylink {
        font-size: 9pt;
    }
    .subtext {
        font-size: 7pt;
    }
}
/* Dark mode */
@media (prefers-color-scheme: dark) {
    body {
        background-color: #121212;
        color: #e0e0e0;
    }
    a {
        color: #add8e6;
    }
    a:visited {
        color: #9370db;
    }
    .header-table {
        background-color: #ff6600;
    }
    .header-table a {
        color: black;
    }
    .header-table .sort-buttons button {
        color: #add8e6;
    }
    .header-table .sort-buttons button:hover {
        color: #9370db;
    }
    .itemlist .athing {
        background-color: #1e1e1e;
    }
    .rank {
        color: #b0b0b0;
    }
    .subtext {
        color: #b0b0b0;
    }
    .subtext a {
        color: #b0b0b0;
    }
    .no-papers {
        color: #b0b0b0;
    }
}
"""

# --- Initialize Gradio Blocks ---

demo = gr.Blocks(css=css)

with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
**Submit the paper to Daily Papers:**
[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

Once your paper is submitted, it will automatically appear in this demo.
""")

        # Hacker News-like Header with "Hot" and "New" sort options
        with gr.Row():
            # Left side: Site title
            gr.Markdown("""
Daily Papers
""", show_label=False)

            # Right side: Gradio Buttons for "Hot" and "New"
            with gr.Column(elem_classes=["sort-buttons"]):
                hot_button = gr.Button("Hot", elem_id="hot_button")
                new_button = gr.Button("New", elem_id="new_button")

        # Paper list
        paper_list = gr.HTML()

        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

    # The handlers are lambdas so that `paper_manager` is looked up at call time.

    # Load papers on app start
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list],
    )

    # Button clicks for pagination
    prev_button.click(
        fn=lambda: paper_manager.prev_page(),
        inputs=[],
        outputs=[paper_list],
    )
    next_button.click(
        fn=lambda: paper_manager.next_page(),
        inputs=[],
        outputs=[paper_list],
    )

    # Gradio Buttons trigger sort methods directly
    hot_button.click(
        fn=lambda: change_sort_method_ui("hot"),
        inputs=[],
        outputs=[paper_list],
    )
    new_button.click(
        fn=lambda: change_sort_method_ui("new"),
        inputs=[],
        outputs=[paper_list],
    )

# --- Launch the App ---

if __name__ == "__main__":
    demo.launch()