dailypapershackernews-dev

Running

File size: 18,479 Bytes

35b4c0e
 
 
 
 
 
 
 
cf55fa7
33ea647
35b4c0e
33ea647
 
 
 
 
 
 
91c14ed
35b4c0e
 
 
 
 
6790790
 
a7e2292
6790790
33ea647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bc512d
 
33ea647
 
 
2bc512d
33ea647
a7e2292
2bc512d
 
 
a7e2292
33ea647
2bc512d
33ea647
 
 
35b4c0e
 
 
6790790
 
 
a7e2292
a575839
35b4c0e
 
 
 
 
 
 
 
 
 
 
 
 
3e6fd58
a575839
3b04ee1
35b4c0e
a575839
 
6790790
a575839
6790790
a575839
6790790
 
35b4c0e
 
a575839
 
157015b
a575839
 
 
35b4c0e
 
 
6790790
d0290b6
6790790
35b4c0e
6790790
 
 
35b4c0e
 
 
 
 
 
 
 
 
 
 
 
 
 
a575839
35b4c0e
 
 
a575839
35b4c0e
d0290b6
6790790
d0290b6
6790790
d0290b6
35b4c0e
 
4f9c2ea
 
 
6790790
a7e2292
6790790
4f9c2ea
 
 
6790790
1714fcd
3b04ee1
4f9c2ea
1714fcd
4f9c2ea
 
 
 
4523ad3
3e6fd58
4f9c2ea
3e6fd58
4f9c2ea
 
91c14ed
4f9c2ea
91c14ed
4f9c2ea
 
 
3b04ee1
4f9c2ea
 
 
6790790
d0290b6
6790790
4f9c2ea
 
 
4523ad3
 
 
 
 
 
4f9c2ea
6790790
4f9c2ea
 
 
 
 
 
 
22d5f09
4f9c2ea
6790790
 
c6e36bd
6790790
c6e36bd
4f9c2ea
33ea647
4f9c2ea
 
 
 
1714fcd
6790790
 
 
4f9c2ea
 
 
 
1714fcd
 
 
 
 
 
 
 
 
 
 
6790790
 
 
1714fcd
 
 
 
 
3e6fd58
1714fcd
3e6fd58
1714fcd
91c14ed
 
1714fcd
 
 
 
 
 
 
 
 
 
 
 
 
 
a7e2292
1714fcd
 
 
 
 
 
6790790
 
 
4f9c2ea
 
33ea647
 
 
4f9c2ea
 
1714fcd
6790790
 
 
4f9c2ea
 
33ea647
 
 
4f9c2ea
 
1714fcd
6790790
 
 
33ea647
4f9c2ea
 
 
 
1714fcd
d2a8abe
6790790
 
 
1714fcd
22d5f09
 
1714fcd
 
22d5f09
d2a8abe
1714fcd
 
d2a8abe
 
 
6790790
 
 
d2a8abe
 
 
 
33ea647
d2a8abe
 
 
 
4f9c2ea
 
1714fcd
6790790
 
 
4f9c2ea
33ea647
1714fcd
33ea647
 
 
1714fcd
 
33ea647
4f9c2ea
 
 
1714fcd
 
 
 
 
 
 
 
 
 
33ea647
 
 
 
 
4f9c2ea
 
1714fcd
4f9c2ea
c6e36bd
6790790
c6e36bd
6790790
c6e36bd
a767c65
 
 
 
 
4f9c2ea
 
 
 
 
33ea647
 
4f9c2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03f9262
 
 
 
c6e36bd
03f9262
 
 
 
 
 
 
c6e36bd
 
4f9c2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03f9262
c6e36bd
03f9262
c6e36bd
 
4f9c2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03f9262
 
 
 
 
 
c6e36bd
 
4f9c2ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22d5f09
4f9c2ea
 
 
 
 
 
 
 
 
 
 
9784357
4f9c2ea
 
a767c65
2bc512d
7cd24de
03f9262
 
 
 
 
 
 
 
 
 
a767c65
03f9262
 
 
 
a767c65
 
 
 
 
1714fcd
 
a767c65
1714fcd
4f9c2ea
 
 
a767c65
1714fcd
d2a8abe
 
 
 
a767c65
1714fcd
a767c65
 
 
 
 
 
 
 
 
 
 
 
 
2bc512d
 
 
 
a767c65
2bc512d
 
1714fcd
 
a767c65
2bc512d
 
6790790
3b04ee1
4f9c2ea
9bc1648
35b4c0e
a767c65

#!/usr/bin/env python

import atexit  # To gracefully shut down the scheduler
import datetime
import html
import logging  # For logging purposes
from datetime import timezone

import datasets  # Ensure the datasets library is imported
import gradio as gr
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# --- Logging Configuration ---
# Configure the root logger at INFO level and create the logger used by
# every function in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Data Loading and Processing ---

# Hugging Face Hub client.
# NOTE(review): `api` is not referenced anywhere else in the visible file —
# confirm whether it is still needed.
api = HfApi()

def get_df() -> pd.DataFrame:
    """
    Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
    and adds a 'paper_page' link for each paper.

    Returns:
        The merged DataFrame in reversed row order, with 'date' normalised to
        'YYYY-MM-DD' strings, 'abstract' dropped and a 'paper_page' URL column
        added. An empty DataFrame (no columns) is returned if anything fails,
        so callers must be prepared for ``df.empty``.
    """
    # Used wherever a date is missing or cannot be parsed.
    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
    try:
        # Load datasets
        logger.info("Loading 'daily-papers' dataset.")
        df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
        logger.info("Loading 'daily-papers-stats' dataset.")
        df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

        # Merge datasets on 'arxiv_id'
        logger.info("Merging datasets on 'arxiv_id'.")
        df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))

        # Reverse the DataFrame to have the latest papers first
        df = df[::-1].reset_index(drop=True)

        # Ensure 'date' is a 'YYYY-MM-DD' string and handle missing dates.
        # The column check has to happen *before* the column is touched;
        # otherwise a missing column raises KeyError and the fallback never runs.
        logger.info("Processing 'date' column.")
        if 'date' in df.columns:
            parsed_dates = pd.to_datetime(df["date"], errors='coerce')
            # Unparseable values become NaT -> NaN after strftime -> today.
            df["date"] = parsed_dates.dt.strftime("%Y-%m-%d").fillna(today)
        else:
            logger.error("'date' column is missing from the DataFrame. Filling with current date.")
            df["date"] = today

        # Prepare the DataFrame by removing 'abstract'
        logger.info("Removing 'abstract' column if present.")
        if 'abstract' in df.columns:
            df = df.drop(columns=['abstract'])

        # Add 'paper_page' links
        logger.info("Adding 'paper_page' links.")
        df["paper_page"] = df["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")

        logger.info("DataFrame preparation complete.")
        return df
    except Exception as e:
        # Best-effort boundary: keep the app alive, but record the traceback.
        logger.exception("Error in get_df: %s", e)
        return pd.DataFrame()  # Return empty DataFrame on error


class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.

    Calling an instance with a raw DataFrame returns a new DataFrame whose
    columns are exactly ``REQUIRED_COLUMNS`` (in that order), one output row
    per input row.
    """
    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]

    @staticmethod
    def get_github_link(link: str) -> str:
        """
        Returns an HTML anchor labelled "github" for ``link``, or "" when there is no link.

        Missing values read from a DataFrame arrive as ``None`` or float NaN.
        NaN is truthy, so a plain ``not link`` test would render a link to
        "nan"; anything that is not a non-empty string is treated as missing.
        """
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Returns an HTML anchor that opens ``url`` in a new tab with ``text`` as its label."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Builds the display DataFrame from the raw one.

        Args:
            df: Raw papers DataFrame; missing columns fall back to defaults.

        Returns:
            A DataFrame with ``REQUIRED_COLUMNS``; empty (but with those
            columns) when ``df`` has no rows.
        """
        # Fallback for rows that carry no 'date' value at all.
        today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")

        new_rows = []
        for _, row in df.iterrows():
            # Handle date_display as a clickable link
            date_display = Prettifier.create_link(
                row.get("date", ""),
                f"https://huggingface.co/papers?date={row.get('date', '')}",
            )

            new_rows.append({
                "arxiv_id": row.get("arxiv_id", ""),  # Include arxiv_id
                "date_display": date_display,         # For display
                "date": row.get("date", today),       # For internal calculations
                "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
                "title": row.get("title", "No title"),
                "github": Prettifier.get_github_link(row.get("github", "")),
                "👍": row.get("upvotes", 0),
                "💬": row.get("num_comments", 0),
            })

        # Always pass the column list so an empty input still yields the
        # expected columns and callers selecting them do not hit a KeyError.
        return pd.DataFrame(new_rows, columns=self.REQUIRED_COLUMNS)


class PaperList:
    """
    Holds the raw papers DataFrame together with its prettified counterpart.

    ``COLUMN_INFO`` pairs each displayed column name with its datatype label;
    the prettified frame is restricted to those columns, in that order.
    """
    COLUMN_INFO = [
        ["arxiv_id", "str"],          # Added arxiv_id
        ["date_display", "markdown"], # For display
        ["date", "str"],              # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Stores ``df`` as the raw frame and derives the prettified frame from it."""
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Column names, in display order (first entry of each COLUMN_INFO pair)."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatype labels, aligned with ``column_names`` (second entry of each pair)."""
        return [dtype for _, dtype in self.COLUMN_INFO]

    def get_prettified_df(self) -> pd.DataFrame:
        """
        Gives back the stored prettified DataFrame (the same object, not a copy).
        """
        return self.df_prettified


# --- Sorting and Pagination Management ---

class PaperManager:
    """
    Manages sorting and pagination for the list of papers.

    Attributes:
        paper_list: The PaperList whose raw and prettified frames are re-ordered by sorting.
        papers_per_page: Number of papers rendered per page.
        sort_method: Current sort method, "hot" or "new".
        current_page: 1-based number of the page currently shown.
        total_pages: Number of pages available (never less than 1).
    """
    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"              # Default sort method
        self.sort_papers()
        # 'current_page' and 'total_pages' are set in 'sort_papers()'

    @staticmethod
    def _parse_published_time(date_str):
        """
        Parses a 'YYYY-MM-DD' string as midnight UTC.

        Falls back to the current time when the value is malformed (ValueError)
        or is not a string at all, e.g. None/NaN (TypeError), so one bad row
        cannot abort sorting or rendering.
        """
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (ValueError, TypeError):
            return datetime.datetime.now(timezone.utc)

    def calculate_score(self, row):
        """
        Calculate the score of a paper based on upvotes and age.
        This mimics the "hotness" algorithm used by platforms like Hacker News.

        Args:
            row: A raw-DataFrame row (or mapping) with 'upvotes' and 'date'.

        Returns:
            upvotes / (age_in_hours + 2) ** 1.5, or 0 when the base is not positive.
        """
        upvotes = row.get('upvotes', 0)  # Raw frame uses 'upvotes', not '👍'
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = PaperManager._parse_published_time(date_str)

        time_diff = datetime.datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours

        # A date far enough in the future makes the base non-positive; a
        # fractional power of that would not be a real number, so score it 0.
        base = time_diff_hours + 2
        return upvotes / (base ** 1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Sorts the papers based on the current sort method.

        Rewrites the paper list's raw and prettified frames in the new order,
        recomputes ``total_pages`` and resets ``current_page`` to 1.
        """
        df = self.paper_list.df_raw.copy()

        if df.empty:
            # Nothing to order. This also covers the column-less frame returned
            # when loading failed, where sorting by 'date' would raise KeyError.
            df_sorted = df
        elif self.sort_method == "hot":
            df = df.drop(columns=['score'], errors='ignore')  # Remove existing 'score' column if present
            df['score'] = df.apply(self.calculate_score, axis=1)
            df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
        else:
            df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[:, self.paper_list.column_names]
        self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1
        logger.info(f"Papers sorted by {self.sort_method}. Total pages: {self.total_pages}")

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new') and re-sorts the papers.

        Unknown methods fall back to 'hot'. ``time_frame`` is accepted for
        interface compatibility and is not used. Always returns True.
        """
        if method not in ["hot", "new"]:
            method = "hot"
        logger.info(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.

        Returns a placeholder ``<div>`` when the page holds no papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]

        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"

        # Rank by position within the slice. The slice keeps the frame's own
        # index labels (which already start at `start`), so adding `start` to
        # the label would count the page offset twice.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
            {papers_html}
        </table>
        """

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        Args:
            row: A prettified row (or mapping) with 'title', 'arxiv_id', '👍', '💬', 'date'.
            rank: 1-based position of the paper in the full sorted list.

        Returns:
            The table rows for this paper. The title and link target are
            HTML-escaped because they originate from external dataset content.
        """
        title = html.escape(str(row.get('title', 'No title')))
        paper_id = row.get('arxiv_id', '')
        url = html.escape(f"https://huggingface.co/papers/{paper_id}", quote=True)
        upvotes = row.get('👍', 0)
        comments = row.get('💬', 0)
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = PaperManager._parse_published_time(date_str)
        time_ago_days = (datetime.datetime.now(timezone.utc) - published_time).days
        if time_ago_days <= 0:
            time_ago = "today"
        elif time_ago_days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = f"{time_ago_days} days ago"

        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                {time_ago} | <a href="#">{comments} comments</a>
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible and returns the resulting page's HTML.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the last page.")
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible and returns the resulting page's HTML.
        """
        if self.current_page > 1:
            self.current_page -= 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the first page.")
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Refreshes the current list of papers by re-sorting; this returns to page 1.
        """
        logger.info("Refreshing papers.")
        self.sort_papers()
        return self.get_current_page_papers()


# Initialize PaperList and PaperManager
def initialize_paper_manager() -> str:
    """
    Builds a fresh PaperList/PaperManager pair from the current data.

    Returns the HTML of the new manager's current page rather than the manager
    itself; the module-level ``paper_manager`` is not touched here.
    """
    frame = get_df()
    if frame.empty:
        logger.warning("Initialized with an empty DataFrame.")
    fresh_manager = PaperManager(PaperList(frame))
    logger.info("PaperManager initialized.")
    return fresh_manager.get_current_page_papers()


paper_manager = None  # Initialize globally

def setup_paper_manager():
    """
    Creates the module-level PaperManager from freshly loaded data.

    Rebinds the global ``paper_manager``; returns nothing.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))
    logger.info("PaperManager setup complete.")


# Initialize PaperManager at the start
setup_paper_manager()


def update_paper_manager() -> str:
    """
    Reloads the data and swaps it into the global PaperManager.

    When the reload yields an empty DataFrame the existing list is kept.
    Either way, the HTML for the manager's current page is returned.
    """
    global paper_manager
    logger.info("Updating PaperManager with latest data.")
    latest = get_df()
    if not latest.empty:
        paper_manager.paper_list = PaperList(latest)
        paper_manager.sort_papers()
        logger.info("PaperManager updated successfully.")
    else:
        logger.warning("DataFrame is empty. Skipping update.")
    return paper_manager.get_current_page_papers()


# Scheduler for updating paper list every hour
# The job runs update_paper_manager on a background scheduler; its return
# value (an HTML string) is not used by the scheduler call below.
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    func=update_paper_manager,
    trigger="cron",
    minute=0,  # Every hour at minute 0
    timezone="UTC",
    # NOTE(review): presumably a grace window in seconds for late runs —
    # confirm against the APScheduler documentation.
    misfire_grace_time=60,
)
scheduler_data.start()
logger.info("BackgroundScheduler started.")

# Ensure the scheduler shuts down gracefully on exit
atexit.register(lambda: scheduler_data.shutdown())
logger.info("Scheduler shutdown registered.")


# --- Gradio Interface Functions ---

def change_sort_method_ui(method: str) -> str:
    """
    Applies the user's chosen sort method to the global PaperManager.

    The method name is lower-cased before being handed over. Returns the HTML
    of the current page, or an error ``<div>`` if the manager reports failure.
    """
    logger.info(f"Changing sort method to: {method}")
    if not paper_manager.set_sort_method(method.lower()):
        return "<div class='no-papers'>Failed to change sort method.</div>"
    return paper_manager.get_current_page_papers()


# --- CSS Styling ---

# Stylesheet passed to gr.Blocks(css=...). It covers the header table, the
# paper list (.itemlist/.athing/.rank/.storylink/.subtext), the empty-state
# message (.no-papers), a narrow-screen breakpoint and a dark-mode variant.
# NOTE(review): the `.header-table .sort-buttons button` rules only match
# buttons nested inside the header table, while the Hot/New buttons are
# created in a separate gr.Column with class "sort-buttons" — confirm these
# selectors still apply.
css = """
/* Hacker News-like CSS */

body {
    background-color: white;
    font-family: Verdana, Geneva, sans-serif;
    margin: 0;
    padding: 0;
}

a {
    color: #0000ff;
    text-decoration: none;
}

a:visited {
    color: #551A8B;
}

.container {
    width: 85%;
    margin: auto;
}

table {
    width: 100%;
}

.header-table {
    width: 100%;
    background-color: #ff6600;
    padding: 2px 10px;
}

.header-table a {
    color: black;
    font-weight: bold;
    font-size: 14pt;
    text-decoration: none;
}

.header-table .sort-buttons button {
    background: none;
    border: none;
    color: #0000ff;
    cursor: pointer;
    font-size: 14pt;
    text-decoration: underline;
    padding: 0 10px;
}

.header-table .sort-buttons button:hover {
    color: #551A8B;
}

.itemlist .athing {
    background-color: #f6f6ef;
}

.rank {
    font-size: 14pt;
    color: #828282;
    padding-right: 5px;
}

.storylink {
    font-size: 10pt;
}

.subtext {
    font-size: 8pt;
    color: #828282;
    padding-left: 40px;
}

.subtext a {
    color: #828282;
    text-decoration: none;
}

.no-papers {
    text-align: center;
    color: #828282;
    padding: 1rem;
    font-size: 14pt;
}

@media (max-width: 640px) {
    .header-table a {
        font-size: 12pt;
    }

    .sort-buttons button {
        font-size: 12pt;
        padding: 0 5px;
    }

    .storylink {
        font-size: 9pt;
    }

    .subtext {
        font-size: 7pt;
    }
}

/* Dark mode */
@media (prefers-color-scheme: dark) {
    body {
        background-color: #121212;
        color: #e0e0e0;
    }

    a {
        color: #add8e6;
    }

    a:visited {
        color: #9370db;
    }

    .header-table {
        background-color: #ff6600;
    }

    .header-table a {
        color: black;
    }

    .header-table .sort-buttons button {
        color: #add8e6;
    }

    .header-table .sort-buttons button:hover {
        color: #9370db;
    }

    .itemlist .athing {
        background-color: #1e1e1e;
    }

    .rank {
        color: #b0b0b0;
    }

    .subtext {
        color: #b0b0b0;
    }

    .subtext a {
        color: #b0b0b0;
    }

    .no-papers {
        color: #b0b0b0;
    }
}
"""


# --- Initialize Gradio Blocks ---

# Builds the UI: a collapsible submission hint, a header row with the Hot/New
# sort buttons, the HTML paper list, and Prev/Next pagination buttons.
# NOTE(review): every handler below operates on the single module-level
# `paper_manager`, whose current page and sort method live on that one
# object — so this state looks shared between all visitors; confirm intended.
demo = gr.Blocks(css=css)

with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
            **Submit the paper to Daily Papers:**
            [https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

            Once your paper is submitted, it will automatically appear in this demo.
            """)
        
        # Hacker News-like Header with "Hot" and "New" sort options
        with gr.Row():
            # Left side: Site title
            gr.Markdown("""
                <table border="0" cellpadding="0" cellspacing="0" class="header-table">
                    <tr>
                        <td>
                            <span class="pagetop">
                                <b class="hnname"><a href="#">Daily Papers</a></b>
                            </span>
                        </td>
                        <td align="right" class="sort-buttons">
                            <!-- Removed custom HTML buttons -->
                        </td>
                    </tr>
                </table>
            """, show_label=False)
            # Right side: Gradio Buttons for "Hot" and "New"
            with gr.Column(elem_classes=["sort-buttons"]):
                hot_button = gr.Button("Hot", elem_id="hot_button")
                new_button = gr.Button("New", elem_id="new_button")
        
        # Paper list
        # Output component that every handler below writes its HTML into.
        paper_list = gr.HTML()
        
        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")
    
    # Load papers on app start
    # The lambdas look up the global `paper_manager` when they are called,
    # not when they are defined.
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list]
    )
    
    # Button clicks for pagination
    prev_button.click(
        fn=lambda: paper_manager.prev_page(),
        inputs=[],
        outputs=[paper_list]
    )
    next_button.click(
        fn=lambda: paper_manager.next_page(),
        inputs=[],
        outputs=[paper_list]
    )
    
    # Gradio Buttons trigger sort methods directly
    # Re-sorting resets the manager to page 1 (see PaperManager.sort_papers).
    hot_button.click(
        fn=lambda: change_sort_method_ui("hot"),
        inputs=[],
        outputs=[paper_list]
    )
    new_button.click(
        fn=lambda: change_sort_method_ui("new"),
        inputs=[],
        outputs=[paper_list]
    )
    
    # Footer - Removed as per request
    # Removed the footer markdown section


# --- Launch the App ---

# Only start the server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()