dailypapershackernews-dev

Running

File size: 21,036 Bytes

#!/usr/bin/env python

import datetime
import html
import operator
from datetime import timezone  # Ensure timezone is imported

import datasets
import gradio as gr
import pandas as pd
import requests
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_calendar import Calendar
from huggingface_hub import HfApi

# --- Data Loading and Processing ---

# Hugging Face Hub API client. NOTE(review): not referenced again in the code visible here.
api = HfApi()

def get_df() -> pd.DataFrame:
    """
    Load the papers and stats datasets and merge them into a single DataFrame.

    Returns:
        A DataFrame with one row per paper present in both datasets (inner join on
        'arxiv_id'), in reverse dataset order, where:
          * 'date' is a "YYYY-MM-DD" string (unparseable/missing dates become today's UTC date),
          * the 'abstract' column, if any, has been dropped,
          * 'paper_page' holds the paper's URL on huggingface.co.
    """
    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

    # Inner join: papers without stats (or vice versa) are dropped
    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")

    # Reverse the row order (presumably so the most recently added papers come first)
    df = df[::-1].reset_index(drop=True)

    # Normalise 'date' to a string; anything that fails to parse falls back to today (UTC)
    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
    parsed_dates = pd.to_datetime(df["date"], errors="coerce")
    df["date"] = parsed_dates.dt.strftime("%Y-%m-%d").fillna(today)

    # Drop the column in one vectorised call instead of copying every row in a Python loop;
    # this also keeps the column schema intact when the merge result is empty.
    df_prepared = df.drop(columns=["abstract"], errors="ignore")

    df_prepared["paper_page"] = df_prepared["arxiv_id"].map("https://huggingface.co/papers/{}".format)

    return df_prepared


class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.

    Calling an instance with a raw DataFrame returns a new DataFrame whose columns are
    exactly ``COLUMNS`` (in that order), even when the input has no rows.
    """

    # Output schema; passed explicitly so an empty input still yields these columns.
    COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return an HTML link labelled "github", or "" when there is no usable URL."""
        # Missing values can arrive as None or NaN rather than ""; NaN is truthy,
        # so a plain `not link` check would render a link pointing at "nan".
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an anchor tag for `url` that opens in a new tab and shows `text`."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the display table from raw paper rows.

        Missing fields fall back to defaults: today's UTC date, "No title", empty
        arxiv id / github link, "#" as the page URL, and 0 for both counters.
        """
        today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
        new_rows = []
        for _, row in df.iterrows():
            # Read the date once, with the same fallback for both the link and the raw value
            date = row.get("date", today)
            arxiv_id = row.get("arxiv_id", "")

            new_rows.append(
                {
                    "arxiv_id": arxiv_id,
                    # Clickable date that opens that day's papers page
                    "date_display": Prettifier.create_link(date, f"https://huggingface.co/papers?date={date}"),
                    # Plain date string kept for internal calculations
                    "date": date,
                    "paper_page": Prettifier.create_link(arxiv_id, row.get("paper_page", "#")),
                    "title": row.get("title", "No title"),
                    "github": Prettifier.get_github_link(row.get("github", "")),
                    "👍": row.get("upvotes", 0),
                    "💬": row.get("num_comments", 0),
                }
            )
        return pd.DataFrame(new_rows, columns=self.COLUMNS)


class PaperList:
    """
    Holds the raw paper DataFrame and its prettified counterpart, and offers title search.

    Attributes:
        df_raw: the DataFrame passed to the constructor.
        df_prettified: the prettified table restricted to the columns in COLUMN_INFO.
    """
    # [column name, display datatype] pairs, in display order
    COLUMN_INFO = [
        ["arxiv_id", "str"],          # Added arxiv_id
        ["date_display", "markdown"], # For display
        ["date", "str"],              # For internal use
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettify(df)

    @property
    def column_names(self):
        """Column names, in display order."""
        return [name for name, _ in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Display datatype of each column, aligned with `column_names`."""
        return [datatype for _, datatype in self.COLUMN_INFO]

    def _prettify(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prettify `df` and project it onto the display columns."""
        # reindex (rather than .loc) so that an empty result lacking some columns
        # yields an empty table with the full schema instead of raising KeyError.
        return self._prettifier(df).reindex(columns=self.column_names)

    def search(
        self,
        title_search_query: str,
        max_num_to_retrieve: int = 1000,  # Set a high default to include all if not specified
    ) -> pd.DataFrame:
        """
        Return the prettified papers whose title contains the query.

        Args:
            title_search_query: text to look for, case-insensitively; a falsy value
                disables filtering. The text is matched literally, so characters such
                as "(" or "+" in user input cannot raise a regex error.
            max_num_to_retrieve: keep at most this many leading rows; a falsy value
                disables the limit.
        """
        df = self.df_raw.copy()

        if title_search_query:
            matches = df["title"].str.contains(title_search_query, case=False, na=False, regex=False)
            df = df[matches]

        if max_num_to_retrieve:
            df = df.head(max_num_to_retrieve)

        return self._prettify(df)


# --- Sorting and Pagination Management ---

class PaperManager:
    """
    Manages sorting, pagination, and search queries for the list of papers.

    The manager treats ``paper_list.df_raw`` as the read-only source of all papers and
    stores the current (filtered, sorted) view in ``paper_list.df_prettified``.

    Attributes:
        paper_list: object exposing ``df_raw``, ``df_prettified``, ``_prettifier`` and ``column_names``.
        papers_per_page: number of papers rendered per page.
        sort_method: one of "hot", "new", "top".
        current_search_query: title substring filter ("" disables it).
        top_time_frame: look-back window used by the "top" sort.
        current_page / total_pages: 1-based pagination state, reset by ``sort_papers()``.
    """

    # Look-back window for each "top" time frame; None means "do not filter by date".
    _TOP_WINDOWS = {
        "day": datetime.timedelta(days=1),
        "week": datetime.timedelta(weeks=1),
        "month": datetime.timedelta(days=30),
        "year": datetime.timedelta(days=365),
        "all time": None,
    }

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"              # Default sort method
        self.current_search_query = ""        # Initialize with no search query
        self.top_time_frame = "all time"      # Default time frame for "Top" sorting
        self.sort_papers()
        # 'current_page' and 'total_pages' are set in 'sort_papers()'

    @staticmethod
    def _parse_published_time(date_str):
        """Parse a "YYYY-MM-DD" string as midnight UTC; fall back to the current time."""
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (ValueError, TypeError):
            # Unparseable or non-string value: use "now" to minimise the impact on sorting
            return datetime.datetime.now(timezone.utc)

    def calculate_score(self, row):
        """
        Calculate the "hotness" of a paper: upvotes / (age_in_hours + 2) ** 1.5.

        `row` may be a raw row (upvotes under 'upvotes') or a prettified row
        (upvotes under '👍'); anything offering ``.get`` works.
        """
        # Raw rows carry the count under 'upvotes'; reading only '👍' made every score 0.
        upvotes = row.get('👍', row.get('upvotes', 0))
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = self._parse_published_time(date_str)

        time_diff = datetime.datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours

        # A date more than two hours in the future would give a negative base; score it 0
        score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0
        return score

    def sort_papers(self):
        """
        Rebuild the current view from ``paper_list.df_raw`` using the active search
        query and sort method, then reset pagination to page 1.

        ``paper_list.df_raw`` is left untouched so that clearing a search or widening
        the time frame brings the filtered-out papers back.
        """
        df = self.paper_list.df_raw.copy()

        # Apply search filter if a search query exists (literal, case-insensitive match)
        if self.current_search_query:
            matches = df["title"].str.contains(self.current_search_query, case=False, na=False, regex=False)
            df = df[matches]

        if self.sort_method == "hot":
            # Built as a list rather than df.apply(axis=1): apply returns a DataFrame
            # for an empty frame, which cannot be assigned to a single column.
            scores = [self.calculate_score(row) for _, row in df.iterrows()]
            df_sorted = df.assign(score=scores).sort_values(by='score', ascending=False).drop(columns=['score'])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
        elif self.sort_method == "top":
            # Unknown time frames behave like "all time"
            window = self._TOP_WINDOWS.get(self.top_time_frame)
            df_sorted = df
            if window is not None:
                time_threshold = datetime.datetime.now(timezone.utc) - window
                date_parsed = pd.to_datetime(df['date'], errors='coerce').dt.tz_localize(timezone.utc)
                df_sorted = df[date_parsed >= time_threshold]
            # For "all time" the date filter is skipped entirely instead of comparing
            # against datetime.min (which may be outside pandas' Timestamp range).
            df_sorted = df_sorted.sort_values(by='upvotes', ascending=False)
        else:
            df_sorted = df

        view = df_sorted.reset_index(drop=True)
        # reindex tolerates a prettifier that returns a column-less frame for empty input
        self.paper_list.df_prettified = self.paper_list._prettifier(view).reindex(
            columns=self.paper_list.column_names
        )
        self.total_pages = max((len(view) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
        Unknown methods fall back to 'hot'. If 'top' is selected and a time frame
        is given, the time frame is stored (lower-cased) as well.
        """
        if method not in ["hot", "new", "top"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        if method == "top" and time_frame:
            self.top_time_frame = time_frame.lower()
            print(f"Setting top time frame to: {self.top_time_frame}")
        self.sort_papers()
        return True  # Assume success

    def set_search_query(self, query: str):
        """
        Sets the current search query and re-sorts the papers.
        """
        print(f"Setting search query to: {query}")
        self.current_search_query = query
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers, or a
        "no papers" placeholder when the page is empty.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]

        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"

        # Rank is the 1-based position in the whole view. Enumerate positionally:
        # the index labels already include the page offset, so adding `start` to
        # them numbered page 2 from 61 instead of 31.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
            {papers_html}
        </table>
        """

    def format_paper(self, row, rank):
        """
        Formats a single prettified paper row into HTML table rows.

        Title and arxiv id are HTML-escaped because they come from the dataset.
        """
        title = html.escape(str(row.get('title', 'No title')))
        paper_id = html.escape(str(row.get('arxiv_id', '')))
        url = f"https://huggingface.co/papers/{paper_id}"
        upvotes = row.get('👍', 0)
        comments = row.get('💬', 0)
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = self._parse_published_time(date_str)
        time_ago_days = (datetime.datetime.now(timezone.utc) - published_time).days
        if time_ago_days > 1:
            time_ago = f"{time_ago_days} days ago"
        elif time_ago_days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = "today"

        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                {time_ago} | <a href="#">{comments} comments</a>
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible and returns that page's HTML.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible and returns that page's HTML.
        """
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Re-applies the current search and sort (which returns to page 1) and
        returns the resulting page's HTML.
        """
        self.sort_papers()
        return self.get_current_page_papers()


# Initialize PaperList and PaperManager
def initialize_paper_manager() -> str:
    """
    Build a fresh PaperList/PaperManager pair from newly loaded data and render page one.

    The manager is local to this call; only the rendered HTML string is returned.
    """
    fresh_manager = PaperManager(PaperList(get_df()))
    first_page_html = fresh_manager.get_current_page_papers()
    return first_page_html


# Module-wide PaperManager used by the UI callbacks and the scheduled refresh job;
# assigned by setup_paper_manager().
paper_manager = None  # Initialize globally


def setup_paper_manager():
    """
    Load the data and bind a new PaperManager to the module-level `paper_manager`.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))


# Populate the global PaperManager at import time so the callbacks defined below
# always have an instance to work with (this loads the datasets via get_df()).
setup_paper_manager()


def update_paper_manager() -> str:
    """
    Reload the data into the existing global PaperManager and return the first page.

    The manager object itself is kept (only its `paper_list` is swapped), and
    `sort_papers()` re-applies the current search/sort and resets to page 1.
    """
    reloaded_list = PaperList(get_df())
    paper_manager.paper_list = reloaded_list
    paper_manager.sort_papers()
    page_html = paper_manager.get_current_page_papers()
    return page_html


# Background job: reload the paper list at minute 0 of every hour (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_manager,
    "cron",
    misfire_grace_time=60,
    minute=0,
    timezone="UTC",
)
scheduler_data.start()


# --- Gradio Interface Functions ---

def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
    """
    Apply the chosen sort method (case-insensitive) and return the re-rendered page.

    The time frame is forwarded to the manager only when the method is 'top'.
    """
    chosen = method.lower()
    extra_args = (time_frame,) if chosen == "top" else ()
    paper_manager.set_sort_method(chosen, *extra_args)
    return paper_manager.get_current_page_papers()


def refresh_papers_ui() -> str:
    """
    Ask the global PaperManager to refresh and return the HTML it produces.
    """
    refreshed_html = paper_manager.refresh()
    return refreshed_html


def search_papers_ui(query: str) -> str:
    """
    Store `query` as the active title search and return the resulting page HTML.
    """
    paper_manager.set_search_query(query)
    result_html = paper_manager.get_current_page_papers()
    return result_html


def clear_search_ui() -> str:
    """
    Reset the active title search to the empty string and return the resulting page HTML.
    """
    no_query = ""
    paper_manager.set_search_query(no_query)
    return paper_manager.get_current_page_papers()


# --- CSS Styling ---

# Stylesheet passed to gr.Blocks(css=...). It styles the header bar (.header-table),
# the paper table produced by PaperManager (.itemlist, .athing, .rank, .storylink,
# .subtext), the #refresh-button element and the empty-page placeholder (.no-papers),
# with overrides for narrow screens and for dark colour schemes.
css = """
/* Existing CSS remains unchanged */
body {
    background-color: white;
    font-family: Verdana, Geneva, sans-serif;
    margin: 0;
    padding: 0;
}

a {
    color: #0000ff;
    text-decoration: none;
}

a:visited {
    color: #551A8B;
}

.container {
    width: 85%;
    margin: auto;
}

table {
    width: 100%;
}

.header-table {
    width: 100%;
    background-color: #ff6600;
    padding: 2px 10px;
}

.header-table a {
    color: black;
    font-weight: bold;
    font-size: 14pt;
    text-decoration: none;
}

.itemlist .athing {
    background-color: #f6f6ef;
}

.rank {
    font-size: 14pt;
    color: #828282;
    padding-right: 5px;
}

.storylink {
    font-size: 10pt;
}

.subtext {
    font-size: 8pt;
    color: #828282;
    padding-left: 40px;
}

.subtext a {
    color: #828282;
    text-decoration: none;
}

#refresh-button {
    background: none;
    border: none;
    color: black;
    font-weight: bold;
    font-size: 14pt;
    cursor: pointer;
}

.no-papers {
    text-align: center;
    color: #828282;
    padding: 1rem;
    font-size: 14pt;
}

@media (max-width: 640px) {
    .header-table a {
        font-size: 12pt;
    }

    .storylink {
        font-size: 9pt;
    }

    .subtext {
        font-size: 7pt;
    }
}

/* Dark mode */
@media (prefers-color-scheme: dark) {
    body {
        background-color: #121212;
        color: #e0e0e0;
    }

    a {
        color: #add8e6;
    }

    a:visited {
        color: #9370db;
    }

    .header-table {
        background-color: #ff6600;
    }

    .header-table a {
        color: black;
    }

    .itemlist .athing {
        background-color: #1e1e1e;
    }

    .rank {
        color: #b0b0b0;
    }

    .subtext {
        color: #b0b0b0;
    }

    .subtext a {
        color: #b0b0b0;
    }

    #refresh-button {
        color: #e0e0e0;
    }

    .no-papers {
        color: #b0b0b0;
    }
}
"""

# --- Initialize Gradio Blocks ---

# UI layout and event wiring.
#
# Notes on the wiring choices:
#   * The Refresh control is a real gr.Button (carrying the "refresh-button" id that the
#     stylesheet targets). A <script> tag placed inside gr.HTML is not a reliable way to
#     attach a click handler, and the hidden proxy button it relied on may not be in the DOM.
#   * The time-frame dropdown is shown/hidden from Python via gr.update(visible=...).
#     The radio values are "Hot"/"New"/"Top", so the check compares against "Top".
#   * Changing the time frame re-sorts immediately; previously it only took effect the
#     next time the sort radio changed.
demo = gr.Blocks(css=css)

with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
            **Submit the paper to Daily Papers:**
            [https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

            Once your paper is submitted, it will automatically appear in this demo.
            """)
        # Header with Refresh Button
        with gr.Row():
            gr.HTML("""
            <table border="0" cellpadding="0" cellspacing="0" class="header-table">
                <tr>
                    <td>
                        <span class="pagetop">
                            <b class="hnname"><a href="#">Daily Papers</a></b>
                        </span>
                    </td>
                </tr>
            </table>
            """)
            refresh_button = gr.Button("Refresh", elem_id="refresh-button")
        # Search Bar and Clear Search Button
        with gr.Row():
            search_box = gr.Textbox(
                label="Search Papers by Title",
                placeholder="Enter keywords to search...",
                lines=1,
                interactive=True
            )
            search_button = gr.Button("Search")
            clear_search_button = gr.Button("Clear Search")
        # Sort Options and Time Frame (shown only while "Top" is selected)
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New", "Top"],
                value="Hot",
                label="Sort By",
                interactive=True
            )
            time_frame_dropdown = gr.Dropdown(
                choices=["day", "week", "month", "year", "all time"],
                value="all time",
                label="Time Frame for Top",
                visible=False,
                interactive=True
            )
        # Paper list
        paper_list = gr.HTML()
        # Navigation Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

    # Load papers on app start
    demo.load(
        fn=lambda: paper_manager.get_current_page_papers(),
        outputs=[paper_list]
    )

    # Button clicks for pagination
    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
    next_button.click(paper_manager.next_page, outputs=[paper_list])

    # Refresh: reload the datasets and re-render the first page
    refresh_button.click(update_paper_manager, outputs=[paper_list])

    # Sort option change: toggle the time-frame dropdown...
    sort_radio.change(
        fn=lambda method: gr.update(visible=(method == "Top")),
        inputs=[sort_radio],
        outputs=[time_frame_dropdown]
    )

    # ...and re-sort the list
    sort_radio.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list]
    )

    # Time frame change: re-sort with the newly selected window
    time_frame_dropdown.change(
        fn=change_sort_method_ui,
        inputs=[sort_radio, time_frame_dropdown],
        outputs=[paper_list]
    )

    # Search functionality
    search_button.click(
        fn=search_papers_ui,
        inputs=[search_box],
        outputs=[paper_list]
    )

    # Clear search functionality
    clear_search_button.click(
        fn=clear_search_ui,
        inputs=None,
        outputs=[paper_list]
    )

    # Footer
    gr.Markdown("""
    Related useful Spaces:
    - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
    - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
    - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
    """)


# --- Launch the App ---

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()