#!/usr/bin/env python import datetime import operator import pandas as pd import tqdm.auto from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import HfApi import gradio as gr from gradio_calendar import Calendar import datasets import requests from datetime import timezone # Ensure timezone is imported # --- Data Loading and Processing --- api = HfApi() INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index" INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/" # Removed ragatouille and abstract_retriever initialization # If INDEX_REPO_ID is not used elsewhere, consider removing related lines # Removed abstract_retriever initialization and search def update_abstract_index() -> None: """ Removed abstract_retriever update functionality since ragatouille is no longer used. """ pass # No operation needed # Scheduler for updating abstract index every hour # Removed scheduler_abstract as it's no longer necessary # If INDEX_REPO_ID is not used elsewhere, consider removing the download # Optionally, remove the snapshot_download if the index is not needed # api.snapshot_download( # repo_id=INDEX_REPO_ID, # repo_type="dataset", # local_dir=INDEX_DIR_PATH, # ) # --- DataFrame Preparation --- def get_df() -> pd.DataFrame: # Load and merge datasets df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas() df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas() df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id") # Reverse the DataFrame to have the latest papers first df = df[::-1].reset_index(drop=True) # Ensure 'date' is in datetime format and handle missing dates df["date"] = pd.to_datetime(df["date"], errors='coerce') df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # Prepare the DataFrame by removing 'abstract' and adding 'paper_page' paper_info = [] for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)): info = row.copy() if "abstract" in info: del info["abstract"] info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}" paper_info.append(info) return pd.DataFrame(paper_info) class Prettifier: @staticmethod def get_github_link(link: str) -> str: if not link: return "" return Prettifier.create_link("github", link) @staticmethod def create_link(text: str, url: str) -> str: return f'{text}' def __call__(self, df: pd.DataFrame) -> pd.DataFrame: new_rows = [] for _, row in df.iterrows(): # Handle authors: list of dicts or list of strings if "authors" in row and isinstance(row["authors"], list): authors = ', '.join([ author.get('name', '') if isinstance(author, dict) else str(author) for author in row["authors"] ]) else: authors = 'Unknown' # Handle published_at: original date published_at = row["date"] # Already formatted as "%Y-%m-%d" # Handle date link date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}") new_row = { "arxiv_id": row["arxiv_id"], # Include arxiv_id "date_display": date_display, # For display "published_at": published_at, # For internal calculations "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page), "title": row["title"], "authors": authors, # Include authors "github": Prettifier.get_github_link(row.get("github", "")), "👍": row["upvotes"], "💬": row["num_comments"], } new_rows.append(new_row) return pd.DataFrame(new_rows) class PaperList: COLUMN_INFO = [ ["arxiv_id", "str"], # Added arxiv_id ["date_display", "markdown"],# For display ["published_at", "str"], # For internal use ["paper_page", "markdown"], ["title", "str"], ["authors", "str"], # Added authors ["github", "markdown"], ["👍", "number"], ["💬", "number"], ] def __init__(self, df: pd.DataFrame): self.df_raw = df self._prettifier = Prettifier() self.df_prettified = self._prettifier(df).loc[:, self.column_names] @property def column_names(self): return list(map(operator.itemgetter(0), self.COLUMN_INFO)) @property def column_datatype(self): return list(map(operator.itemgetter(1), self.COLUMN_INFO)) def search( self, start_date: datetime.datetime, end_date: datetime.datetime, title_search_query: str, max_num_to_retrieve: int, ) -> pd.DataFrame: df = self.df_raw.copy() df["date"] = pd.to_datetime(df["date"], errors='coerce') # Filter by date df = df[(df["date"] >= start_date) & (df["date"] <= end_date)] df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # Filter by title if title_search_query: df = df[df["title"].str.contains(title_search_query, case=False, na=False)] # Removed abstract_search_query filtering since ragatouille is no longer used # Prettify the DataFrame df_prettified = self._prettifier(df).loc[:, self.column_names] return df_prettified # --- Sorting and Pagination Management --- class PaperManager: def __init__(self, paper_list: PaperList, papers_per_page=30): self.paper_list = paper_list self.papers_per_page = papers_per_page self.sort_method = "hot" # Default sort method self.sort_papers() # 'current_page' and 'total_pages' are set in 'sort_papers()' def calculate_score(self, row): """ Calculate the score of a paper based on upvotes and age. This mimics the "hotness" algorithm used by platforms like Hacker News. """ upvotes = row.get('👍', 0) published_at_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # **FIX** Changed from 'published_at' to 'date' try: published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc) except ValueError: # If parsing fails, use current time to minimize the impact on sorting published_time = datetime.datetime.now(timezone.utc) time_diff = datetime.datetime.now(timezone.utc) - published_time time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours # Avoid division by zero and apply the hotness formula score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0 return score def sort_papers(self): df = self.paper_list.df_raw.copy() if self.sort_method == "hot": df['score'] = df.apply(self.calculate_score, axis=1) df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score']) elif self.sort_method == "new": df_sorted = df.sort_values(by='date', ascending=False) # **FIX** Changed from 'published_at' to 'date' else: df_sorted = df self.paper_list.df_raw = df_sorted.reset_index(drop=True) self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[:, self.paper_list.column_names] self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1) self.current_page = 1 def set_sort_method(self, method): if method not in ["hot", "new"]: method = "hot" print(f"Setting sort method to: {method}") self.sort_method = method self.sort_papers() return True # Assume success def get_current_page_papers(self) -> str: start = (self.current_page - 1) * self.papers_per_page end = start + self.papers_per_page current_papers = self.paper_list.df_prettified.iloc[start:end] if current_papers.empty: return "
Daily Papers |