|
|
|
|
|
import atexit
import datetime
import html
import logging
from datetime import timezone

import datasets
import gradio as gr
import pandas as pd
import requests
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
|
|
|
|
|
# Route INFO-and-above records through the root handler.
logging.basicConfig(level=logging.INFO)

# Module-level logger used by the functions and classes in this file.
logger = logging.getLogger(__name__)

# Hugging Face Hub client, created once when the module is imported.
api = HfApi()
|
|
|
def get_repo_counts(arxiv_id: str) -> dict:
    """
    Count the models, datasets, and Spaces linked to an arXiv paper.

    Sends a GET request (10-second timeout) to
    ``https://huggingface.co/api/arxiv/{arxiv_id}/repos``.

    Args:
        arxiv_id: arXiv identifier of the paper.

    Returns:
        A dict with integer ``models_count``, ``datasets_count`` and
        ``spaces_count`` entries. All three are 0 when the request fails,
        the body is not valid JSON, or the payload is not a JSON object.
    """
    counts = {
        'models_count': 0,
        'datasets_count': 0,
        'spaces_count': 0
    }
    url = f"https://huggingface.co/api/arxiv/{arxiv_id}/repos"
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError also covers JSON decode failures on requests versions
        # whose decode error is not a RequestException subclass.
        logger.error(f"Error fetching repo counts for {arxiv_id}: {e}")
        return counts

    if not isinstance(data, dict):
        # A list/str/null payload has no .get(); treat it as "no linked repos".
        logger.error(
            f"Error fetching repo counts for {arxiv_id}: "
            f"unexpected payload type {type(data).__name__}"
        )
        return counts

    for kind in ('models', 'datasets', 'spaces'):
        # `or []` guards against an explicit null value for the key.
        counts[f'{kind}_count'] = len(data.get(kind) or [])
    return counts
|
|
|
def get_df() -> pd.DataFrame:
    """
    Build the papers DataFrame used by the UI.

    Loads the 'daily-papers' and 'daily-papers-stats' datasets, merges them on
    'arxiv_id', reverses the row order, normalizes 'date' to ``YYYY-MM-DD``
    strings (unparseable dates become today's UTC date), drops the 'abstract'
    column, attaches 'models_count' / 'datasets_count' / 'spaces_count' from
    `get_repo_counts` (one HTTP request per paper), and adds a 'paper_page' URL.

    Returns:
        The prepared DataFrame, or an empty DataFrame if any step raises.
    """
    try:
        logger.info("Loading 'daily-papers' dataset.")
        df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
        logger.info("Loading 'daily-papers-stats' dataset.")
        df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

        logger.info("Merging datasets on 'arxiv_id'.")
        df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))

        # Reverse the merged rows so the last dataset entries come first.
        df = df[::-1].reset_index(drop=True)

        logger.info("Processing 'date' column.")
        today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
        df["date"] = pd.to_datetime(df["date"], errors='coerce')
        df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)

        logger.info("Removing 'abstract' column if present.")
        df = df.drop(columns=["abstract"], errors="ignore")

        zero_counts = {
            'models_count': 0,
            'datasets_count': 0,
            'spaces_count': 0
        }
        paper_info = []
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            # Work on a plain dict: Series.update() only overwrites labels that
            # already exist, so the count keys would otherwise be dropped.
            info = row.to_dict()
            arxiv_id = info.get("arxiv_id", "")
            info.update(get_repo_counts(arxiv_id) if arxiv_id else zero_counts)
            paper_info.append(info)
        df_prepared = pd.DataFrame(paper_info)

        logger.info("Adding 'paper_page' links.")
        df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")

        logger.info("DataFrame preparation complete.")
        return df_prepared
    except Exception as e:
        # Boundary handler: callers treat an empty frame as "no data", so log
        # the full traceback here instead of propagating.
        logger.exception(f"Error in get_df: {e}")
        return pd.DataFrame()
|
|
|
|
|
class Prettifier:
    """
    Converts raw DataFrame rows into a prettified format suitable for display.

    Calling an instance with a raw papers DataFrame returns a new DataFrame
    whose columns are exactly `REQUIRED_COLUMNS`, with link columns rendered
    as HTML anchors.
    """

    # Columns (and their order) of every DataFrame returned by __call__.
    REQUIRED_COLUMNS = [
        "arxiv_id", "date_display", "date", "paper_page",
        "title", "github", "👍", "💬",
        "models_count", "datasets_count", "spaces_count"
    ]

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return a "github" anchor for `link`, or "" when there is no usable URL."""
        # Missing cells arrive as None or float NaN (NaN is truthy), so anything
        # that is not a non-empty string is treated as "no link".
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor opening `url` in a new tab, with both parts escaped."""
        safe_url = html.escape(str(url), quote=True)
        safe_text = html.escape(str(text))
        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build the display DataFrame from the raw one.

        Missing raw columns fall back to defaults: "" for ids/links, today's
        UTC date for 'date', "No title" for 'title', and 0 for the counters.
        """
        new_rows = []
        for _, row in df.iterrows():
            date_value = row.get("date", "")
            date_display = Prettifier.create_link(
                date_value, f"https://huggingface.co/papers?date={date_value}"
            )

            new_rows.append({
                "arxiv_id": row.get("arxiv_id", ""),
                "date_display": date_display,
                "date": row.get("date", datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")),
                "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
                "title": row.get("title", "No title"),
                "github": Prettifier.get_github_link(row.get("github", "")),
                "👍": row.get("upvotes", 0),
                "💬": row.get("num_comments", 0),
                "models_count": row.get("models_count", 0),
                "datasets_count": row.get("datasets_count", 0),
                "spaces_count": row.get("spaces_count", 0),
            })

        # Passing the column list keeps the schema identical for empty input.
        return pd.DataFrame(new_rows, columns=self.REQUIRED_COLUMNS)
|
|
|
|
|
class PaperList:
    """
    Holds the raw papers DataFrame together with its prettified counterpart.
    """

    # [column name, display datatype] pairs, in display order.
    COLUMN_INFO = [
        ["arxiv_id", "str"],
        ["date_display", "markdown"],
        ["date", "str"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
        ["models_count", "number"],
        ["datasets_count", "number"],
        ["spaces_count", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        prettified = self._prettifier(df)
        # Keep only the declared columns, in the declared order.
        self.df_prettified = prettified.loc[:, self.column_names]

    @property
    def column_names(self):
        """Names from COLUMN_INFO, in order."""
        return [name for name, *_ in self.COLUMN_INFO]

    @property
    def column_datatype(self):
        """Datatypes from COLUMN_INFO, in order."""
        return [dtype for _, dtype, *_ in self.COLUMN_INFO]

    def get_prettified_df(self) -> pd.DataFrame:
        """
        Return the prettified DataFrame.
        """
        return self.df_prettified
|
|
|
|
|
|
|
|
|
class PaperManager:
    """
    Manages sorting, pagination, and repository-based sorting for the list of papers.

    The manager mutates the `paper_list` it is given: every sort rewrites
    `paper_list.df_raw` and `paper_list.df_prettified` and resets the current
    page to 1.
    """

    # Column each non-"hot" sort method orders by (descending).
    _SORT_COLUMNS = {
        "new": "date",
        "most_models": "models_count",
        "most_datasets": "datasets_count",
        "most_spaces": "spaces_count",
    }

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.sort_method = "hot"
        self.sort_papers()

    @staticmethod
    def _parse_date(date_str):
        """Parse a ``YYYY-MM-DD`` string as UTC midnight; fall back to the current UTC time."""
        try:
            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except (ValueError, TypeError):
            # TypeError: the cell was not a string at all (e.g. None or NaN).
            return datetime.datetime.now(timezone.utc)

    def calculate_score(self, row):
        """
        Calculate the score of a paper based on upvotes and age.

        This mimics the "hotness" algorithm used by platforms like Hacker News:
        ``upvotes / (age_in_hours + 2) ** 1.5``.
        """
        upvotes = row.get('upvotes', 0)
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = PaperManager._parse_date(date_str)

        time_diff = datetime.datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600

        # A date in the future can make the base non-positive; score it 0
        # rather than raising a negative number to a fractional power.
        base = time_diff_hours + 2
        return upvotes / (base ** 1.5) if base > 0 else 0

    def sort_papers(self):
        """
        Sorts the papers based on the current sort method.

        Unknown sort methods, an empty frame, or a missing sort column leave
        the order unchanged instead of raising.
        """
        df = self.paper_list.df_raw.copy()

        if self.sort_method == "hot":
            if not df.empty:
                df = df.drop(columns=['score'], errors='ignore')
                df['score'] = df.apply(self.calculate_score, axis=1)
                df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
            else:
                df_sorted = df
        else:
            column = self._SORT_COLUMNS.get(self.sort_method)
            if column is not None and column in df.columns:
                df_sorted = df.sort_values(by=column, ascending=False)
            else:
                df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[:, self.paper_list.column_names]
        # Ceiling division, with a minimum of one (possibly empty) page.
        self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
        self.current_page = 1
        logger.info(f"Papers sorted by {self.sort_method}. Total pages: {self.total_pages}")

    def set_sort_method(self, method, time_frame=None):
        """
        Sets the sort method ('hot', 'new', 'most_models', 'most_datasets', 'most_spaces') and re-sorts the papers.

        Any other value falls back to 'hot'. `time_frame` is accepted but
        not used. Always returns True.
        """
        valid_methods = ["hot", *self._SORT_COLUMNS]
        if method not in valid_methods:
            method = "hot"
        logger.info(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True

    def get_current_page_papers(self) -> str:
        """
        Retrieves the HTML string of the current page's papers.
        """
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        current_papers = self.paper_list.df_prettified.iloc[start:end]

        if current_papers.empty:
            return "<div class='no-papers'>No papers available for this page.</div>"

        # Rank is the 1-based position in the full list. Count from `start`
        # rather than reusing the row label, which is already absolute.
        papers_html = "".join(
            self.format_paper(row, rank)
            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
        )
        return f"""
        <table border="0" cellpadding="0" cellspacing="0" class="itemlist">
            {papers_html}
        </table>
        """

    def format_paper(self, row, rank):
        """
        Formats a single paper entry into HTML.

        `row` is a prettified row (anything with a dict-style ``get``) and
        `rank` is the number shown in front of the title.
        """
        # Titles and ids come from external data, so escape them before
        # interpolating into markup.
        title = html.escape(str(row.get('title', 'No title')))
        paper_id = row.get('arxiv_id', '')
        url = html.escape(f"https://huggingface.co/papers/{paper_id}", quote=True)
        upvotes = row.get('👍', 0)
        comments = row.get('💬', 0)
        models = row.get('models_count', 0)
        datasets_count = row.get('datasets_count', 0)
        spaces = row.get('spaces_count', 0)
        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
        published_time = PaperManager._parse_date(date_str)
        time_ago_days = (datetime.datetime.now(timezone.utc) - published_time).days
        if time_ago_days <= 0:
            time_ago = "today"
        elif time_ago_days == 1:
            time_ago = "1 day ago"
        else:
            time_ago = f"{time_ago_days} days ago"

        return f"""
        <tr class="athing">
            <td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
            <td valign="top" class="title">
                <a href="{url}" class="storylink" target="_blank">{title}</a>
            </td>
        </tr>
        <tr>
            <td colspan="1"></td>
            <td class="subtext">
                <span class="score">{upvotes} upvotes</span><br>
                {time_ago} | <a href="#">{comments} comments</a><br>
                Models: {models} | Datasets: {datasets_count} | Spaces: {spaces}
            </td>
        </tr>
        <tr style="height:5px"></tr>
        """

    def next_page(self) -> str:
        """
        Navigates to the next page if possible.
        """
        if self.current_page < self.total_pages:
            self.current_page += 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the last page.")
        return self.get_current_page_papers()

    def prev_page(self) -> str:
        """
        Navigates to the previous page if possible.
        """
        if self.current_page > 1:
            self.current_page -= 1
            logger.info(f"Navigated to page {self.current_page}.")
        else:
            logger.info("Already on the first page.")
        return self.get_current_page_papers()

    def refresh(self) -> str:
        """
        Refreshes the current list of papers.

        Re-sorts with the current method, which also returns to page 1.
        """
        logger.info("Refreshing papers.")
        self.sort_papers()
        return self.get_current_page_papers()
|
|
|
|
|
|
|
def initialize_paper_manager() -> str:
    """
    Build a fresh PaperList/PaperManager pair from the current DataFrame and
    return the HTML of its first page. The module-level manager is not touched.
    """
    frame = get_df()
    if frame.empty:
        logger.warning("Initialized with an empty DataFrame.")
    manager = PaperManager(PaperList(frame))
    logger.info("PaperManager initialized.")
    return manager.get_current_page_papers()
|
|
|
|
|
# Module-wide PaperManager instance; assigned by setup_paper_manager().
paper_manager = None


def setup_paper_manager():
    """
    Create the global PaperManager from a freshly loaded DataFrame.
    """
    global paper_manager
    paper_manager = PaperManager(PaperList(get_df()))
    logger.info("PaperManager setup complete.")


# Populate the global manager as soon as the module is imported.
setup_paper_manager()
|
|
|
|
|
def update_paper_manager() -> str:
    """
    Reload the data and swap it into the global PaperManager.

    When the reload yields an empty DataFrame the existing data is kept.
    Either way, the HTML of the manager's current page is returned.
    """
    global paper_manager
    logger.info("Updating PaperManager with latest data.")
    latest = get_df()
    if not latest.empty:
        paper_manager.paper_list = PaperList(latest)
        paper_manager.sort_papers()
        logger.info("PaperManager updated successfully.")
    else:
        logger.warning("DataFrame is empty. Skipping update.")
    return paper_manager.get_current_page_papers()
|
|
|
|
|
|
|
# Background job: re-run update_paper_manager at minute 0 of every hour (UTC).
scheduler_data = BackgroundScheduler()
scheduler_data.add_job(
    update_paper_manager,
    "cron",
    minute=0,
    timezone="UTC",
    misfire_grace_time=60,
)
scheduler_data.start()
logger.info("BackgroundScheduler started.")

# Shut the scheduler down when the interpreter exits.
atexit.register(scheduler_data.shutdown)
logger.info("Scheduler shutdown registered.")
|
|
|
|
|
|
|
|
|
def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
    """
    Apply the sort option chosen in the UI and return the refreshed page HTML.

    Args:
        method: Sort option as shown in the UI ("Hot", "New", "Most Models",
            "Most Datasets", "Most Spaces") or already in key form
            ("most_models", ...). Case and surrounding whitespace are ignored;
            anything unrecognized (including None) falls back to "hot".
        time_frame: Only logged; it does not affect sorting.

    Returns:
        HTML for the current page of the global paper manager.
    """
    logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
    # UI labels contain spaces ("Most Models") while the manager's keys use
    # underscores ("most_models"), so normalize before matching.
    normalized = "_".join(str(method or "").lower().split())
    if normalized not in ("hot", "new", "most_models", "most_datasets", "most_spaces"):
        normalized = "hot"
    paper_manager.set_sort_method(normalized)
    return paper_manager.get_current_page_papers()
|
|
|
|
|
|
|
|
|
# Stylesheet handed to gr.Blocks: Hacker News-like look plus a dark-mode variant.
css = """
/* Hacker News-like CSS */

body {
    background-color: white;
    font-family: Verdana, Geneva, sans-serif;
    margin: 0;
    padding: 0;
}

a {
    color: #0000ff;
    text-decoration: none;
}

a:visited {
    color: #551A8B;
}

.container {
    width: 85%;
    margin: auto;
}

table {
    width: 100%;
}

.header-table {
    width: 100%;
    background-color: #ff6600;
    padding: 2px 10px;
}

.header-table a {
    color: black;
    font-weight: bold;
    font-size: 14pt;
    text-decoration: none;
}

.itemlist .athing {
    background-color: #f6f6ef;
}

.rank {
    font-size: 14pt;
    color: #828282;
    padding-right: 5px;
}

.storylink {
    font-size: 10pt;
}

.subtext {
    font-size: 8pt;
    color: #828282;
    padding-left: 40px;
}

.subtext a {
    color: #828282;
    text-decoration: none;
}

.no-papers {
    text-align: center;
    color: #828282;
    padding: 1rem;
    font-size: 14pt;
}

@media (max-width: 640px) {
    .header-table a {
        font-size: 12pt;
    }

    .storylink {
        font-size: 9pt;
    }

    .subtext {
        font-size: 7pt;
    }
}

/* Dark mode */
@media (prefers-color-scheme: dark) {
    body {
        background-color: #121212;
        color: #e0e0e0;
    }

    a {
        color: #add8e6;
    }

    a:visited {
        color: #9370db;
    }

    .header-table {
        background-color: #ff6600;
    }

    .header-table a {
        color: black;
    }

    .itemlist .athing {
        background-color: #1e1e1e;
    }

    .rank {
        color: #b0b0b0;
    }

    .subtext {
        color: #b0b0b0;
    }

    .subtext a {
        color: #b0b0b0;
    }

    .no-papers {
        color: #b0b0b0;
    }
}
"""
|
|
|
|
|
|
|
|
|
# Gradio UI: header, sort selector, the paper list, and Prev/Next paging.
demo = gr.Blocks(css=css)

with demo:
    with gr.Column(elem_classes=["container"]):
        # Collapsed help section describing how to submit a paper.
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
            **Submit the paper to Daily Papers:**
            [https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

            Once your paper is submitted, it will automatically appear in this demo.
            """)

        # Orange title bar.
        with gr.Row():
            gr.HTML("""
            <table border="0" cellpadding="0" cellspacing="0" class="header-table">
                <tr>
                    <td>
                        <span class="pagetop">
                            <b class="hnname"><a href="#">Daily Papers</a></b>
                        </span>
                    </td>
                    <td align="right">
                        <!-- Future Navigation Links Can Be Added Here -->
                    </td>
                </tr>
            </table>
            """)

        # Sort selector; the labels are normalized by change_sort_method_ui.
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New", "Most Models", "Most Datasets", "Most Spaces"],
                value="Hot",
                label="Sort By",
                interactive=True
            )

        # Container that receives the rendered list of papers.
        paper_list = gr.HTML()

        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

        # Render the current page when the app is opened.
        demo.load(
            fn=lambda: paper_manager.get_current_page_papers(),
            outputs=[paper_list]
        )

        # Paging buttons.
        prev_button.click(paper_manager.prev_page, outputs=[paper_list])
        next_button.click(paper_manager.next_page, outputs=[paper_list])

        # Only real components may be listed as inputs; the handler's
        # `time_frame` parameter keeps its default value.
        sort_radio.change(
            fn=change_sort_method_ui,
            inputs=[sort_radio],
            outputs=[paper_list]
        )

        # Footer with links to related Spaces.
        gr.Markdown("""
        Related useful Spaces:
        - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
        - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
        - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
        """)


if __name__ == "__main__":
    demo.launch()