dailypapershackernews-dev

Running

App Files Files Community

dailypapershackernews-dev / app.py

akhaliq HF Staff

Update app.py

6790790 verified 10 months ago

raw

history blame

21 kB

	#!/usr/bin/env python

	import datetime
	import operator
	import pandas as pd
	import tqdm.auto
	from apscheduler.schedulers.background import BackgroundScheduler
	from huggingface_hub import HfApi

	import gradio as gr
	from gradio_calendar import Calendar
	import datasets
	import requests

	from datetime import timezone # Ensure timezone is imported

	# --- Data Loading and Processing ---

	# Hugging Face Hub client, created once at import time.
	# NOTE(review): nothing in the visible code reads `api` — confirm it is still needed.
	api = HfApi()

	def get_df() -> pd.DataFrame:
	    """
	    Load the papers and stats datasets and return one prepared row per paper.

	    The two datasets are inner-joined on ``arxiv_id``, the row order is reversed
	    (presumably the datasets are stored oldest-first, so this puts the latest papers
	    first), ``date`` is normalized to a ``YYYY-MM-DD`` string, the ``abstract`` column
	    is dropped when present, and a ``paper_page`` URL column is added.

	    Returns:
	        The prepared DataFrame.
	    """
	    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
	    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()

	    # pd.merge defaults to an inner join: papers without a stats row are left out.
	    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")

	    # Reverse the row order and renumber the index from 0.
	    df = df[::-1].reset_index(drop=True)

	    # Unparseable or missing dates become NaT, which strftime turns into NaN;
	    # those rows are stamped with today's UTC date instead.
	    today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
	    df["date"] = pd.to_datetime(df["date"], errors="coerce")
	    df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)

	    # Drop the column in one vectorized call instead of copying every row in a Python
	    # loop; errors="ignore" keeps this a no-op when the column is absent, and the
	    # remaining columns survive even when the merge produced no rows.
	    df_prepared = df.drop(columns=["abstract"], errors="ignore")

	    df_prepared["paper_page"] = df_prepared["arxiv_id"].map("https://huggingface.co/papers/{}".format)

	    return df_prepared


	class Prettifier:
	    """
	    Converts raw DataFrame rows into a prettified format suitable for display.
	    """

	    # Column order of the frame returned by __call__, also used when the input is empty.
	    _OUTPUT_COLUMNS = [
	        "arxiv_id",
	        "date_display",
	        "date",
	        "paper_page",
	        "title",
	        "github",
	        "👍",
	        "💬",
	    ]

	    @staticmethod
	    def get_github_link(link: str) -> str:
	        """Return an HTML link labelled "github", or "" when no usable URL is given."""
	        # A missing value can arrive as None, "" or NaN. NaN is truthy, so a plain
	        # `not link` check would turn it into a link to "nan".
	        if not isinstance(link, str) or not link:
	            return ""
	        return Prettifier.create_link("github", link)

	    @staticmethod
	    def create_link(text: str, url: str) -> str:
	        """Return an ``<a>`` tag that opens ``url`` in a new tab; both parts are HTML-escaped."""
	        from html import escape  # stdlib; imported locally so the block is self-contained

	        safe_url = escape(str(url), quote=True)
	        safe_text = escape(str(text), quote=False)
	        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

	    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Build the display frame for ``df``.

	        Args:
	            df: Raw paper rows; the columns read are ``arxiv_id``, ``date``,
	                ``paper_page``, ``title``, ``github``, ``upvotes`` and ``num_comments``.
	                Each one falls back to a default when absent.

	        Returns:
	            A new DataFrame with the columns listed in ``_OUTPUT_COLUMNS``. An empty
	            input yields an empty frame that still has those columns.
	        """
	        today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
	        new_rows = []
	        for _, row in df.iterrows():
	            # Read the date once through .get so a missing column uses the same
	            # fallback for both the link and the plain value.
	            date = row.get("date", today)
	            arxiv_id = row.get("arxiv_id", "")
	            new_rows.append(
	                {
	                    "arxiv_id": arxiv_id,
	                    # Clickable date, for display
	                    "date_display": Prettifier.create_link(date, f"https://huggingface.co/papers?date={date}"),
	                    # Plain date string, for internal calculations
	                    "date": date,
	                    "paper_page": Prettifier.create_link(arxiv_id, row.get("paper_page", "#")),
	                    "title": row.get("title", "No title"),
	                    "github": Prettifier.get_github_link(row.get("github", "")),
	                    "👍": row.get("upvotes", 0),
	                    "💬": row.get("num_comments", 0),
	                }
	            )
	        return pd.DataFrame(new_rows, columns=Prettifier._OUTPUT_COLUMNS)


	class PaperList:
	    """
	    Holds the raw paper table plus its display version, and offers title search.
	    """

	    # [column name, display datatype] pairs, in display order.
	    COLUMN_INFO = [
	        ["arxiv_id", "str"],
	        ["date_display", "markdown"],  # For display
	        ["date", "str"],  # For internal use
	        ["paper_page", "markdown"],
	        ["title", "str"],
	        ["github", "markdown"],
	        ["👍", "number"],
	        ["💬", "number"],
	    ]

	    def __init__(self, df: pd.DataFrame):
	        """
	        Args:
	            df: Raw paper rows; kept as ``df_raw`` and prettified into ``df_prettified``.
	        """
	        self.df_raw = df
	        self._prettifier = Prettifier()
	        self.df_prettified = self._select_columns(self._prettifier(df))

	    @property
	    def column_names(self):
	        """Column names, in the order given by ``COLUMN_INFO``."""
	        return [name for name, _ in self.COLUMN_INFO]

	    @property
	    def column_datatype(self):
	        """Display datatypes, in the order given by ``COLUMN_INFO``."""
	        return [datatype for _, datatype in self.COLUMN_INFO]

	    def _select_columns(self, df_pretty: pd.DataFrame) -> pd.DataFrame:
	        """Return ``df_pretty`` restricted to ``column_names``, in that order."""
	        # reindex (unlike .loc[:, names]) also works when the prettifier handed back an
	        # empty frame without columns, which is what a search with no hits can produce.
	        return df_pretty.reindex(columns=self.column_names)

	    def search(
	        self,
	        title_search_query: str,
	        max_num_to_retrieve: int = 1000,  # High default so that results are effectively unlimited
	    ) -> pd.DataFrame:
	        """
	        Return the prettified rows whose title contains the query.

	        Args:
	            title_search_query: Text to look for, matched literally and
	                case-insensitively. A falsy value disables the filter.
	            max_num_to_retrieve: Maximum number of rows to return; a falsy value
	                disables the limit.

	        Returns:
	            The matching rows with the columns in ``column_names``; empty when nothing
	            matches.
	        """
	        df = self.df_raw

	        if title_search_query:
	            # regex=False: the query is user-typed text, so characters such as "+" or
	            # "(" must match themselves instead of raising re.error.
	            matches = df["title"].str.contains(title_search_query, case=False, na=False, regex=False)
	            df = df[matches]

	        if max_num_to_retrieve:
	            df = df.head(max_num_to_retrieve)

	        return self._select_columns(self._prettifier(df))


	# --- Sorting and Pagination Management ---

	class PaperManager:
	    """
	    Manages sorting, pagination, and search queries for the list of papers.

	    ``paper_list.df_raw`` is treated as the complete data set and is never overwritten.
	    Every call to ``sort_papers`` derives the filtered and sorted view from it and
	    stores the rows to display in ``paper_list.df_prettified``, so clearing a search or
	    widening the "Top" time frame brings back papers an earlier filter had hidden.
	    """

	    # Look-back window per "Top" time frame. Any other value (e.g. "all time")
	    # disables the date filter.
	    _TOP_WINDOWS = {
	        "day": datetime.timedelta(days=1),
	        "week": datetime.timedelta(weeks=1),
	        "month": datetime.timedelta(days=30),
	        "year": datetime.timedelta(days=365),
	    }

	    def __init__(self, paper_list: "PaperList", papers_per_page=30):
	        """
	        Args:
	            paper_list: Object exposing ``df_raw``, ``df_prettified``, ``column_names``
	                and ``_prettifier``.
	            papers_per_page: Number of papers rendered on each page.
	        """
	        self.paper_list = paper_list
	        self.papers_per_page = papers_per_page
	        self.sort_method = "hot"  # Default sort method
	        self.current_search_query = ""  # No search filter initially
	        self.top_time_frame = "all time"  # Default time frame for "top" sorting
	        # Also sets 'current_page' and 'total_pages'.
	        self.sort_papers()

	    @staticmethod
	    def _parse_date(date_str):
	        """Parse a ``YYYY-MM-DD`` string as midnight UTC; fall back to the current UTC time."""
	        try:
	            return datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
	        except (ValueError, TypeError):
	            # Using "now" gives a malformed date the smallest possible age.
	            return datetime.datetime.now(timezone.utc)

	    def calculate_score(self, row):
	        """
	        Return the "hotness" of a paper: upvotes damped by age, Hacker News style.

	        Args:
	            row: A raw paper row. Upvotes are read from ``upvotes`` (falling back to the
	                prettified ``👍`` column) and the publication date from ``date``.

	        Returns:
	            ``upvotes / (age_in_hours + 2) ** 1.5``, or 0 when the denominator base is
	            not positive.
	        """
	        # Raw rows carry the count under "upvotes"; "👍" exists only after prettifying.
	        upvotes = row.get("upvotes", row.get("👍", 0))
	        if pd.isna(upvotes):
	            upvotes = 0

	        now = datetime.datetime.now(timezone.utc)
	        published_time = self._parse_date(row.get("date", now.strftime("%Y-%m-%d")))
	        age_hours = (now - published_time).total_seconds() / 3600

	        damped_age = age_hours + 2
	        if damped_age <= 0:
	            # A date far enough in the future would make the power undefined.
	            return 0
	        return upvotes / damped_age ** 1.5

	    def sort_papers(self):
	        """
	        Rebuild the displayed list from the full data set.

	        Applies the current search query, then the current sort method ("hot", "new" or
	        "top" with its time frame), stores the prettified result in
	        ``paper_list.df_prettified``, recomputes ``total_pages`` and resets
	        ``current_page`` to 1.
	        """
	        df = self.paper_list.df_raw

	        if self.current_search_query:
	            # regex=False: the query is user-typed text, so "C++" or "(" must not be
	            # interpreted as a regular expression.
	            matches = df["title"].str.contains(self.current_search_query, case=False, na=False, regex=False)
	            df = df[matches]

	        if self.sort_method == "hot":
	            # Scores are built row by row into a list because DataFrame.apply(axis=1)
	            # does not return a Series when the frame is empty.
	            scores = [self.calculate_score(row) for _, row in df.iterrows()]
	            df_sorted = (
	                df.assign(score=scores)
	                .sort_values(by="score", ascending=False, kind="stable")
	                .drop(columns=["score"])
	            )
	        elif self.sort_method == "new":
	            # Dates are YYYY-MM-DD strings, so string order is chronological order.
	            df_sorted = df.sort_values(by="date", ascending=False, kind="stable")
	        elif self.sort_method == "top":
	            window = self._TOP_WINDOWS.get(self.top_time_frame)
	            if window is not None:
	                threshold = datetime.datetime.now(timezone.utc) - window
	                parsed = pd.to_datetime(df["date"], errors="coerce", utc=True)
	                # Rows whose date fails to parse compare as False and are dropped.
	                df = df[parsed >= threshold]
	            df_sorted = df.sort_values(by="upvotes", ascending=False, kind="stable")
	        else:
	            df_sorted = df

	        view = df_sorted.reset_index(drop=True)
	        # reindex (unlike .loc[:, names]) tolerates an empty, column-less result.
	        self.paper_list.df_prettified = self.paper_list._prettifier(view).reindex(
	            columns=self.paper_list.column_names
	        )
	        # Ceiling division, with at least one (possibly empty) page.
	        self.total_pages = max((len(view) + self.papers_per_page - 1) // self.papers_per_page, 1)
	        self.current_page = 1

	    def set_sort_method(self, method, time_frame=None):
	        """
	        Set the sort method and re-sort the papers.

	        Args:
	            method: "hot", "new" or "top"; anything else falls back to "hot".
	            time_frame: Time frame for "top"; ignored for other methods and when falsy.

	        Returns:
	            Always True.
	        """
	        if method not in ["hot", "new", "top"]:
	            method = "hot"
	        print(f"Setting sort method to: {method}")
	        self.sort_method = method
	        if method == "top" and time_frame:
	            self.top_time_frame = time_frame.lower()
	            print(f"Setting top time frame to: {self.top_time_frame}")
	        self.sort_papers()
	        return True

	    def set_search_query(self, query: str):
	        """
	        Set the title search query (empty to clear it) and re-sort the papers.

	        Returns:
	            Always True.
	        """
	        print(f"Setting search query to: {query}")
	        self.current_search_query = query
	        self.sort_papers()
	        return True

	    def get_current_page_papers(self) -> str:
	        """
	        Render the papers of the current page.

	        Returns:
	            An HTML table, or a "no papers" notice when the page has no rows.
	        """
	        start = (self.current_page - 1) * self.papers_per_page
	        end = start + self.papers_per_page
	        current_papers = self.paper_list.df_prettified.iloc[start:end]

	        if current_papers.empty:
	            return "<div class='no-papers'>No papers available for this page.</div>"

	        # Rank comes from the position on the page, not the index label; the label
	        # already includes the page offset and would count it twice.
	        papers_html = "".join(
	            self.format_paper(row, rank)
	            for rank, (_, row) in enumerate(current_papers.iterrows(), start=start + 1)
	        )
	        return f"""
	<table border="0" cellpadding="0" cellspacing="0" class="itemlist">
	{papers_html}
	</table>
	"""

	    def format_paper(self, row, rank):
	        """
	        Format a single prettified paper row as HTML table rows.

	        Args:
	            row: A row of ``paper_list.df_prettified``.
	            rank: 1-based position of the paper in the whole sorted list.
	        """
	        from html import escape  # stdlib; imported locally so the block is self-contained

	        # Titles and ids come from the dataset; escape them before embedding in markup.
	        title = escape(str(row.get('title', 'No title')), quote=False)
	        paper_id = row.get('arxiv_id', '')
	        url = escape(f"https://huggingface.co/papers/{paper_id}", quote=True)
	        upvotes = row.get('👍', 0)
	        comments = row.get('💬', 0)

	        now = datetime.datetime.now(timezone.utc)
	        published_time = self._parse_date(row.get('date', now.strftime("%Y-%m-%d")))
	        time_ago_days = (now - published_time).days
	        if time_ago_days <= 0:
	            time_ago = "today"
	        elif time_ago_days == 1:
	            time_ago = "1 day ago"
	        else:
	            time_ago = f"{time_ago_days} days ago"

	        return f"""
	<tr class="athing">
	<td align="right" valign="top" class="title"><span class="rank">{rank}.</span></td>
	<td valign="top" class="title">
	<a href="{url}" class="storylink" target="_blank">{title}</a>
	</td>
	</tr>
	<tr>
	<td colspan="1"></td>
	<td class="subtext">
	<span class="score">{upvotes} upvotes</span><br>
	{time_ago} | <a href="#">{comments} comments</a>
	</td>
	</tr>
	<tr style="height:5px"></tr>
	"""

	    def next_page(self) -> str:
	        """Advance one page unless already on the last one, then render the current page."""
	        if self.current_page < self.total_pages:
	            self.current_page += 1
	        return self.get_current_page_papers()

	    def prev_page(self) -> str:
	        """Go back one page unless already on the first one, then render the current page."""
	        if self.current_page > 1:
	            self.current_page -= 1
	        return self.get_current_page_papers()

	    def refresh(self) -> str:
	        """Re-sort the papers (which returns to page 1) and render the current page."""
	        self.sort_papers()
	        return self.get_current_page_papers()


	# Initialize PaperList and PaperManager
	def initialize_paper_manager() -> str:
	    """
	    Load the data, build a throwaway PaperList/PaperManager pair and render its current page.

	    Only the HTML string is returned; the manager object itself is discarded.
	    """
	    throwaway_manager = PaperManager(PaperList(get_df()))
	    rendered_page = throwaway_manager.get_current_page_papers()
	    return rendered_page


	# Module-level PaperManager shared by the UI callbacks and the scheduled refresh job.
	# Starts as None and is assigned by setup_paper_manager().
	paper_manager = None # Initialize globally


	def setup_paper_manager():
	    """
	    Load the paper data and bind a new PaperManager to the module-level ``paper_manager``.
	    """
	    global paper_manager
	    loaded_papers = PaperList(get_df())
	    paper_manager = PaperManager(loaded_papers)


	# Build the global PaperManager at import time; this loads the datasets before the
	# scheduler and the UI below are created.
	setup_paper_manager()


	def update_paper_manager() -> str:
	    """
	    Reload the data into the existing global PaperManager and render its current page.

	    The manager object is kept; only its ``paper_list`` is swapped for a freshly loaded
	    one before re-sorting.
	    """
	    # No `global` statement needed: the name is only read, its attribute is reassigned.
	    reloaded_list = PaperList(get_df())
	    paper_manager.paper_list = reloaded_list
	    paper_manager.sort_papers()
	    return paper_manager.get_current_page_papers()


	# Background scheduler that reloads the paper list at minute 0 of every hour (UTC).
	# A run that starts up to 60 seconds late is still executed.
	scheduler_data = BackgroundScheduler()
	scheduler_data.add_job(
	    update_paper_manager,
	    "cron",
	    minute=0,
	    timezone="UTC",
	    misfire_grace_time=60,
	)
	scheduler_data.start()


	# --- Gradio Interface Functions ---

	def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
	    """
	    Apply the chosen sort method to the global manager and render the current page.

	    The method name is lower-cased first; ``time_frame`` is forwarded only for "top".
	    """
	    chosen = method.lower()
	    if chosen != "top":
	        paper_manager.set_sort_method(chosen)
	    else:
	        paper_manager.set_sort_method(chosen, time_frame)
	    return paper_manager.get_current_page_papers()


	def refresh_papers_ui() -> str:
	    """
	    Ask the global manager to refresh and return the HTML it renders.
	    """
	    refreshed_html = paper_manager.refresh()
	    return refreshed_html


	def search_papers_ui(query: str) -> str:
	    """
	    Set the title search query on the global manager and render the current page.
	    """
	    manager = paper_manager
	    manager.set_search_query(query)
	    return manager.get_current_page_papers()


	def clear_search_ui() -> str:
	    """
	    Reset the search query to the empty string and render the current page.
	    """
	    no_query = ""
	    paper_manager.set_search_query(no_query)
	    page_html = paper_manager.get_current_page_papers()
	    return page_html


	# --- CSS Styling ---

	# Stylesheet passed to gr.Blocks(css=...). It styles the header table, the paper list
	# (.itemlist, .rank, .storylink, .subtext), the refresh button and the empty-page
	# notice, with overrides for narrow screens (max-width 640px) and for dark mode.
	css = """
	/* Existing CSS remains unchanged */
	body {
	background-color: white;
	font-family: Verdana, Geneva, sans-serif;
	margin: 0;
	padding: 0;
	}

	a {
	color: #0000ff;
	text-decoration: none;
	}

	a:visited {
	color: #551A8B;
	}

	.container {
	width: 85%;
	margin: auto;
	}

	table {
	width: 100%;
	}

	.header-table {
	width: 100%;
	background-color: #ff6600;
	padding: 2px 10px;
	}

	.header-table a {
	color: black;
	font-weight: bold;
	font-size: 14pt;
	text-decoration: none;
	}

	.itemlist .athing {
	background-color: #f6f6ef;
	}

	.rank {
	font-size: 14pt;
	color: #828282;
	padding-right: 5px;
	}

	.storylink {
	font-size: 10pt;
	}

	.subtext {
	font-size: 8pt;
	color: #828282;
	padding-left: 40px;
	}

	.subtext a {
	color: #828282;
	text-decoration: none;
	}

	#refresh-button {
	background: none;
	border: none;
	color: black;
	font-weight: bold;
	font-size: 14pt;
	cursor: pointer;
	}

	.no-papers {
	text-align: center;
	color: #828282;
	padding: 1rem;
	font-size: 14pt;
	}

	@media (max-width: 640px) {
	.header-table a {
	font-size: 12pt;
	}

	.storylink {
	font-size: 9pt;
	}

	.subtext {
	font-size: 7pt;
	}
	}

	/* Dark mode */
	@media (prefers-color-scheme: dark) {
	body {
	background-color: #121212;
	color: #e0e0e0;
	}

	a {
	color: #add8e6;
	}

	a:visited {
	color: #9370db;
	}

	.header-table {
	background-color: #ff6600;
	}

	.header-table a {
	color: black;
	}

	.itemlist .athing {
	background-color: #1e1e1e;
	}

	.rank {
	color: #b0b0b0;
	}

	.subtext {
	color: #b0b0b0;
	}

	.subtext a {
	color: #b0b0b0;
	}

	#refresh-button {
	color: #e0e0e0;
	}

	.no-papers {
	color: #b0b0b0;
	}
	}
	"""

	# --- Initialize Gradio Blocks ---

	# The Gradio app: layout first, then the event wiring.
	demo = gr.Blocks(css=css)

	with demo:
	    with gr.Column(elem_classes=["container"]):
	        # Accordion for Submission Instructions
	        with gr.Accordion("How to Submit a Paper", open=False):
	            gr.Markdown("""
	Submit the paper to Daily Papers:
	[https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

	Once your paper is submitted, it will automatically appear in this demo.
	""")
	        # Header with Refresh Button
	        with gr.Row():
	            gr.HTML("""
	<table border="0" cellpadding="0" cellspacing="0" class="header-table">
	<tr>
	<td>
	<span class="pagetop">
	<b class="hnname"><a href="#">Daily Papers</a></b>
	</span>
	</td>
	<td align="right">
	<button id="refresh-button">Refresh</button>
	</td>
	</tr>
	</table>
	""")
	        # Search Bar and Clear Search Button
	        with gr.Row():
	            search_box = gr.Textbox(
	                label="Search Papers by Title",
	                placeholder="Enter keywords to search...",
	                lines=1,
	                interactive=True
	            )
	            search_button = gr.Button("Search")
	            clear_search_button = gr.Button("Clear Search")
	        # Sort Options and Time Frame (the dropdown is shown only while "Top" is selected)
	        with gr.Row():
	            sort_radio = gr.Radio(
	                choices=["Hot", "New", "Top"],
	                value="Hot",
	                label="Sort By",
	                interactive=True
	            )
	            time_frame_dropdown = gr.Dropdown(
	                choices=["day", "week", "month", "year", "all time"],
	                value="all time",
	                label="Time Frame for Top",
	                visible=False,
	                interactive=True
	            )
	        # Paper list
	        paper_list = gr.HTML()
	        # Navigation Buttons
	        with gr.Row():
	            prev_button = gr.Button("Prev")
	            next_button = gr.Button("Next")

	    # Load papers on app start
	    demo.load(
	        fn=lambda: paper_manager.get_current_page_papers(),
	        outputs=[paper_list]
	    )

	    # Button clicks for pagination
	    prev_button.click(paper_manager.prev_page, outputs=[paper_list])
	    next_button.click(paper_manager.next_page, outputs=[paper_list])

	    # Refresh functionality
	    refresh_button = gr.Button("Refresh", visible=False, elem_id="refresh-hidden")
	    refresh_button.click(update_paper_manager, outputs=[paper_list])

	    # Bind the visible Refresh button to the hidden one using JavaScript.
	    # NOTE(review): depending on the Gradio version, a <script> placed in gr.HTML may
	    # not run and an invisible button may not be in the DOM — confirm this binding works.
	    gr.HTML("""
	<script>
	document.getElementById('refresh-button').addEventListener('click', function() {
	document.getElementById('refresh-hidden').click();
	});
	</script>
	""")

	    # Show the time-frame dropdown only while "Top" is selected. Visibility is driven
	    # through a Gradio output instead of custom JavaScript: the old script compared the
	    # value against 'top' although the choices are capitalized, and looked the dropdown
	    # up through a `label` attribute selector.
	    sort_radio.change(
	        fn=lambda method: gr.update(visible=method.lower() == "top"),
	        inputs=[sort_radio],
	        outputs=[time_frame_dropdown]
	    )

	    # Re-sort the list when the sort option changes
	    sort_radio.change(
	        fn=change_sort_method_ui,
	        inputs=[sort_radio, time_frame_dropdown],
	        outputs=[paper_list]
	    )

	    # Re-sort as well when a different "Top" time frame is picked
	    time_frame_dropdown.change(
	        fn=change_sort_method_ui,
	        inputs=[sort_radio, time_frame_dropdown],
	        outputs=[paper_list]
	    )

	    # Search functionality
	    search_button.click(
	        fn=search_papers_ui,
	        inputs=[search_box],
	        outputs=[paper_list]
	    )

	    # Clear search functionality
	    clear_search_button.click(
	        fn=clear_search_ui,
	        inputs=None,
	        outputs=[paper_list]
	    )

	    # Footer
	    gr.Markdown("""
	Related useful Spaces:
	- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
	- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
	- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
	""")


	# --- Launch the App ---

	# Start the Gradio server only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()