Update app.py
Browse files
app.py
CHANGED
@@ -7,9 +7,15 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
|
7 |
from huggingface_hub import HfApi
|
8 |
|
9 |
import gradio as gr
|
10 |
-
import datasets #
|
11 |
|
12 |
-
from datetime import timezone
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# --- Data Loading and Processing ---
|
15 |
|
@@ -20,33 +26,44 @@ def get_df() -> pd.DataFrame:
|
|
20 |
Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
|
21 |
and adds a 'paper_page' link for each paper.
|
22 |
"""
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
|
52 |
class Prettifier:
|
@@ -192,7 +209,7 @@ class PaperManager:
|
|
192 |
|
193 |
# Convert 'date' column to datetime
|
194 |
df_sorted = df.copy()
|
195 |
-
df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc)
|
196 |
df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
|
197 |
df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
|
198 |
else:
|
@@ -210,11 +227,11 @@ class PaperManager:
|
|
210 |
"""
|
211 |
if method not in ["hot", "new", "top"]:
|
212 |
method = "hot"
|
213 |
-
|
214 |
self.sort_method = method
|
215 |
if method == "top" and time_frame:
|
216 |
self.top_time_frame = time_frame.lower()
|
217 |
-
|
218 |
self.sort_papers()
|
219 |
return True # Assume success
|
220 |
|
@@ -277,6 +294,9 @@ class PaperManager:
|
|
277 |
"""
|
278 |
if self.current_page < self.total_pages:
|
279 |
self.current_page += 1
|
|
|
|
|
|
|
280 |
return self.get_current_page_papers()
|
281 |
|
282 |
def prev_page(self) -> str:
|
@@ -285,12 +305,16 @@ class PaperManager:
|
|
285 |
"""
|
286 |
if self.current_page > 1:
|
287 |
self.current_page -= 1
|
|
|
|
|
|
|
288 |
return self.get_current_page_papers()
|
289 |
|
290 |
def refresh(self) -> str:
|
291 |
"""
|
292 |
Refreshes the current list of papers.
|
293 |
"""
|
|
|
294 |
self.sort_papers()
|
295 |
return self.get_current_page_papers()
|
296 |
|
@@ -317,6 +341,7 @@ def setup_paper_manager():
|
|
317 |
df = get_df()
|
318 |
paper_list = PaperList(df)
|
319 |
paper_manager = PaperManager(paper_list)
|
|
|
320 |
|
321 |
|
322 |
# Initialize PaperManager at the start
|
@@ -328,9 +353,14 @@ def update_paper_manager() -> str:
|
|
328 |
Updates the global PaperManager with the latest DataFrame.
|
329 |
"""
|
330 |
global paper_manager
|
|
|
331 |
df = get_df()
|
|
|
|
|
|
|
332 |
paper_manager.paper_list = PaperList(df)
|
333 |
paper_manager.sort_papers()
|
|
|
334 |
return paper_manager.get_current_page_papers()
|
335 |
|
336 |
|
@@ -344,6 +374,11 @@ scheduler_data.add_job(
|
|
344 |
misfire_grace_time=60,
|
345 |
)
|
346 |
scheduler_data.start()
|
|
|
|
|
|
|
|
|
|
|
347 |
|
348 |
|
349 |
# --- Gradio Interface Functions ---
|
@@ -352,6 +387,7 @@ def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
|
|
352 |
"""
|
353 |
Changes the sort method and, if 'top' is selected, sets the time frame.
|
354 |
"""
|
|
|
355 |
if method.lower() == "top":
|
356 |
paper_manager.set_sort_method(method.lower(), time_frame)
|
357 |
else:
|
@@ -362,7 +398,8 @@ def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
|
|
362 |
# --- CSS Styling ---
|
363 |
|
364 |
css = """
|
365 |
-
/*
|
|
|
366 |
body {
|
367 |
background-color: white;
|
368 |
font-family: Verdana, Geneva, sans-serif;
|
@@ -492,7 +529,6 @@ table {
|
|
492 |
}
|
493 |
"""
|
494 |
|
495 |
-
|
496 |
# --- Initialize Gradio Blocks ---
|
497 |
|
498 |
demo = gr.Blocks(css=css)
|
@@ -507,9 +543,22 @@ with demo:
|
|
507 |
|
508 |
Once your paper is submitted, it will automatically appear in this demo.
|
509 |
""")
|
510 |
-
#
|
511 |
with gr.Row():
|
512 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
# Sort Options and Time Frame (conditionally visible)
|
514 |
with gr.Row():
|
515 |
sort_radio = gr.Radio(
|
@@ -568,4 +617,4 @@ with demo:
|
|
568 |
# --- Launch the App ---
|
569 |
|
570 |
if __name__ == "__main__":
|
571 |
-
demo.launch()
|
|
|
7 |
from huggingface_hub import HfApi
|
8 |
|
9 |
import gradio as gr
|
10 |
+
import datasets # Ensure the datasets library is imported
|
11 |
|
12 |
+
from datetime import timezone
|
13 |
+
import atexit # To gracefully shut down the scheduler
|
14 |
+
import logging # For logging purposes
|
15 |
+
|
16 |
+
# --- Logging Configuration ---
|
17 |
+
logging.basicConfig(level=logging.INFO)
|
18 |
+
logger = logging.getLogger(__name__)
|
19 |
|
20 |
# --- Data Loading and Processing ---
|
21 |
|
|
|
26 |
Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
|
27 |
and adds a 'paper_page' link for each paper.
|
28 |
"""
|
29 |
+
try:
|
30 |
+
# Load datasets
|
31 |
+
logger.info("Loading 'daily-papers' dataset.")
|
32 |
+
df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
|
33 |
+
logger.info("Loading 'daily-papers-stats' dataset.")
|
34 |
+
df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
|
35 |
+
|
36 |
+
# Merge datasets on 'arxiv_id'
|
37 |
+
logger.info("Merging datasets on 'arxiv_id'.")
|
38 |
+
df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))
|
39 |
+
|
40 |
+
# Reverse the DataFrame to have the latest papers first
|
41 |
+
df = df[::-1].reset_index(drop=True)
|
42 |
+
|
43 |
+
# Ensure 'date' is in datetime format and handle missing dates
|
44 |
+
logger.info("Processing 'date' column.")
|
45 |
+
df["date"] = pd.to_datetime(df["date"], errors='coerce')
|
46 |
+
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
47 |
+
|
48 |
+
# Prepare the DataFrame by removing 'abstract'
|
49 |
+
logger.info("Removing 'abstract' column if present.")
|
50 |
+
paper_info = []
|
51 |
+
for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
|
52 |
+
info = row.copy()
|
53 |
+
if "abstract" in info:
|
54 |
+
del info["abstract"]
|
55 |
+
paper_info.append(info)
|
56 |
+
df_prepared = pd.DataFrame(paper_info)
|
57 |
+
|
58 |
+
# Add 'paper_page' links
|
59 |
+
logger.info("Adding 'paper_page' links.")
|
60 |
+
df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
|
61 |
+
|
62 |
+
logger.info("DataFrame preparation complete.")
|
63 |
+
return df_prepared
|
64 |
+
except Exception as e:
|
65 |
+
logger.error(f"Error in get_df: {e}")
|
66 |
+
return pd.DataFrame() # Return empty DataFrame on error
|
67 |
|
68 |
|
69 |
class Prettifier:
|
|
|
209 |
|
210 |
# Convert 'date' column to datetime
|
211 |
df_sorted = df.copy()
|
212 |
+
df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
|
213 |
df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
|
214 |
df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
|
215 |
else:
|
|
|
227 |
"""
|
228 |
if method not in ["hot", "new", "top"]:
|
229 |
method = "hot"
|
230 |
+
logger.info(f"Setting sort method to: {method}")
|
231 |
self.sort_method = method
|
232 |
if method == "top" and time_frame:
|
233 |
self.top_time_frame = time_frame.lower()
|
234 |
+
logger.info(f"Setting top time frame to: {self.top_time_frame}")
|
235 |
self.sort_papers()
|
236 |
return True # Assume success
|
237 |
|
|
|
294 |
"""
|
295 |
if self.current_page < self.total_pages:
|
296 |
self.current_page += 1
|
297 |
+
logger.info(f"Navigated to page {self.current_page}.")
|
298 |
+
else:
|
299 |
+
logger.info("Already on the last page.")
|
300 |
return self.get_current_page_papers()
|
301 |
|
302 |
def prev_page(self) -> str:
|
|
|
305 |
"""
|
306 |
if self.current_page > 1:
|
307 |
self.current_page -= 1
|
308 |
+
logger.info(f"Navigated to page {self.current_page}.")
|
309 |
+
else:
|
310 |
+
logger.info("Already on the first page.")
|
311 |
return self.get_current_page_papers()
|
312 |
|
313 |
def refresh(self) -> str:
|
314 |
"""
|
315 |
Refreshes the current list of papers.
|
316 |
"""
|
317 |
+
logger.info("Refreshing papers.")
|
318 |
self.sort_papers()
|
319 |
return self.get_current_page_papers()
|
320 |
|
|
|
341 |
df = get_df()
|
342 |
paper_list = PaperList(df)
|
343 |
paper_manager = PaperManager(paper_list)
|
344 |
+
logger.info("PaperManager setup complete.")
|
345 |
|
346 |
|
347 |
# Initialize PaperManager at the start
|
|
|
353 |
Updates the global PaperManager with the latest DataFrame.
|
354 |
"""
|
355 |
global paper_manager
|
356 |
+
logger.info("Updating PaperManager with latest data.")
|
357 |
df = get_df()
|
358 |
+
if df.empty:
|
359 |
+
logger.warning("DataFrame is empty. Skipping update.")
|
360 |
+
return paper_manager.get_current_page_papers()
|
361 |
paper_manager.paper_list = PaperList(df)
|
362 |
paper_manager.sort_papers()
|
363 |
+
logger.info("PaperManager updated successfully.")
|
364 |
return paper_manager.get_current_page_papers()
|
365 |
|
366 |
|
|
|
374 |
misfire_grace_time=60,
|
375 |
)
|
376 |
scheduler_data.start()
|
377 |
+
logger.info("BackgroundScheduler started.")
|
378 |
+
|
379 |
+
# Ensure the scheduler shuts down gracefully on exit
|
380 |
+
atexit.register(lambda: scheduler_data.shutdown())
|
381 |
+
logger.info("Scheduler shutdown registered.")
|
382 |
|
383 |
|
384 |
# --- Gradio Interface Functions ---
|
|
|
387 |
"""
|
388 |
Changes the sort method and, if 'top' is selected, sets the time frame.
|
389 |
"""
|
390 |
+
logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
|
391 |
if method.lower() == "top":
|
392 |
paper_manager.set_sort_method(method.lower(), time_frame)
|
393 |
else:
|
|
|
398 |
# --- CSS Styling ---
|
399 |
|
400 |
css = """
|
401 |
+
/* Hacker News-like CSS */
|
402 |
+
|
403 |
body {
|
404 |
background-color: white;
|
405 |
font-family: Verdana, Geneva, sans-serif;
|
|
|
529 |
}
|
530 |
"""
|
531 |
|
|
|
532 |
# --- Initialize Gradio Blocks ---
|
533 |
|
534 |
demo = gr.Blocks(css=css)
|
|
|
543 |
|
544 |
Once your paper is submitted, it will automatically appear in this demo.
|
545 |
""")
|
546 |
+
# Hacker News-like Header
|
547 |
with gr.Row():
|
548 |
+
gr.HTML("""
|
549 |
+
<table border="0" cellpadding="0" cellspacing="0" class="header-table">
|
550 |
+
<tr>
|
551 |
+
<td>
|
552 |
+
<span class="pagetop">
|
553 |
+
<b class="hnname"><a href="#">Daily Papers</a></b>
|
554 |
+
</span>
|
555 |
+
</td>
|
556 |
+
<td align="right">
|
557 |
+
<!-- Future Navigation Links Can Be Added Here -->
|
558 |
+
</td>
|
559 |
+
</tr>
|
560 |
+
</table>
|
561 |
+
""")
|
562 |
# Sort Options and Time Frame (conditionally visible)
|
563 |
with gr.Row():
|
564 |
sort_radio = gr.Radio(
|
|
|
617 |
# --- Launch the App ---
|
618 |
|
619 |
if __name__ == "__main__":
|
620 |
+
demo.launch()
|