akhaliq HF staff committed on
Commit
33ea647
·
verified ·
1 Parent(s): 6def03f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -37
app.py CHANGED
@@ -7,9 +7,15 @@ from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import HfApi
8
 
9
  import gradio as gr
10
- import datasets # Added import for datasets
11
 
12
- from datetime import timezone # Ensure timezone is imported
 
 
 
 
 
 
13
 
14
  # --- Data Loading and Processing ---
15
 
@@ -20,33 +26,44 @@ def get_df() -> pd.DataFrame:
20
  Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
21
  and adds a 'paper_page' link for each paper.
22
  """
23
- # Load datasets
24
- df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
25
- df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
26
-
27
- # Merge datasets on 'arxiv_id'
28
- df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))
29
-
30
- # Reverse the DataFrame to have the latest papers first
31
- df = df[::-1].reset_index(drop=True)
32
-
33
- # Ensure 'date' is in datetime format and handle missing dates
34
- df["date"] = pd.to_datetime(df["date"], errors='coerce')
35
- df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
36
-
37
- # Prepare the DataFrame by removing 'abstract'
38
- paper_info = []
39
- for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
40
- info = row.copy()
41
- if "abstract" in info:
42
- del info["abstract"]
43
- paper_info.append(info)
44
- df_prepared = pd.DataFrame(paper_info)
45
-
46
- # Add 'paper_page' links
47
- df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
48
-
49
- return df_prepared
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  class Prettifier:
@@ -192,7 +209,7 @@ class PaperManager:
192
 
193
  # Convert 'date' column to datetime
194
  df_sorted = df.copy()
195
- df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc)
196
  df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
197
  df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
198
  else:
@@ -210,11 +227,11 @@ class PaperManager:
210
  """
211
  if method not in ["hot", "new", "top"]:
212
  method = "hot"
213
- print(f"Setting sort method to: {method}")
214
  self.sort_method = method
215
  if method == "top" and time_frame:
216
  self.top_time_frame = time_frame.lower()
217
- print(f"Setting top time frame to: {self.top_time_frame}")
218
  self.sort_papers()
219
  return True # Assume success
220
 
@@ -277,6 +294,9 @@ class PaperManager:
277
  """
278
  if self.current_page < self.total_pages:
279
  self.current_page += 1
 
 
 
280
  return self.get_current_page_papers()
281
 
282
  def prev_page(self) -> str:
@@ -285,12 +305,16 @@ class PaperManager:
285
  """
286
  if self.current_page > 1:
287
  self.current_page -= 1
 
 
 
288
  return self.get_current_page_papers()
289
 
290
  def refresh(self) -> str:
291
  """
292
  Refreshes the current list of papers.
293
  """
 
294
  self.sort_papers()
295
  return self.get_current_page_papers()
296
 
@@ -317,6 +341,7 @@ def setup_paper_manager():
317
  df = get_df()
318
  paper_list = PaperList(df)
319
  paper_manager = PaperManager(paper_list)
 
320
 
321
 
322
  # Initialize PaperManager at the start
@@ -328,9 +353,14 @@ def update_paper_manager() -> str:
328
  Updates the global PaperManager with the latest DataFrame.
329
  """
330
  global paper_manager
 
331
  df = get_df()
 
 
 
332
  paper_manager.paper_list = PaperList(df)
333
  paper_manager.sort_papers()
 
334
  return paper_manager.get_current_page_papers()
335
 
336
 
@@ -344,6 +374,11 @@ scheduler_data.add_job(
344
  misfire_grace_time=60,
345
  )
346
  scheduler_data.start()
 
 
 
 
 
347
 
348
 
349
  # --- Gradio Interface Functions ---
@@ -352,6 +387,7 @@ def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
352
  """
353
  Changes the sort method and, if 'top' is selected, sets the time frame.
354
  """
 
355
  if method.lower() == "top":
356
  paper_manager.set_sort_method(method.lower(), time_frame)
357
  else:
@@ -362,7 +398,8 @@ def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
362
  # --- CSS Styling ---
363
 
364
  css = """
365
- /* Existing CSS remains unchanged */
 
366
  body {
367
  background-color: white;
368
  font-family: Verdana, Geneva, sans-serif;
@@ -492,7 +529,6 @@ table {
492
  }
493
  """
494
 
495
-
496
  # --- Initialize Gradio Blocks ---
497
 
498
  demo = gr.Blocks(css=css)
@@ -507,9 +543,22 @@ with demo:
507
 
508
  Once your paper is submitted, it will automatically appear in this demo.
509
  """)
510
- # Header without Refresh Button
511
  with gr.Row():
512
- gr.Markdown("<b>Daily Papers</b>")
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  # Sort Options and Time Frame (conditionally visible)
514
  with gr.Row():
515
  sort_radio = gr.Radio(
@@ -568,4 +617,4 @@ with demo:
568
  # --- Launch the App ---
569
 
570
  if __name__ == "__main__":
571
- demo.launch()
 
7
  from huggingface_hub import HfApi
8
 
9
  import gradio as gr
10
+ import datasets # Ensure the datasets library is imported
11
 
12
+ from datetime import timezone
13
+ import atexit # To gracefully shut down the scheduler
14
+ import logging # For logging purposes
15
+
16
+ # --- Logging Configuration ---
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
 
20
  # --- Data Loading and Processing ---
21
 
 
26
  Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
27
  and adds a 'paper_page' link for each paper.
28
  """
29
+ try:
30
+ # Load datasets
31
+ logger.info("Loading 'daily-papers' dataset.")
32
+ df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
33
+ logger.info("Loading 'daily-papers-stats' dataset.")
34
+ df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
35
+
36
+ # Merge datasets on 'arxiv_id'
37
+ logger.info("Merging datasets on 'arxiv_id'.")
38
+ df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id", suffixes=('_papers', '_stats'))
39
+
40
+ # Reverse the DataFrame to have the latest papers first
41
+ df = df[::-1].reset_index(drop=True)
42
+
43
+ # Ensure 'date' is in datetime format and handle missing dates
44
+ logger.info("Processing 'date' column.")
45
+ df["date"] = pd.to_datetime(df["date"], errors='coerce')
46
+ df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
47
+
48
+ # Prepare the DataFrame by removing 'abstract'
49
+ logger.info("Removing 'abstract' column if present.")
50
+ paper_info = []
51
+ for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
52
+ info = row.copy()
53
+ if "abstract" in info:
54
+ del info["abstract"]
55
+ paper_info.append(info)
56
+ df_prepared = pd.DataFrame(paper_info)
57
+
58
+ # Add 'paper_page' links
59
+ logger.info("Adding 'paper_page' links.")
60
+ df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
61
+
62
+ logger.info("DataFrame preparation complete.")
63
+ return df_prepared
64
+ except Exception as e:
65
+ logger.error(f"Error in get_df: {e}")
66
+ return pd.DataFrame() # Return empty DataFrame on error
67
 
68
 
69
  class Prettifier:
 
209
 
210
  # Convert 'date' column to datetime
211
  df_sorted = df.copy()
212
+ df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
213
  df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
214
  df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
215
  else:
 
227
  """
228
  if method not in ["hot", "new", "top"]:
229
  method = "hot"
230
+ logger.info(f"Setting sort method to: {method}")
231
  self.sort_method = method
232
  if method == "top" and time_frame:
233
  self.top_time_frame = time_frame.lower()
234
+ logger.info(f"Setting top time frame to: {self.top_time_frame}")
235
  self.sort_papers()
236
  return True # Assume success
237
 
 
294
  """
295
  if self.current_page < self.total_pages:
296
  self.current_page += 1
297
+ logger.info(f"Navigated to page {self.current_page}.")
298
+ else:
299
+ logger.info("Already on the last page.")
300
  return self.get_current_page_papers()
301
 
302
  def prev_page(self) -> str:
 
305
  """
306
  if self.current_page > 1:
307
  self.current_page -= 1
308
+ logger.info(f"Navigated to page {self.current_page}.")
309
+ else:
310
+ logger.info("Already on the first page.")
311
  return self.get_current_page_papers()
312
 
313
  def refresh(self) -> str:
314
  """
315
  Refreshes the current list of papers.
316
  """
317
+ logger.info("Refreshing papers.")
318
  self.sort_papers()
319
  return self.get_current_page_papers()
320
 
 
341
  df = get_df()
342
  paper_list = PaperList(df)
343
  paper_manager = PaperManager(paper_list)
344
+ logger.info("PaperManager setup complete.")
345
 
346
 
347
  # Initialize PaperManager at the start
 
353
  Updates the global PaperManager with the latest DataFrame.
354
  """
355
  global paper_manager
356
+ logger.info("Updating PaperManager with latest data.")
357
  df = get_df()
358
+ if df.empty:
359
+ logger.warning("DataFrame is empty. Skipping update.")
360
+ return paper_manager.get_current_page_papers()
361
  paper_manager.paper_list = PaperList(df)
362
  paper_manager.sort_papers()
363
+ logger.info("PaperManager updated successfully.")
364
  return paper_manager.get_current_page_papers()
365
 
366
 
 
374
  misfire_grace_time=60,
375
  )
376
  scheduler_data.start()
377
+ logger.info("BackgroundScheduler started.")
378
+
379
+ # Ensure the scheduler shuts down gracefully on exit
380
+ atexit.register(lambda: scheduler_data.shutdown())
381
+ logger.info("Scheduler shutdown registered.")
382
 
383
 
384
  # --- Gradio Interface Functions ---
 
387
  """
388
  Changes the sort method and, if 'top' is selected, sets the time frame.
389
  """
390
+ logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
391
  if method.lower() == "top":
392
  paper_manager.set_sort_method(method.lower(), time_frame)
393
  else:
 
398
  # --- CSS Styling ---
399
 
400
  css = """
401
+ /* Hacker News-like CSS */
402
+
403
  body {
404
  background-color: white;
405
  font-family: Verdana, Geneva, sans-serif;
 
529
  }
530
  """
531
 
 
532
  # --- Initialize Gradio Blocks ---
533
 
534
  demo = gr.Blocks(css=css)
 
543
 
544
  Once your paper is submitted, it will automatically appear in this demo.
545
  """)
546
+ # Hacker News-like Header
547
  with gr.Row():
548
+ gr.HTML("""
549
+ <table border="0" cellpadding="0" cellspacing="0" class="header-table">
550
+ <tr>
551
+ <td>
552
+ <span class="pagetop">
553
+ <b class="hnname"><a href="#">Daily Papers</a></b>
554
+ </span>
555
+ </td>
556
+ <td align="right">
557
+ <!-- Future Navigation Links Can Be Added Here -->
558
+ </td>
559
+ </tr>
560
+ </table>
561
+ """)
562
  # Sort Options and Time Frame (conditionally visible)
563
  with gr.Row():
564
  sort_radio = gr.Radio(
 
617
  # --- Launch the App ---
618
 
619
  if __name__ == "__main__":
620
+ demo.launch()