akhaliq HF staff committed on
Commit
6790790
·
verified ·
1 Parent(s): 3e6fd58

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +158 -22
app.py CHANGED
@@ -19,9 +19,15 @@ from datetime import timezone # Ensure timezone is imported
19
  api = HfApi()
20
 
21
  def get_df() -> pd.DataFrame:
22
- # Load and merge datasets
 
 
 
 
23
  df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
24
  df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
 
 
25
  df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
26
 
27
  # Reverse the DataFrame to have the latest papers first
@@ -47,6 +53,9 @@ def get_df() -> pd.DataFrame:
47
 
48
 
49
  class Prettifier:
 
 
 
50
  @staticmethod
51
  def get_github_link(link: str) -> str:
52
  if not link:
@@ -64,24 +73,27 @@ class Prettifier:
64
  date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
65
 
66
  new_row = {
67
- "arxiv_id": row["arxiv_id"], # Include arxiv_id
68
- "date_display": date_display, # For display
69
- "date": row["date"], # For internal calculations
70
- "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
71
- "title": row["title"],
72
  "github": Prettifier.get_github_link(row.get("github", "")),
73
- "👍": row["upvotes"],
74
- "💬": row["num_comments"],
75
  }
76
  new_rows.append(new_row)
77
  return pd.DataFrame(new_rows)
78
 
79
 
80
  class PaperList:
 
 
 
81
  COLUMN_INFO = [
82
- ["arxiv_id", "str"], # Added arxiv_id
83
- ["date_display", "markdown"],# For display
84
- ["date", "str"], # For internal use
85
  ["paper_page", "markdown"],
86
  ["title", "str"],
87
  ["github", "markdown"],
@@ -107,6 +119,9 @@ class PaperList:
107
  title_search_query: str,
108
  max_num_to_retrieve: int = 1000, # Set a high default to include all if not specified
109
  ) -> pd.DataFrame:
 
 
 
110
  df = self.df_raw.copy()
111
 
112
  # Filter by title if search query is provided
@@ -125,11 +140,15 @@ class PaperList:
125
  # --- Sorting and Pagination Management ---
126
 
127
  class PaperManager:
 
 
 
128
  def __init__(self, paper_list: PaperList, papers_per_page=30):
129
  self.paper_list = paper_list
130
  self.papers_per_page = papers_per_page
131
- self.sort_method = "hot" # Default sort method
132
- self.current_search_query = "" # Initialize with no search query
 
133
  self.sort_papers()
134
  # 'current_page' and 'total_pages' are set in 'sort_papers()'
135
 
@@ -154,6 +173,9 @@ class PaperManager:
154
  return score
155
 
156
  def sort_papers(self):
 
 
 
157
  df = self.paper_list.df_raw.copy()
158
 
159
  # Apply search filter if a search query exists
@@ -164,7 +186,28 @@ class PaperManager:
164
  df['score'] = df.apply(self.calculate_score, axis=1)
165
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
166
  elif self.sort_method == "new":
167
- df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date' instead of 'published_at'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  else:
169
  df_sorted = df
170
 
@@ -173,21 +216,34 @@ class PaperManager:
173
  self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
174
  self.current_page = 1
175
 
176
- def set_sort_method(self, method):
177
- if method not in ["hot", "new"]:
 
 
 
 
178
  method = "hot"
179
  print(f"Setting sort method to: {method}")
180
  self.sort_method = method
 
 
 
181
  self.sort_papers()
182
  return True # Assume success
183
 
184
  def set_search_query(self, query: str):
 
 
 
185
  print(f"Setting search query to: {query}")
186
  self.current_search_query = query
187
  self.sort_papers()
188
  return True # Assume success
189
 
190
  def get_current_page_papers(self) -> str:
 
 
 
191
  start = (self.current_page - 1) * self.papers_per_page
192
  end = start + self.papers_per_page
193
  current_papers = self.paper_list.df_prettified.iloc[start:end]
@@ -203,6 +259,9 @@ class PaperManager:
203
  """
204
 
205
  def format_paper(self, row, rank):
 
 
 
206
  title = row.get('title', 'No title')
207
  paper_id = row.get('arxiv_id', '')
208
  url = f"https://huggingface.co/papers/{paper_id}"
@@ -235,22 +294,34 @@ class PaperManager:
235
  """
236
 
237
  def next_page(self) -> str:
 
 
 
238
  if self.current_page < self.total_pages:
239
  self.current_page += 1
240
  return self.get_current_page_papers()
241
 
242
  def prev_page(self) -> str:
 
 
 
243
  if self.current_page > 1:
244
  self.current_page -= 1
245
  return self.get_current_page_papers()
246
 
247
  def refresh(self) -> str:
 
 
 
248
  self.sort_papers()
249
  return self.get_current_page_papers()
250
 
251
 
252
  # Initialize PaperList and PaperManager
253
  def initialize_paper_manager() -> str:
 
 
 
254
  df = get_df()
255
  paper_list = PaperList(df)
256
  manager = PaperManager(paper_list)
@@ -261,6 +332,9 @@ paper_manager = None # Initialize globally
261
 
262
 
263
  def setup_paper_manager():
 
 
 
264
  global paper_manager
265
  df = get_df()
266
  paper_list = PaperList(df)
@@ -272,6 +346,9 @@ setup_paper_manager()
272
 
273
 
274
  def update_paper_manager() -> str:
 
 
 
275
  global paper_manager
276
  df = get_df()
277
  paper_manager.paper_list = PaperList(df)
@@ -293,20 +370,40 @@ scheduler_data.start()
293
 
294
  # --- Gradio Interface Functions ---
295
 
296
- def change_sort_method_ui(method: str) -> str:
297
- paper_manager.set_sort_method(method.lower())
 
 
 
 
 
 
298
  return paper_manager.get_current_page_papers()
299
 
300
 
301
  def refresh_papers_ui() -> str:
 
 
 
302
  return paper_manager.refresh()
303
 
304
 
305
  def search_papers_ui(query: str) -> str:
 
 
 
306
  paper_manager.set_search_query(query)
307
  return paper_manager.get_current_page_papers()
308
 
309
 
 
 
 
 
 
 
 
 
310
  # --- CSS Styling ---
311
 
312
  css = """
@@ -453,7 +550,6 @@ table {
453
  }
454
  """
455
 
456
-
457
  # --- Initialize Gradio Blocks ---
458
 
459
  demo = gr.Blocks(css=css)
@@ -484,7 +580,7 @@ with demo:
484
  </tr>
485
  </table>
486
  """)
487
- # Search Bar and Sort Options
488
  with gr.Row():
489
  search_box = gr.Textbox(
490
  label="Search Papers by Title",
@@ -493,13 +589,22 @@ with demo:
493
  interactive=True
494
  )
495
  search_button = gr.Button("Search")
 
 
496
  with gr.Row():
497
  sort_radio = gr.Radio(
498
- choices=["Hot", "New"],
499
  value="Hot",
500
  label="Sort By",
501
  interactive=True
502
  )
 
 
 
 
 
 
 
503
  # Paper list
504
  paper_list = gr.HTML()
505
  # Navigation Buttons
@@ -532,8 +637,24 @@ with demo:
532
 
533
  # Sort option change
534
  sort_radio.change(
535
- fn=change_sort_method_ui,
536
  inputs=[sort_radio],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
  outputs=[paper_list]
538
  )
539
 
@@ -544,6 +665,21 @@ with demo:
544
  outputs=[paper_list]
545
  )
546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
  # --- Launch the App ---
549
 
 
19
  api = HfApi()
20
 
21
  def get_df() -> pd.DataFrame:
22
+ """
23
+ Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
24
+ and adds a 'paper_page' link for each paper.
25
+ """
26
+ # Load datasets
27
  df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
28
  df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
29
+
30
+ # Merge datasets on 'arxiv_id'
31
  df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
32
 
33
  # Reverse the DataFrame to have the latest papers first
 
53
 
54
 
55
  class Prettifier:
56
+ """
57
+ Converts raw DataFrame rows into a prettified format suitable for display.
58
+ """
59
  @staticmethod
60
  def get_github_link(link: str) -> str:
61
  if not link:
 
73
  date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
74
 
75
  new_row = {
76
+ "arxiv_id": row.get("arxiv_id", ""), # Include arxiv_id
77
+ "date_display": date_display, # For display
78
+ "date": row.get("date", datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")), # For internal calculations
79
+ "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
80
+ "title": row.get("title", "No title"),
81
  "github": Prettifier.get_github_link(row.get("github", "")),
82
+ "👍": row.get("upvotes", 0),
83
+ "💬": row.get("num_comments", 0),
84
  }
85
  new_rows.append(new_row)
86
  return pd.DataFrame(new_rows)
87
 
88
 
89
  class PaperList:
90
+ """
91
+ Manages the list of papers, including search functionality.
92
+ """
93
  COLUMN_INFO = [
94
+ ["arxiv_id", "str"], # Added arxiv_id
95
+ ["date_display", "markdown"], # For display
96
+ ["date", "str"], # For internal use
97
  ["paper_page", "markdown"],
98
  ["title", "str"],
99
  ["github", "markdown"],
 
119
  title_search_query: str,
120
  max_num_to_retrieve: int = 1000, # Set a high default to include all if not specified
121
  ) -> pd.DataFrame:
122
+ """
123
+ Filters the DataFrame based on the title search query and limits the number of results.
124
+ """
125
  df = self.df_raw.copy()
126
 
127
  # Filter by title if search query is provided
 
140
  # --- Sorting and Pagination Management ---
141
 
142
  class PaperManager:
143
+ """
144
+ Manages sorting, pagination, and search queries for the list of papers.
145
+ """
146
  def __init__(self, paper_list: PaperList, papers_per_page=30):
147
  self.paper_list = paper_list
148
  self.papers_per_page = papers_per_page
149
+ self.sort_method = "hot" # Default sort method
150
+ self.current_search_query = "" # Initialize with no search query
151
+ self.top_time_frame = "all time" # Default time frame for "Top" sorting
152
  self.sort_papers()
153
  # 'current_page' and 'total_pages' are set in 'sort_papers()'
154
 
 
173
  return score
174
 
175
  def sort_papers(self):
176
+ """
177
+ Sorts the papers based on the current sort method and search query.
178
+ """
179
  df = self.paper_list.df_raw.copy()
180
 
181
  # Apply search filter if a search query exists
 
186
  df['score'] = df.apply(self.calculate_score, axis=1)
187
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
188
  elif self.sort_method == "new":
189
+ df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date'
190
+ elif self.sort_method == "top":
191
+ # Filter based on the selected time frame
192
+ now = datetime.datetime.now(timezone.utc)
193
+ if self.top_time_frame == "day":
194
+ time_threshold = now - datetime.timedelta(days=1)
195
+ elif self.top_time_frame == "week":
196
+ time_threshold = now - datetime.timedelta(weeks=1)
197
+ elif self.top_time_frame == "month":
198
+ time_threshold = now - datetime.timedelta(days=30)
199
+ elif self.top_time_frame == "year":
200
+ time_threshold = now - datetime.timedelta(days=365)
201
+ elif self.top_time_frame == "all time":
202
+ time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
203
+ else:
204
+ time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
205
+
206
+ # Convert 'date' column to datetime
207
+ df_sorted = df.copy()
208
+ df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc)
209
+ df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
210
+ df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
211
  else:
212
  df_sorted = df
213
 
 
216
  self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
217
  self.current_page = 1
218
 
219
def set_sort_method(self, method, time_frame=None):
    """
    Select the active ordering and rebuild the sorted paper list.

    Args:
        method: 'hot', 'new' or 'top'; any other value falls back to 'hot'.
        time_frame: Window for the 'top' ordering. It is stored lower-cased,
            and only when the resolved method is 'top' and the value is truthy.

    Returns:
        Always True; there is no failure path in this method.
    """
    known_methods = ("hot", "new", "top")
    method = method if method in known_methods else "hot"
    print(f"Setting sort method to: {method}")
    self.sort_method = method

    updates_time_frame = method == "top" and bool(time_frame)
    if updates_time_frame:
        self.top_time_frame = time_frame.lower()
        print(f"Setting top time frame to: {self.top_time_frame}")

    self.sort_papers()
    return True
233
 
234
  def set_search_query(self, query: str):
235
+ """
236
+ Sets the current search query and re-sorts the papers.
237
+ """
238
  print(f"Setting search query to: {query}")
239
  self.current_search_query = query
240
  self.sort_papers()
241
  return True # Assume success
242
 
243
  def get_current_page_papers(self) -> str:
244
+ """
245
+ Retrieves the HTML string of the current page's papers.
246
+ """
247
  start = (self.current_page - 1) * self.papers_per_page
248
  end = start + self.papers_per_page
249
  current_papers = self.paper_list.df_prettified.iloc[start:end]
 
259
  """
260
 
261
  def format_paper(self, row, rank):
262
+ """
263
+ Formats a single paper entry into HTML.
264
+ """
265
  title = row.get('title', 'No title')
266
  paper_id = row.get('arxiv_id', '')
267
  url = f"https://huggingface.co/papers/{paper_id}"
 
294
  """
295
 
296
  def next_page(self) -> str:
297
+ """
298
+ Navigates to the next page if possible.
299
+ """
300
  if self.current_page < self.total_pages:
301
  self.current_page += 1
302
  return self.get_current_page_papers()
303
 
304
  def prev_page(self) -> str:
305
+ """
306
+ Navigates to the previous page if possible.
307
+ """
308
  if self.current_page > 1:
309
  self.current_page -= 1
310
  return self.get_current_page_papers()
311
 
312
  def refresh(self) -> str:
313
+ """
314
+ Refreshes the current list of papers.
315
+ """
316
  self.sort_papers()
317
  return self.get_current_page_papers()
318
 
319
 
320
  # Initialize PaperList and PaperManager
321
  def initialize_paper_manager() -> str:
322
+ """
323
+ Initializes the PaperList and PaperManager with the current DataFrame.
324
+ """
325
  df = get_df()
326
  paper_list = PaperList(df)
327
  manager = PaperManager(paper_list)
 
332
 
333
 
334
  def setup_paper_manager():
335
+ """
336
+ Sets up the global PaperManager instance.
337
+ """
338
  global paper_manager
339
  df = get_df()
340
  paper_list = PaperList(df)
 
346
 
347
 
348
  def update_paper_manager() -> str:
349
+ """
350
+ Updates the global PaperManager with the latest DataFrame.
351
+ """
352
  global paper_manager
353
  df = get_df()
354
  paper_manager.paper_list = PaperList(df)
 
370
 
371
  # --- Gradio Interface Functions ---
372
 
373
def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
    """
    Apply the sort option chosen in the UI and return the re-rendered paper list.

    Args:
        method: Sort option label; it is lower-cased before being handed to the manager.
        time_frame: Forwarded to the manager only when the lower-cased method is 'top'.

    Returns:
        The result of paper_manager.get_current_page_papers().
    """
    sort_key = method.lower()
    # Only 'top' takes a time frame; every other option uses the one-argument call.
    call_args = (sort_key, time_frame) if sort_key == "top" else (sort_key,)
    paper_manager.set_sort_method(*call_args)
    return paper_manager.get_current_page_papers()
382
 
383
 
384
  def refresh_papers_ui() -> str:
385
+ """
386
+ Refreshes the paper list.
387
+ """
388
  return paper_manager.refresh()
389
 
390
 
391
  def search_papers_ui(query: str) -> str:
392
+ """
393
+ Searches for papers based on the title search query.
394
+ """
395
  paper_manager.set_search_query(query)
396
  return paper_manager.get_current_page_papers()
397
 
398
 
399
def clear_search_ui() -> str:
    """
    Reset the global paper manager's search query to the empty string and
    return the re-rendered paper list.
    """
    no_query = ""
    paper_manager.set_search_query(no_query)
    rendered_view = paper_manager.get_current_page_papers()
    return rendered_view
405
+
406
+
407
  # --- CSS Styling ---
408
 
409
  css = """
 
550
  }
551
  """
552
 
 
553
  # --- Initialize Gradio Blocks ---
554
 
555
  demo = gr.Blocks(css=css)
 
580
  </tr>
581
  </table>
582
  """)
583
+ # Search Bar and Clear Search Button
584
  with gr.Row():
585
  search_box = gr.Textbox(
586
  label="Search Papers by Title",
 
589
  interactive=True
590
  )
591
  search_button = gr.Button("Search")
592
+ clear_search_button = gr.Button("Clear Search")
593
+ # Sort Options and Time Frame (conditionally visible)
594
  with gr.Row():
595
  sort_radio = gr.Radio(
596
+ choices=["Hot", "New", "Top"],
597
  value="Hot",
598
  label="Sort By",
599
  interactive=True
600
  )
601
+ time_frame_dropdown = gr.Dropdown(
602
+ choices=["day", "week", "month", "year", "all time"],
603
+ value="all time",
604
+ label="Time Frame for Top",
605
+ visible=False,
606
+ interactive=True
607
+ )
608
  # Paper list
609
  paper_list = gr.HTML()
610
  # Navigation Buttons
 
637
 
638
  # Sort option change
639
  sort_radio.change(
640
+ fn=lambda method: method.lower(),
641
  inputs=[sort_radio],
642
+ outputs=None,
643
+ _js="""
644
+ (method) => {
645
+ if (method === 'top') {
646
+ document.querySelector('[label="Time Frame for Top"]').style.display = 'block';
647
+ } else {
648
+ document.querySelector('[label="Time Frame for Top"]').style.display = 'none';
649
+ }
650
+ return method;
651
+ }
652
+ """
653
+ )
654
+
655
+ sort_radio.change(
656
+ fn=change_sort_method_ui,
657
+ inputs=[sort_radio, time_frame_dropdown],
658
  outputs=[paper_list]
659
  )
660
 
 
665
  outputs=[paper_list]
666
  )
667
 
668
+ # Clear search functionality
669
+ clear_search_button.click(
670
+ fn=clear_search_ui,
671
+ inputs=None,
672
+ outputs=[paper_list]
673
+ )
674
+
675
+ # Footer
676
+ gr.Markdown("""
677
+ Related useful Spaces:
678
+ - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
679
+ - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
680
+ - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
681
+ """)
682
+
683
 
684
  # --- Launch the App ---
685