akhaliq HF staff committed on
Commit
4f9c2ea
·
verified ·
1 Parent(s): d695524

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +425 -77
app.py CHANGED
@@ -11,6 +11,7 @@ from ragatouille import RAGPretrainedModel
11
  import gradio as gr
12
  from gradio_calendar import Calendar
13
  import datasets
 
14
 
15
  # --- Data Loading and Processing ---
16
 
@@ -40,6 +41,7 @@ def update_abstract_index() -> None:
40
  abstract_retriever.search("LLM")
41
 
42
 
 
43
  scheduler_abstract = BackgroundScheduler()
44
  scheduler_abstract.add_job(
45
  func=update_abstract_index,
@@ -58,7 +60,7 @@ def get_df() -> pd.DataFrame:
58
  on="arxiv_id",
59
  )
60
  df = df[::-1].reset_index(drop=True)
61
- df["date"] = df["date"].dt.strftime("%Y-%m-%d")
62
 
63
  paper_info = []
64
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
@@ -164,6 +166,7 @@ class PaperList:
164
  return df_prettified
165
 
166
 
 
167
  paper_list = PaperList(get_df())
168
 
169
 
@@ -172,6 +175,7 @@ def update_paper_list() -> None:
172
  paper_list = PaperList(get_df())
173
 
174
 
 
175
  scheduler_data = BackgroundScheduler()
176
  scheduler_data.add_job(
177
  func=update_paper_list,
@@ -182,6 +186,7 @@ scheduler_data.add_job(
182
  )
183
  scheduler_data.start()
184
 
 
185
  # --- Gradio App ---
186
 
187
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
@@ -193,99 +198,442 @@ Related useful Spaces:
193
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
194
  """
195
 
196
-
197
- def update_df() -> pd.DataFrame:
198
- return paper_list.df_prettified
199
-
200
-
201
- def update_num_papers(df: pd.DataFrame) -> str:
202
- return f"{len(df)} / {len(paper_list.df_raw)}"
203
-
204
-
205
- def search(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  start_date: datetime.datetime,
207
  end_date: datetime.datetime,
208
  search_title: str,
209
  search_abstract: str,
210
  max_num_to_retrieve: int,
 
211
  ) -> pd.DataFrame:
212
- return paper_list.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
 
 
214
 
215
- with gr.Blocks(css="style.css") as demo:
216
- gr.Markdown(DESCRIPTION)
217
- with gr.Group():
218
- search_title = gr.Textbox(label="Search title")
219
  with gr.Row():
220
- with gr.Column(scale=4):
221
- search_abstract = gr.Textbox(
222
- label="Search abstract",
223
- info="The result may not be accurate as the abstract does not contain all the information.",
224
- )
225
- with gr.Column(scale=1):
226
- max_num_to_retrieve = gr.Slider(
227
- label="Max number to retrieve",
228
- info="This is used only for search on abstracts.",
229
- minimum=1,
230
- maximum=len(paper_list.df_raw),
231
- step=1,
232
- value=100,
233
- )
 
 
234
  with gr.Row():
235
- start_date = Calendar(label="Start date", type="date", value="2023-05-05")
236
- end_date = Calendar(label="End date", type="date", value=datetime.datetime.utcnow().strftime("%Y-%m-%d"))
237
-
238
- num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
239
- df = gr.Dataframe(
240
- value=paper_list.df_prettified,
241
- datatype=paper_list.column_datatype,
242
- type="pandas",
243
- interactive=False,
244
- height=1000,
245
- elem_id="table",
246
- column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
247
- wrap=True,
248
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- gr.Markdown(FOOT_NOTE)
251
-
252
- # Define the triggers and corresponding functions
253
- search_event = gr.Button("Search")
254
- search_event.click(
255
- fn=search,
256
- inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
257
- outputs=df,
258
- ).then(
259
- fn=update_num_papers,
260
- inputs=df,
261
- outputs=num_papers,
262
- queue=False,
263
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- # Automatically trigger search when inputs change
266
- for trigger in [start_date, end_date, search_title, search_abstract, max_num_to_retrieve]:
267
- trigger.change(
268
- fn=search,
269
- inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
270
- outputs=df,
271
  ).then(
272
  fn=update_num_papers,
273
- inputs=df,
274
  outputs=num_papers,
275
  queue=False,
276
  )
277
 
278
- # Load the initial dataframe and number of papers
279
- demo.load(
280
- fn=update_df,
281
- outputs=df,
282
- queue=False,
283
- ).then(
284
- fn=update_num_papers,
285
- inputs=df,
286
- outputs=num_papers,
287
- queue=False,
288
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
  if __name__ == "__main__":
291
- demo.queue(api_open=False).launch(show_api=False)
 
11
  import gradio as gr
12
  from gradio_calendar import Calendar
13
  import datasets
14
+ import requests
15
 
16
  # --- Data Loading and Processing ---
17
 
 
41
  abstract_retriever.search("LLM")
42
 
43
 
44
+ # Scheduler for updating abstract index every hour
45
  scheduler_abstract = BackgroundScheduler()
46
  scheduler_abstract.add_job(
47
  func=update_abstract_index,
 
60
  on="arxiv_id",
61
  )
62
  df = df[::-1].reset_index(drop=True)
63
+ df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
64
 
65
  paper_info = []
66
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
 
166
  return df_prettified
167
 
168
 
169
+ # Initialize PaperList
170
  paper_list = PaperList(get_df())
171
 
172
 
 
175
  paper_list = PaperList(get_df())
176
 
177
 
178
+ # Scheduler for updating paper list every hour
179
  scheduler_data = BackgroundScheduler()
180
  scheduler_data.add_job(
181
  func=update_paper_list,
 
186
  )
187
  scheduler_data.start()
188
 
189
+
190
  # --- Gradio App ---
191
 
192
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
 
198
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
199
  """
200
 
201
+ # --- Sorting and Pagination Management ---
202
+
203
class PaperManager:
    """Sort and paginate the papers held by a ``PaperList``.

    The manager keeps a 1-based ``current_page`` cursor and the active
    ``sort_method`` ("hot" or "new").  Sorting rewrites ``paper_list.df_raw``
    and ``paper_list.df_prettified`` in place, so the wrapped list always
    reflects the current ordering.
    """

    def __init__(self, paper_list: "PaperList", papers_per_page=30):
        """Wrap *paper_list* and start on page 1 with the "hot" ordering selected.

        Args:
            paper_list: Object exposing ``df_raw``, ``df_prettified``,
                ``column_names`` and ``_prettifier``.
            papers_per_page: Number of rows shown per page.
        """
        self.paper_list = paper_list
        self.papers_per_page = papers_per_page
        self.current_page = 1
        self.total_pages = self._count_pages()
        self.sort_method = "hot"  # Default sort method

    def _count_pages(self):
        """Return how many pages ``df_raw`` needs (ceil division, never below 1)."""
        num_rows = len(self.paper_list.df_raw)
        return max((num_rows + self.papers_per_page - 1) // self.papers_per_page, 1)

    def calculate_score(self, paper):
        """Return a Hacker-News-style "hotness" score for one paper.

        Args:
            paper: Mapping-like row (dict or pandas Series) with optional
                ``upvotes`` and ``date`` entries.  ``date`` is parsed as an
                ISO-8601 string; a trailing ``Z`` is accepted.

        Returns:
            ``upvotes / (age_in_hours + 2) ** 1.5``.  A missing or unparsable
            date is treated as "now" so it has minimal impact on the ordering.
        """
        upvotes = paper.get('upvotes', 0)
        now = datetime.datetime.now(datetime.timezone.utc)
        try:
            # str() lets non-string date values go through the same parser;
            # anything unparsable (including a missing date) falls back to now.
            published_time = datetime.datetime.fromisoformat(
                str(paper.get('date')).replace('Z', '+00:00')
            )
        except ValueError:
            published_time = now

        # Date-only strings parse to naive datetimes; subtracting those from an
        # aware "now" raises TypeError, so interpret them as UTC.
        if published_time.tzinfo is None:
            published_time = published_time.replace(tzinfo=datetime.timezone.utc)

        # Clamp at zero: a future date would otherwise make the base negative
        # and a negative number raised to 1.5 is complex.
        time_diff_hours = max((now - published_time).total_seconds() / 3600, 0.0)

        # The +2 offset avoids division by zero for brand-new papers.
        return upvotes / ((time_diff_hours + 2) ** 1.5)

    def sort_papers(self):
        """Re-order the wrapped paper list by ``sort_method`` and reset to page 1.

        "hot" orders by :meth:`calculate_score` (descending), "new" by the
        ``date`` column (descending); any other value keeps the current order.
        """
        df = self.paper_list.df_raw.copy()

        if df.empty:
            # Nothing to order; also avoids a row-wise apply on an empty frame.
            df_sorted = df
        elif self.sort_method == "hot":
            df['score'] = df.apply(self.calculate_score, axis=1)
            df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
        elif self.sort_method == "new":
            df_sorted = df.sort_values(by='date', ascending=False)
        else:
            df_sorted = df

        self.paper_list.df_raw = df_sorted.reset_index(drop=True)
        self.paper_list.df_prettified = self.paper_list._prettifier(self.paper_list.df_raw).loc[
            :, self.paper_list.column_names
        ]
        self.total_pages = self._count_pages()
        self.current_page = 1

    def set_sort_method(self, method):
        """Select "hot" or "new" (anything else falls back to "hot") and re-sort.

        Returns:
            Always ``True``.
        """
        if method not in ["hot", "new"]:
            method = "hot"
        print(f"Setting sort method to: {method}")
        self.sort_method = method
        self.sort_papers()
        return True  # Assume success

    def get_current_page_papers(self):
        """Return the slice of ``df_prettified`` belonging to the current page."""
        start = (self.current_page - 1) * self.papers_per_page
        end = start + self.papers_per_page
        return self.paper_list.df_prettified.iloc[start:end]

    def next_page(self):
        """Advance one page unless already on the last one; return the page shown."""
        if self.current_page < self.total_pages:
            self.current_page += 1
        return self.get_current_page_papers()

    def prev_page(self):
        """Go back one page unless already on the first one; return the page shown."""
        if self.current_page > 1:
            self.current_page -= 1
        return self.get_current_page_papers()

    def refresh(self):
        """Re-apply the current sort (which resets to page 1) and return that page."""
        self.sort_papers()
        return self.get_current_page_papers()
274
+
275
+
276
+ # Initialize PaperManager
277
+ paper_manager = PaperManager(paper_list)
278
+
279
+
280
def refresh_paper_manager():
    """Point the shared manager at the current ``paper_list`` and re-sort it.

    The existing ``paper_manager`` instance is updated in place rather than
    replaced: UI callbacks that were wired to its bound methods keep working,
    and the sort method the user last selected is preserved.

    Returns:
        The first page of the freshly sorted papers.
    """
    # paper_list is a module-level name that may be rebound elsewhere in the
    # file; pick up whatever it currently refers to.
    paper_manager.paper_list = paper_list
    # sort_papers() also recomputes total_pages and resets current_page to 1.
    paper_manager.sort_papers()
    return paper_manager.get_current_page_papers()
286
+
287
+
288
+ # --- Gradio Interface Functions ---
289
+
290
def update_num_papers(current_df: pd.DataFrame) -> str:
    """Format the counter text as "<rows in current_df> / <rows in the manager's raw frame>"."""
    shown = len(current_df)
    total = len(paper_manager.paper_list.df_raw)
    return f"{shown} / {total}"
292
+
293
+
294
def perform_search(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    search_title: str,
    search_abstract: str,
    max_num_to_retrieve: int,
    sort_method: str
) -> pd.DataFrame:
    """Run a search, store the hits in the shared manager, and return page 1.

    The requested sort method (case-insensitive) is applied first, the search
    is delegated to the manager's ``PaperList``, and the hits then replace the
    list's raw/prettified frames before being sorted again.
    """
    manager = paper_manager
    papers = manager.paper_list

    # Select the ordering (this already sorts the current frame once).
    manager.set_sort_method(sort_method.lower())

    # Delegate the actual filtering to PaperList.
    hits = papers.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)

    # NOTE(review): the hits overwrite the list's raw frame, so a later search
    # presumably runs against this narrowed data until the list is rebuilt -
    # confirm against PaperList.search.
    papers.df_raw = hits.copy()
    papers.df_prettified = papers._prettifier(hits).loc[:, papers.column_names]
    per_page = manager.papers_per_page
    manager.total_pages = max((len(hits) + per_page - 1) // per_page, 1)
    manager.current_page = 1

    # Order the hits with the selected method.
    manager.sort_papers()

    return manager.get_current_page_papers()
318
+
319
+
320
def change_sort_method(method: str) -> pd.DataFrame:
    """Switch the shared manager to *method* (case-insensitive) and return the page now shown."""
    normalized = method.lower()
    paper_manager.set_sort_method(normalized)
    page = paper_manager.get_current_page_papers()
    return page
323
+
324
+
325
def get_initial_papers() -> pd.DataFrame:
    """Return the shared manager's current page; used to populate the table on load."""
    page = paper_manager.get_current_page_papers()
    return page
327
+
328
+
329
+ # --- CSS Styling ---
330
+
331
+ css = """
332
+ /* Existing CSS remains unchanged */
333
+ body {
334
+ background-color: white;
335
+ font-family: Verdana, Geneva, sans-serif;
336
+ margin: 0;
337
+ padding: 0;
338
+ }
339
+
340
+ a {
341
+ color: #0000ff;
342
+ text-decoration: none;
343
+ }
344
+
345
+ a:visited {
346
+ color: #551A8B;
347
+ }
348
+
349
+ .container {
350
+ width: 85%;
351
+ margin: auto;
352
+ }
353
+
354
+ table {
355
+ width: 100%;
356
+ }
357
+
358
+ .header-table {
359
+ width: 100%;
360
+ background-color: #ff6600;
361
+ padding: 2px 10px;
362
+ }
363
+
364
+ .header-table a {
365
+ color: black;
366
+ font-weight: bold;
367
+ font-size: 14pt;
368
+ text-decoration: none;
369
+ }
370
+
371
+ .itemlist .athing {
372
+ background-color: #f6f6ef;
373
+ }
374
+
375
+ .rank {
376
+ font-size: 14pt;
377
+ color: #828282;
378
+ padding-right: 5px;
379
+ }
380
+
381
+ .storylink {
382
+ font-size: 10pt;
383
+ }
384
+
385
+ .subtext {
386
+ font-size: 8pt;
387
+ color: #828282;
388
+ padding-left: 40px;
389
+ }
390
+
391
+ .subtext a {
392
+ color: #828282;
393
+ text-decoration: none;
394
+ }
395
+
396
+ #refresh-button {
397
+ background: none;
398
+ border: none;
399
+ color: black;
400
+ font-weight: bold;
401
+ font-size: 14pt;
402
+ cursor: pointer;
403
+ }
404
+
405
+ .no-papers {
406
+ text-align: center;
407
+ color: #828282;
408
+ padding: 1rem;
409
+ font-size: 14pt;
410
+ }
411
+
412
+ @media (max-width: 640px) {
413
+ .header-table a {
414
+ font-size: 12pt;
415
+ }
416
+
417
+ .storylink {
418
+ font-size: 9pt;
419
+ }
420
+
421
+ .subtext {
422
+ font-size: 7pt;
423
+ }
424
+ }
425
+
426
+ /* Dark mode */
427
+ @media (prefers-color-scheme: dark) {
428
+ body {
429
+ background-color: #121212;
430
+ color: #e0e0e0;
431
+ }
432
+
433
+ a {
434
+ color: #add8e6;
435
+ }
436
+
437
+ a:visited {
438
+ color: #9370db;
439
+ }
440
+
441
+ .header-table {
442
+ background-color: #ff6600;
443
+ }
444
+
445
+ .header-table a {
446
+ color: black;
447
+ }
448
+
449
+ .itemlist .athing {
450
+ background-color: #1e1e1e;
451
+ }
452
+
453
+ .rank {
454
+ color: #b0b0b0;
455
+ }
456
+
457
+ .subtext {
458
+ color: #b0b0b0;
459
+ }
460
+
461
+ .subtext a {
462
+ color: #b0b0b0;
463
+ }
464
+
465
+ #refresh-button {
466
+ color: #e0e0e0;
467
+ }
468
+
469
+ .no-papers {
470
+ color: #b0b0b0;
471
+ }
472
+ }
473
+ """
474
+
475
+ # --- Initialize Gradio Blocks ---
476
+
477
demo = gr.Blocks(css=css)

with demo:
    with gr.Column(elem_classes=["container"]):
        # Accordion for Submission Instructions
        with gr.Accordion("How to Submit a Paper", open=False):
            gr.Markdown("""
            **Submit the paper to Daily Papers:**
            [https://huggingface.co/papers/submit](https://huggingface.co/papers/submit)

            Once your paper is submitted, it will automatically appear in this demo.
            """)

        # Header with Refresh Button.
        # The button is a real Gradio component: a <script> placed inside
        # gr.HTML is not executed, so a plain HTML button cannot be wired to a
        # Python callback that way.  It keeps the "refresh-button" id so the
        # existing CSS rule still targets it.
        with gr.Row():
            gr.HTML("""
            <table border="0" cellpadding="0" cellspacing="0" class="header-table">
                <tr>
                    <td>
                        <span class="pagetop">
                            <b class="hnname"><a href="#">Daily Papers</a></b>
                        </span>
                    </td>
                </tr>
            </table>
            """)
            refresh_button = gr.Button("Refresh", elem_id="refresh-button", scale=0)

        # Sorting Options
        with gr.Row():
            sort_radio = gr.Radio(
                choices=["Hot", "New"],
                value="Hot",
                label="Sort By",
                interactive=True,
            )

        # Search and Filter Inputs
        with gr.Group():
            search_title = gr.Textbox(label="Search Title")
            with gr.Row():
                with gr.Column(scale=4):
                    search_abstract = gr.Textbox(
                        label="Search Abstract",
                        info="The result may not be accurate as the abstract does not contain all the information.",
                    )
                with gr.Column(scale=1):
                    max_num_to_retrieve = gr.Slider(
                        label="Max Number to Retrieve",
                        info="This is used only for search on abstracts.",
                        minimum=1,
                        maximum=1000,  # Adjust as needed
                        step=1,
                        value=100,
                    )
            with gr.Row():
                start_date = Calendar(label="Start Date", type="date", value="2023-05-05")
                # Timezone-aware replacement for the deprecated utcnow().
                end_date = Calendar(
                    label="End Date",
                    type="date",
                    value=datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
                )

        search_button = gr.Button("Search")

        # Number of Papers Display
        num_papers = gr.Textbox(
            label="Number of Papers",
            value=update_num_papers(paper_manager.get_current_page_papers()),
            interactive=False,
        )

        # Paper List Display
        df_display = gr.DataFrame(
            value=paper_manager.get_current_page_papers(),
            datatype=paper_manager.paper_list.column_datatype,
            type="pandas",
            interactive=False,
            height=600,
            elem_id="table",
            column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
            wrap=True,
        )

        # Pagination Buttons
        with gr.Row():
            prev_button = gr.Button("Prev")
            next_button = gr.Button("Next")

        # Footer
        gr.Markdown(FOOT_NOTE)

    # Event Handlers

    # Refresh Button Click: reload the table, then update the counter.
    refresh_button.click(
        fn=refresh_paper_manager,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

    # Search Button Click
    search_button.click(
        fn=perform_search,
        inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve, sort_radio],
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

    # Sort Radio Change
    sort_radio.change(
        fn=change_sort_method,
        inputs=[sort_radio],
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

    # Pagination Buttons.
    # The lambdas look up the module-level paper_manager at click time, so the
    # buttons keep working even if that name is rebound to a new instance.
    prev_button.click(
        fn=lambda: paper_manager.prev_page(),
        inputs=None,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

    next_button.click(
        fn=lambda: paper_manager.next_page(),
        inputs=None,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

    # Initial Load
    demo.load(
        fn=get_initial_papers,
        outputs=[df_display],
    ).then(
        fn=update_num_papers,
        inputs=df_display,
        outputs=num_papers,
        queue=False,
    )

# --- Launch the App ---

if __name__ == "__main__":
    demo.launch()