akhaliq HF staff committed on
Commit
a7e2292
·
verified ·
1 Parent(s): dea1fc4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -81
app.py CHANGED
@@ -8,7 +8,6 @@ from huggingface_hub import HfApi
8
 
9
  import gradio as gr
10
  import datasets # Ensure the datasets library is imported
11
- import requests # For making API calls
12
 
13
  from datetime import timezone
14
  import atexit # To gracefully shut down the scheduler
@@ -22,37 +21,10 @@ logger = logging.getLogger(__name__)
22
 
23
  api = HfApi()
24
 
25
- def get_repo_counts(arxiv_id: str) -> dict:
26
- """
27
- Fetches the number of models, datasets, and Spaces linked to a given arxiv_id using Hugging Face API.
28
- """
29
- url = f"https://huggingface.co/api/arxiv/{arxiv_id}/repos"
30
- try:
31
- response = requests.get(url, timeout=10)
32
- response.raise_for_status()
33
- data = response.json()
34
-
35
- models = data.get('models', [])
36
- datasets_list = data.get('datasets', [])
37
- spaces = data.get('spaces', [])
38
-
39
- return {
40
- 'models_count': len(models),
41
- 'datasets_count': len(datasets_list),
42
- 'spaces_count': len(spaces)
43
- }
44
- except requests.exceptions.RequestException as e:
45
- logger.error(f"Error fetching repo counts for {arxiv_id}: {e}")
46
- return {
47
- 'models_count': 0,
48
- 'datasets_count': 0,
49
- 'spaces_count': 0
50
- }
51
-
52
  def get_df() -> pd.DataFrame:
53
  """
54
  Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
55
- adds a 'paper_page' link for each paper, and fetches counts of models, datasets, and Spaces linked to each paper.
56
  """
57
  try:
58
  # Load datasets
@@ -80,17 +52,6 @@ def get_df() -> pd.DataFrame:
80
  info = row.copy()
81
  if "abstract" in info:
82
  del info["abstract"]
83
- # Fetch repo counts
84
- arxiv_id = info.get("arxiv_id", "")
85
- if arxiv_id:
86
- counts = get_repo_counts(arxiv_id)
87
- info.update(counts)
88
- else:
89
- info.update({
90
- 'models_count': 0,
91
- 'datasets_count': 0,
92
- 'spaces_count': 0
93
- })
94
  paper_info.append(info)
95
  df_prepared = pd.DataFrame(paper_info)
96
 
@@ -98,6 +59,11 @@ def get_df() -> pd.DataFrame:
98
  logger.info("Adding 'paper_page' links.")
99
  df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
100
 
 
 
 
 
 
101
  logger.info("DataFrame preparation complete.")
102
  return df_prepared
103
  except Exception as e:
@@ -109,11 +75,7 @@ class Prettifier:
109
  """
110
  Converts raw DataFrame rows into a prettified format suitable for display.
111
  """
112
- REQUIRED_COLUMNS = [
113
- "arxiv_id", "date_display", "date", "paper_page",
114
- "title", "github", "👍", "💬",
115
- "models_count", "datasets_count", "spaces_count"
116
- ]
117
 
118
  @staticmethod
119
  def get_github_link(link: str) -> str:
@@ -140,9 +102,6 @@ class Prettifier:
140
  "github": Prettifier.get_github_link(row.get("github", "")),
141
  "👍": row.get("upvotes", 0),
142
  "💬": row.get("num_comments", 0),
143
- "models_count": row.get("models_count", 0),
144
- "datasets_count": row.get("datasets_count", 0),
145
- "spaces_count": row.get("spaces_count", 0),
146
  }
147
  new_rows.append(new_row)
148
 
@@ -166,9 +125,6 @@ class PaperList:
166
  ["github", "markdown"],
167
  ["👍", "number"],
168
  ["💬", "number"],
169
- ["models_count", "number"],
170
- ["datasets_count", "number"],
171
- ["spaces_count", "number"],
172
  ]
173
 
174
  def __init__(self, df: pd.DataFrame):
@@ -195,12 +151,13 @@ class PaperList:
195
 
196
  class PaperManager:
197
  """
198
- Manages sorting, pagination, and repository-based sorting for the list of papers.
199
  """
200
  def __init__(self, paper_list: PaperList, papers_per_page=30):
201
  self.paper_list = paper_list
202
  self.papers_per_page = papers_per_page
203
  self.sort_method = "hot" # Default sort method
 
204
  self.sort_papers()
205
  # 'current_page' and 'total_pages' are set in 'sort_papers()'
206
 
@@ -239,12 +196,27 @@ class PaperManager:
239
  df_sorted = df
240
  elif self.sort_method == "new":
241
  df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date'
242
- elif self.sort_method == "most_models":
243
- df_sorted = df.sort_values(by='models_count', ascending=False)
244
- elif self.sort_method == "most_datasets":
245
- df_sorted = df.sort_values(by='datasets_count', ascending=False)
246
- elif self.sort_method == "most_spaces":
247
- df_sorted = df.sort_values(by='spaces_count', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  else:
249
  df_sorted = df
250
 
@@ -256,13 +228,16 @@ class PaperManager:
256
 
257
  def set_sort_method(self, method, time_frame=None):
258
  """
259
- Sets the sort method ('hot', 'new', 'most_models', 'most_datasets', 'most_spaces') and re-sorts the papers.
 
260
  """
261
- valid_methods = ["hot", "new", "most_models", "most_datasets", "most_spaces"]
262
- if method not in valid_methods:
263
  method = "hot"
264
  logger.info(f"Setting sort method to: {method}")
265
  self.sort_method = method
 
 
 
266
  self.sort_papers()
267
  return True # Assume success
268
 
@@ -293,9 +268,6 @@ class PaperManager:
293
  url = f"https://huggingface.co/papers/{paper_id}"
294
  upvotes = row.get('👍', 0)
295
  comments = row.get('💬', 0)
296
- models = row.get('models_count', 0)
297
- datasets_count = row.get('datasets_count', 0)
298
- spaces = row.get('spaces_count', 0)
299
  date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
300
  try:
301
  published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
@@ -316,8 +288,7 @@ class PaperManager:
316
  <td colspan="1"></td>
317
  <td class="subtext">
318
  <span class="score">{upvotes} upvotes</span><br>
319
- {time_ago} | <a href="#">{comments} comments</a><br>
320
- Models: {models} | Datasets: {datasets_count} | Spaces: {spaces}
321
  </td>
322
  </tr>
323
  <tr style="height:5px"></tr>
@@ -422,18 +393,13 @@ logger.info("Scheduler shutdown registered.")
422
 
423
  def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
424
  """
425
- Changes the sort method and, if applicable, sets additional parameters.
426
  """
427
  logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
428
- if method.lower() in ["most_models", "most_datasets", "most_spaces"]:
429
- paper_manager.set_sort_method(method.lower())
430
- elif method.lower() == "hot":
431
- paper_manager.set_sort_method(method.lower())
432
- elif method.lower() == "new":
433
- paper_manager.set_sort_method(method.lower())
434
  else:
435
- # Default to 'hot' if method is unrecognized
436
- paper_manager.set_sort_method("hot")
437
  return paper_manager.get_current_page_papers()
438
 
439
 
@@ -602,22 +568,28 @@ with demo:
602
  </tr>
603
  </table>
604
  """)
605
- # Sort Options (Removed "Top" and its timeframe)
606
  with gr.Row():
607
  sort_radio = gr.Radio(
608
- choices=["Hot", "New", "Most Models", "Most Datasets", "Most Spaces"],
609
  value="Hot",
610
  label="Sort By",
611
  interactive=True
612
  )
613
- # Removed time_frame_dropdown as "Top" sort is removed
 
 
 
 
 
 
614
  # Paper list
615
  paper_list = gr.HTML()
616
  # Navigation Buttons
617
  with gr.Row():
618
  prev_button = gr.Button("Prev")
619
  next_button = gr.Button("Next")
620
-
621
  # Load papers on app start
622
  demo.load(
623
  fn=lambda: paper_manager.get_current_page_papers(),
@@ -628,10 +600,17 @@ with demo:
628
  prev_button.click(paper_manager.prev_page, outputs=[paper_list])
629
  next_button.click(paper_manager.next_page, outputs=[paper_list])
630
 
631
- # Sort option change: Apply sorting method
 
 
 
 
 
 
 
632
  sort_radio.change(
633
  fn=change_sort_method_ui,
634
- inputs=[sort_radio, None], # Pass None since time_frame_dropdown is removed
635
  outputs=[paper_list]
636
  )
637
 
 
8
 
9
  import gradio as gr
10
  import datasets # Ensure the datasets library is imported
 
11
 
12
  from datetime import timezone
13
  import atexit # To gracefully shut down the scheduler
 
21
 
22
  api = HfApi()
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def get_df() -> pd.DataFrame:
25
  """
26
  Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
27
+ and adds a 'paper_page' link for each paper.
28
  """
29
  try:
30
  # Load datasets
 
52
  info = row.copy()
53
  if "abstract" in info:
54
  del info["abstract"]
 
 
 
 
 
 
 
 
 
 
 
55
  paper_info.append(info)
56
  df_prepared = pd.DataFrame(paper_info)
57
 
 
59
  logger.info("Adding 'paper_page' links.")
60
  df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
61
 
62
+ # Verify that 'date' column exists
63
+ if 'date' not in df_prepared.columns:
64
+ logger.error("'date' column is missing from the DataFrame.")
65
+ df_prepared["date"] = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
66
+
67
  logger.info("DataFrame preparation complete.")
68
  return df_prepared
69
  except Exception as e:
 
75
  """
76
  Converts raw DataFrame rows into a prettified format suitable for display.
77
  """
78
+ REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]
 
 
 
 
79
 
80
  @staticmethod
81
  def get_github_link(link: str) -> str:
 
102
  "github": Prettifier.get_github_link(row.get("github", "")),
103
  "👍": row.get("upvotes", 0),
104
  "💬": row.get("num_comments", 0),
 
 
 
105
  }
106
  new_rows.append(new_row)
107
 
 
125
  ["github", "markdown"],
126
  ["👍", "number"],
127
  ["💬", "number"],
 
 
 
128
  ]
129
 
130
  def __init__(self, df: pd.DataFrame):
 
151
 
152
  class PaperManager:
153
  """
154
+ Manages sorting and pagination for the list of papers.
155
  """
156
  def __init__(self, paper_list: PaperList, papers_per_page=30):
157
  self.paper_list = paper_list
158
  self.papers_per_page = papers_per_page
159
  self.sort_method = "hot" # Default sort method
160
+ self.top_time_frame = "all time" # Default time frame for "Top" sorting
161
  self.sort_papers()
162
  # 'current_page' and 'total_pages' are set in 'sort_papers()'
163
 
 
196
  df_sorted = df
197
  elif self.sort_method == "new":
198
  df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date'
199
+ elif self.sort_method == "top":
200
+ # Filter based on the selected time frame
201
+ now = datetime.datetime.now(timezone.utc)
202
+ if self.top_time_frame == "day":
203
+ time_threshold = now - datetime.timedelta(days=1)
204
+ elif self.top_time_frame == "week":
205
+ time_threshold = now - datetime.timedelta(weeks=1)
206
+ elif self.top_time_frame == "month":
207
+ time_threshold = now - datetime.timedelta(days=30)
208
+ elif self.top_time_frame == "year":
209
+ time_threshold = now - datetime.timedelta(days=365)
210
+ elif self.top_time_frame == "all time":
211
+ time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
212
+ else:
213
+ time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
214
+
215
+ # Convert 'date' column to datetime
216
+ df_sorted = df.copy()
217
+ df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
218
+ df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
219
+ df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
220
  else:
221
  df_sorted = df
222
 
 
228
 
229
  def set_sort_method(self, method, time_frame=None):
230
  """
231
+ Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
232
+ If 'top' is selected, also sets the time frame.
233
  """
234
+ if method not in ["hot", "new", "top"]:
 
235
  method = "hot"
236
  logger.info(f"Setting sort method to: {method}")
237
  self.sort_method = method
238
+ if method == "top" and time_frame:
239
+ self.top_time_frame = time_frame.lower()
240
+ logger.info(f"Setting top time frame to: {self.top_time_frame}")
241
  self.sort_papers()
242
  return True # Assume success
243
 
 
268
  url = f"https://huggingface.co/papers/{paper_id}"
269
  upvotes = row.get('👍', 0)
270
  comments = row.get('💬', 0)
 
 
 
271
  date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
272
  try:
273
  published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
 
288
  <td colspan="1"></td>
289
  <td class="subtext">
290
  <span class="score">{upvotes} upvotes</span><br>
291
+ {time_ago} | <a href="#">{comments} comments</a>
 
292
  </td>
293
  </tr>
294
  <tr style="height:5px"></tr>
 
393
 
394
  def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
395
  """
396
+ Changes the sort method and, if 'top' is selected, sets the time frame.
397
  """
398
  logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
399
+ if method.lower() == "top":
400
+ paper_manager.set_sort_method(method.lower(), time_frame)
 
 
 
 
401
  else:
402
+ paper_manager.set_sort_method(method.lower())
 
403
  return paper_manager.get_current_page_papers()
404
 
405
 
 
568
  </tr>
569
  </table>
570
  """)
571
+ # Sort Options and Time Frame (conditionally visible)
572
  with gr.Row():
573
  sort_radio = gr.Radio(
574
+ choices=["Hot", "New", "Top"],
575
  value="Hot",
576
  label="Sort By",
577
  interactive=True
578
  )
579
+ time_frame_dropdown = gr.Dropdown(
580
+ choices=["day", "week", "month", "year", "all time"],
581
+ value="all time",
582
+ label="Time Frame for Top",
583
+ visible=False,
584
+ interactive=True
585
+ )
586
  # Paper list
587
  paper_list = gr.HTML()
588
  # Navigation Buttons
589
  with gr.Row():
590
  prev_button = gr.Button("Prev")
591
  next_button = gr.Button("Next")
592
+
593
  # Load papers on app start
594
  demo.load(
595
  fn=lambda: paper_manager.get_current_page_papers(),
 
600
  prev_button.click(paper_manager.prev_page, outputs=[paper_list])
601
  next_button.click(paper_manager.next_page, outputs=[paper_list])
602
 
603
+ # Sort option change: Toggle visibility of time_frame_dropdown based on sort method
604
+ sort_radio.change(
605
+ fn=lambda method: gr.update(visible=True) if method.lower() == "top" else gr.update(visible=False),
606
+ inputs=[sort_radio],
607
+ outputs=[time_frame_dropdown]
608
+ )
609
+
610
+ # Sort option change: Apply sorting method with time frame if applicable
611
  sort_radio.change(
612
  fn=change_sort_method_ui,
613
+ inputs=[sort_radio, time_frame_dropdown],
614
  outputs=[paper_list]
615
  )
616