akhaliq HF staff commited on
Commit
3b04ee1
·
verified ·
1 Parent(s): 361761b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -34
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_calendar import Calendar
13
  import datasets
14
  import requests
15
 
16
- from datetime import timezone # Added import to fix the NameError
17
 
18
  # --- Data Loading and Processing ---
19
 
@@ -27,7 +27,7 @@ api.snapshot_download(
27
  local_dir=INDEX_DIR_PATH,
28
  )
29
  abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
30
- # Run once to initialize the retriever
31
  abstract_retriever.search("LLM")
32
 
33
 
@@ -56,18 +56,24 @@ scheduler_abstract.start()
56
 
57
 
58
  def get_df() -> pd.DataFrame:
59
- df = pd.merge(
60
- left=datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
61
- right=datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
62
- on="arxiv_id",
63
- )
 
64
  df = df[::-1].reset_index(drop=True)
65
- df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
66
-
 
 
 
 
67
  paper_info = []
68
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
69
  info = row.copy()
70
- del info["abstract"]
 
71
  info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
72
  paper_info.append(info)
73
  return pd.DataFrame(paper_info)
@@ -84,22 +90,32 @@ class Prettifier:
84
  def create_link(text: str, url: str) -> str:
85
  return f'<a href="{url}" target="_blank">{text}</a>'
86
 
87
- @staticmethod
88
- def to_div(text: str | None, category_name: str) -> str:
89
- if text is None:
90
- text = ""
91
- class_name = f"{category_name}-{text.lower()}"
92
- return f'<div class="{class_name}">{text}</div>'
93
-
94
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
95
  new_rows = []
96
  for _, row in df.iterrows():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  new_row = {
98
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
99
- "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
 
100
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
101
  "title": row["title"],
102
- "github": self.get_github_link(row.github),
 
103
  "πŸ‘": row["upvotes"],
104
  "πŸ’¬": row["num_comments"],
105
  }
@@ -109,10 +125,12 @@ class Prettifier:
109
 
110
  class PaperList:
111
  COLUMN_INFO = [
112
- ["arxiv_id", "str"], # Added arxiv_id
113
- ["date", "markdown"],
 
114
  ["paper_page", "markdown"],
115
  ["title", "str"],
 
116
  ["github", "markdown"],
117
  ["πŸ‘", "number"],
118
  ["πŸ’¬", "number"],
@@ -140,17 +158,17 @@ class PaperList:
140
  max_num_to_retrieve: int,
141
  ) -> pd.DataFrame:
142
  df = self.df_raw.copy()
143
- df["date"] = pd.to_datetime(df["date"])
144
 
145
  # Filter by date
146
  df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
147
- df["date"] = df["date"].dt.strftime("%Y-%m-%d")
148
 
149
  # Filter by title
150
  if title_search_query:
151
- df = df[df["title"].str.contains(title_search_query, case=False)]
152
 
153
- # Filter by abstract
154
  if abstract_search_query:
155
  results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
156
  remaining_ids = set(df["arxiv_id"])
@@ -166,6 +184,7 @@ class PaperList:
166
  found_ids.append(arxiv_id)
167
  df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
168
 
 
169
  df_prettified = self._prettifier(df).loc[:, self.column_names]
170
  return df_prettified
171
 
@@ -176,10 +195,9 @@ class PaperManager:
176
  def __init__(self, paper_list: PaperList, papers_per_page=30):
177
  self.paper_list = paper_list
178
  self.papers_per_page = papers_per_page
179
- self.current_page = 1
180
  self.sort_method = "hot" # Default sort method
181
  self.sort_papers()
182
- self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
183
 
184
  def calculate_score(self, row):
185
  """
@@ -187,10 +205,9 @@ class PaperManager:
187
  This mimics the "hotness" algorithm used by platforms like Hacker News.
188
  """
189
  upvotes = row.get('πŸ‘', 0)
190
- published_at_str = row.get('date', datetime.datetime.now(timezone.utc).isoformat())
191
  try:
192
- published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d")
193
- published_time = published_time.replace(tzinfo=timezone.utc)
194
  except ValueError:
195
  # If parsing fails, use current time to minimize the impact on sorting
196
  published_time = datetime.datetime.now(timezone.utc)
@@ -199,7 +216,7 @@ class PaperManager:
199
  time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
200
 
201
  # Avoid division by zero and apply the hotness formula
202
- score = upvotes / ((time_diff_hours + 2) ** 1.5)
203
  return score
204
 
205
  def sort_papers(self):
@@ -209,7 +226,7 @@ class PaperManager:
209
  df['score'] = df.apply(self.calculate_score, axis=1)
210
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
211
  elif self.sort_method == "new":
212
- df_sorted = df.sort_values(by='date', ascending=False)
213
  else:
214
  df_sorted = df
215
 
@@ -245,10 +262,10 @@ class PaperManager:
245
  title = row.get('title', 'No title')
246
  paper_id = row.get('arxiv_id', '')
247
  url = f"https://huggingface.co/papers/{paper_id}"
248
- authors = 'Unknown' # Assuming authors are not present in the current dataset
249
  upvotes = row.get('πŸ‘', 0)
250
  comments = row.get('πŸ’¬', 0)
251
- published_time_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
252
  try:
253
  published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
254
  except ValueError:
@@ -572,6 +589,7 @@ with demo:
572
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
573
  """)
574
 
 
575
  # --- Launch the App ---
576
 
577
  if __name__ == "__main__":
 
13
  import datasets
14
  import requests
15
 
16
+ from datetime import timezone # Ensure timezone is imported
17
 
18
  # --- Data Loading and Processing ---
19
 
 
27
  local_dir=INDEX_DIR_PATH,
28
  )
29
  abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
30
+ # Initialize the retriever
31
  abstract_retriever.search("LLM")
32
 
33
 
 
56
 
57
 
58
  def get_df() -> pd.DataFrame:
59
+ # Load and merge datasets
60
+ df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
61
+ df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
62
+ df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
63
+
64
+ # Reverse the DataFrame to have the latest papers first
65
  df = df[::-1].reset_index(drop=True)
66
+
67
+ # Ensure 'date' is in datetime format and handle missing dates
68
+ df["date"] = pd.to_datetime(df["date"], errors='coerce')
69
+ df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
70
+
71
+ # Prepare the DataFrame by removing 'abstract' and adding 'paper_page'
72
  paper_info = []
73
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
74
  info = row.copy()
75
+ if "abstract" in info:
76
+ del info["abstract"]
77
  info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
78
  paper_info.append(info)
79
  return pd.DataFrame(paper_info)
 
90
  def create_link(text: str, url: str) -> str:
91
  return f'<a href="{url}" target="_blank">{text}</a>'
92
 
 
 
 
 
 
 
 
93
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
94
  new_rows = []
95
  for _, row in df.iterrows():
96
+ # Handle authors: list of dicts or list of strings
97
+ if "authors" in row and isinstance(row["authors"], list):
98
+ authors = ', '.join([
99
+ author.get('name', '') if isinstance(author, dict) else str(author)
100
+ for author in row["authors"]
101
+ ])
102
+ else:
103
+ authors = 'Unknown'
104
+
105
+ # Handle published_at: original date
106
+ published_at = row["date"] # Already formatted as "%Y-%m-%d"
107
+
108
+ # Handle date link
109
+ date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
110
+
111
  new_row = {
112
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
113
+ "date_display": date_display, # For display
114
+ "published_at": published_at, # For internal calculations
115
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
116
  "title": row["title"],
117
+ "authors": authors, # Include authors
118
+ "github": Prettifier.get_github_link(row.get("github", "")),
119
  "πŸ‘": row["upvotes"],
120
  "πŸ’¬": row["num_comments"],
121
  }
 
125
 
126
  class PaperList:
127
  COLUMN_INFO = [
128
+ ["arxiv_id", "str"], # Added arxiv_id
129
+ ["date_display", "markdown"],# For display
130
+ ["published_at", "str"], # For internal use
131
  ["paper_page", "markdown"],
132
  ["title", "str"],
133
+ ["authors", "str"], # Added authors
134
  ["github", "markdown"],
135
  ["πŸ‘", "number"],
136
  ["πŸ’¬", "number"],
 
158
  max_num_to_retrieve: int,
159
  ) -> pd.DataFrame:
160
  df = self.df_raw.copy()
161
+ df["date"] = pd.to_datetime(df["date"], errors='coerce')
162
 
163
  # Filter by date
164
  df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
165
+ df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
166
 
167
  # Filter by title
168
  if title_search_query:
169
+ df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
170
 
171
+ # Filter by abstract using RAG
172
  if abstract_search_query:
173
  results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
174
  remaining_ids = set(df["arxiv_id"])
 
184
  found_ids.append(arxiv_id)
185
  df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
186
 
187
+ # Prettify the DataFrame
188
  df_prettified = self._prettifier(df).loc[:, self.column_names]
189
  return df_prettified
190
 
 
195
  def __init__(self, paper_list: PaperList, papers_per_page=30):
196
  self.paper_list = paper_list
197
  self.papers_per_page = papers_per_page
 
198
  self.sort_method = "hot" # Default sort method
199
  self.sort_papers()
200
+ # 'current_page' and 'total_pages' are set in 'sort_papers()'
201
 
202
  def calculate_score(self, row):
203
  """
 
205
  This mimics the "hotness" algorithm used by platforms like Hacker News.
206
  """
207
  upvotes = row.get('πŸ‘', 0)
208
+ published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
209
  try:
210
+ published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
 
211
  except ValueError:
212
  # If parsing fails, use current time to minimize the impact on sorting
213
  published_time = datetime.datetime.now(timezone.utc)
 
216
  time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
217
 
218
  # Avoid division by zero and apply the hotness formula
219
+ score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0
220
  return score
221
 
222
  def sort_papers(self):
 
226
  df['score'] = df.apply(self.calculate_score, axis=1)
227
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
228
  elif self.sort_method == "new":
229
+ df_sorted = df.sort_values(by='published_at', ascending=False)
230
  else:
231
  df_sorted = df
232
 
 
262
  title = row.get('title', 'No title')
263
  paper_id = row.get('arxiv_id', '')
264
  url = f"https://huggingface.co/papers/{paper_id}"
265
+ authors = row.get('authors', 'Unknown')
266
  upvotes = row.get('πŸ‘', 0)
267
  comments = row.get('πŸ’¬', 0)
268
+ published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
269
  try:
270
  published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
271
  except ValueError:
 
589
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
590
  """)
591
 
592
+
593
  # --- Launch the App ---
594
 
595
  if __name__ == "__main__":