akhaliq HF staff committed on
Commit
3e6fd58
·
verified ·
1 Parent(s): 45195c8
Files changed (1) hide show
  1. app.py +9 -47
app.py CHANGED
@@ -18,33 +18,6 @@ from datetime import timezone # Ensure timezone is imported
18
 
19
  api = HfApi()
20
 
21
- INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
22
- INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
23
-
24
- # Removed ragatouille and abstract_retriever initialization
25
- # If INDEX_REPO_ID is not used elsewhere, consider removing related lines
26
-
27
- # Removed abstract_retriever initialization and search
28
-
29
- def update_abstract_index() -> None:
30
- """
31
- Removed abstract_retriever update functionality since ragatouille is no longer used.
32
- """
33
- pass # No operation needed
34
-
35
- # Scheduler for updating abstract index every hour
36
- # Removed scheduler_abstract as it's no longer necessary
37
- # If INDEX_REPO_ID is not used elsewhere, consider removing the download
38
-
39
- # Optionally, remove the snapshot_download if the index is not needed
40
- # api.snapshot_download(
41
- # repo_id=INDEX_REPO_ID,
42
- # repo_type="dataset",
43
- # local_dir=INDEX_DIR_PATH,
44
- # )
45
-
46
- # --- DataFrame Preparation ---
47
-
48
  def get_df() -> pd.DataFrame:
49
  # Load and merge datasets
50
  df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
@@ -58,7 +31,7 @@ def get_df() -> pd.DataFrame:
58
  df["date"] = pd.to_datetime(df["date"], errors='coerce')
59
  df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
60
 
61
- # Prepare the DataFrame by removing 'abstract' and adding 'paper_page'
62
  paper_info = []
63
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
64
  info = row.copy()
@@ -87,16 +60,13 @@ class Prettifier:
87
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
88
  new_rows = []
89
  for _, row in df.iterrows():
90
- # Handle published_at: original date
91
- published_at = row["date"] # Already formatted as "%Y-%m-%d"
92
-
93
- # Handle date link
94
  date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
95
 
96
  new_row = {
97
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
98
  "date_display": date_display, # For display
99
- "published_at": published_at, # For internal calculations
100
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
101
  "title": row["title"],
102
  "github": Prettifier.get_github_link(row.get("github", "")),
@@ -111,7 +81,7 @@ class PaperList:
111
  COLUMN_INFO = [
112
  ["arxiv_id", "str"], # Added arxiv_id
113
  ["date_display", "markdown"],# For display
114
- ["published_at", "str"], # For internal use
115
  ["paper_page", "markdown"],
116
  ["title", "str"],
117
  ["github", "markdown"],
@@ -169,9 +139,9 @@ class PaperManager:
169
  This mimics the "hotness" algorithm used by platforms like Hacker News.
170
  """
171
  upvotes = row.get('👍', 0)
172
- published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # **FIX** Changed from 'date' to 'published_at'
173
  try:
174
- published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
175
  except ValueError:
176
  # If parsing fails, use current time to minimize the impact on sorting
177
  published_time = datetime.datetime.now(timezone.utc)
@@ -194,7 +164,7 @@ class PaperManager:
194
  df['score'] = df.apply(self.calculate_score, axis=1)
195
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
196
  elif self.sort_method == "new":
197
- df_sorted = df.sort_values(by='published_at', ascending=False) # **FIX** Changed from 'date' to 'published_at'
198
  else:
199
  df_sorted = df
200
 
@@ -238,9 +208,9 @@ class PaperManager:
238
  url = f"https://huggingface.co/papers/{paper_id}"
239
  upvotes = row.get('👍', 0)
240
  comments = row.get('💬', 0)
241
- published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
242
  try:
243
- published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
244
  except ValueError:
245
  published_time = datetime.datetime.now(timezone.utc)
246
  time_diff = datetime.datetime.now(timezone.utc) - published_time
@@ -574,14 +544,6 @@ with demo:
574
  outputs=[paper_list]
575
  )
576
 
577
- # Footer
578
- gr.Markdown("""
579
- Related useful Spaces:
580
- - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
581
- - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
582
- - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
583
- """)
584
-
585
 
586
  # --- Launch the App ---
587
 
 
18
 
19
  api = HfApi()
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def get_df() -> pd.DataFrame:
22
  # Load and merge datasets
23
  df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
 
31
  df["date"] = pd.to_datetime(df["date"], errors='coerce')
32
  df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
33
 
34
+ # Prepare the DataFrame by removing 'abstract'
35
  paper_info = []
36
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
37
  info = row.copy()
 
60
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
61
  new_rows = []
62
  for _, row in df.iterrows():
63
+ # Handle date_display as a clickable link
 
 
 
64
  date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
65
 
66
  new_row = {
67
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
68
  "date_display": date_display, # For display
69
+ "date": row["date"], # For internal calculations
70
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
71
  "title": row["title"],
72
  "github": Prettifier.get_github_link(row.get("github", "")),
 
81
  COLUMN_INFO = [
82
  ["arxiv_id", "str"], # Added arxiv_id
83
  ["date_display", "markdown"],# For display
84
+ ["date", "str"], # For internal use
85
  ["paper_page", "markdown"],
86
  ["title", "str"],
87
  ["github", "markdown"],
 
139
  This mimics the "hotness" algorithm used by platforms like Hacker News.
140
  """
141
  upvotes = row.get('👍', 0)
142
+ date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
143
  try:
144
+ published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
145
  except ValueError:
146
  # If parsing fails, use current time to minimize the impact on sorting
147
  published_time = datetime.datetime.now(timezone.utc)
 
164
  df['score'] = df.apply(self.calculate_score, axis=1)
165
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
166
  elif self.sort_method == "new":
167
+ df_sorted = df.sort_values(by='date', ascending=False) # Sort by 'date' instead of 'published_at'
168
  else:
169
  df_sorted = df
170
 
 
208
  url = f"https://huggingface.co/papers/{paper_id}"
209
  upvotes = row.get('👍', 0)
210
  comments = row.get('💬', 0)
211
+ date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
212
  try:
213
+ published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
214
  except ValueError:
215
  published_time = datetime.datetime.now(timezone.utc)
216
  time_diff = datetime.datetime.now(timezone.utc) - published_time
 
544
  outputs=[paper_list]
545
  )
546
 
 
 
 
 
 
 
 
 
547
 
548
  # --- Launch the App ---
549