akhaliq (HF staff) committed
Commit ba4e64e · verified · 1 Parent(s): 3b04ee1

Update app.py

Files changed (1):
1. app.py (+21, -46)
app.py CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
 import tqdm.auto
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
-from ragatouille import RAGPretrainedModel
 
 import gradio as gr
 from gradio_calendar import Calendar
@@ -21,39 +20,30 @@ api = HfApi()
 
 INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
 INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
-api.snapshot_download(
-    repo_id=INDEX_REPO_ID,
-    repo_type="dataset",
-    local_dir=INDEX_DIR_PATH,
-)
-abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-# Initialize the retriever
-abstract_retriever.search("LLM")
+
+# Removed ragatouille and abstract_retriever initialization
+# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
+
+# Removed abstract_retriever initialization and search
 
 
-def update_abstract_index() -> None:
-    global abstract_retriever
-
-    api.snapshot_download(
-        repo_id=INDEX_REPO_ID,
-        repo_type="dataset",
-        local_dir=INDEX_DIR_PATH,
-    )
-    abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-    abstract_retriever.search("LLM")
+def update_abstract_index() -> None:
+    """
+    Removed abstract_retriever update functionality since ragatouille is no longer used.
+    """
+    pass  # No operation needed
 
 # Scheduler for updating abstract index every hour
-scheduler_abstract = BackgroundScheduler()
-scheduler_abstract.add_job(
-    func=update_abstract_index,
-    trigger="cron",
-    minute=0,  # Every hour at minute 0
-    timezone="UTC",
-    misfire_grace_time=3 * 60,
-)
-scheduler_abstract.start()
+# Removed scheduler_abstract as it's no longer necessary
+# If INDEX_REPO_ID is not used elsewhere, consider removing the download
+
+# Optionally, remove the snapshot_download if the index is not needed
+# api.snapshot_download(
+#     repo_id=INDEX_REPO_ID,
+#     repo_type="dataset",
+#     local_dir=INDEX_DIR_PATH,
+# )
 
+# --- DataFrame Preparation ---
 
 def get_df() -> pd.DataFrame:
     # Load and merge datasets
@@ -154,7 +144,6 @@ class PaperList:
         start_date: datetime.datetime,
         end_date: datetime.datetime,
         title_search_query: str,
-        abstract_search_query: str,
         max_num_to_retrieve: int,
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
@@ -168,21 +157,7 @@ class PaperList:
         if title_search_query:
             df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
 
-        # Filter by abstract using RAG
-        if abstract_search_query:
-            results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
-            remaining_ids = set(df["arxiv_id"])
-            found_id_set = set()
-            found_ids = []
-            for x in results:
-                arxiv_id = x["document_id"]
-                if arxiv_id not in remaining_ids:
-                    continue
-                if arxiv_id in found_id_set:
-                    continue
-                found_id_set.add(arxiv_id)
-                found_ids.append(arxiv_id)
-            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
+        # Removed abstract_search_query filtering since ragatouille is no longer used
 
         # Prettify the DataFrame
         df_prettified = self._prettifier(df).loc[:, self.column_names]
@@ -205,7 +180,7 @@ class PaperManager:
        This mimics the "hotness" algorithm used by platforms like Hacker News.
        """
        upvotes = row.get('👍', 0)
-       published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
+       published_at_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))  # **FIX** Changed from 'published_at' to 'date'
        try:
            published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
@@ -226,7 +201,7 @@ class PaperManager:
            df['score'] = df.apply(self.calculate_score, axis=1)
            df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
        elif self.sort_method == "new":
-           df_sorted = df.sort_values(by='published_at', ascending=False)
+           df_sorted = df.sort_values(by='date', ascending=False)  # **FIX** Changed from 'published_at' to 'date'
        else:
            df_sorted = df
 
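
The calculate_score docstring in the hunk above mentions a Hacker News-style "hotness" ranking, but the scoring formula itself sits outside the diff context. Below is a minimal sketch of that kind of score, assuming a gravity-style time decay; the function name, constants, and standalone signature are illustrative, not taken from app.py.

import datetime
from datetime import timezone

def hotness(upvotes: int, published_at_str: str, gravity: float = 1.8) -> float:
    # Hypothetical Hacker News-style score: upvotes divided by a power of the paper's age.
    # app.py reads these values from a DataFrame row, and its exact constants may differ.
    published = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    age_hours = (datetime.datetime.now(timezone.utc) - published).total_seconds() / 3600
    return upvotes / ((age_hours + 2) ** gravity)

The column-name fixes matter for exactly this computation: with row.get('published_at', ...) on rows whose column is actually named 'date', every row would fall back to today's date, so the time-decay term would be identical for all papers and the ranking would reduce to raw upvotes.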