Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,6 @@ import pandas as pd
|
|
6 |
import tqdm.auto
|
7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
8 |
from huggingface_hub import HfApi
|
9 |
-
from ragatouille import RAGPretrainedModel
|
10 |
|
11 |
import gradio as gr
|
12 |
from gradio_calendar import Calendar
|
@@ -21,39 +20,30 @@ api = HfApi()
|
|
21 |
|
22 |
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
23 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
24 |
-
api.snapshot_download(
|
25 |
-
repo_id=INDEX_REPO_ID,
|
26 |
-
repo_type="dataset",
|
27 |
-
local_dir=INDEX_DIR_PATH,
|
28 |
-
)
|
29 |
-
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
|
30 |
-
# Initialize the retriever
|
31 |
-
abstract_retriever.search("LLM")
|
32 |
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
global abstract_retriever
|
36 |
-
|
37 |
-
api.snapshot_download(
|
38 |
-
repo_id=INDEX_REPO_ID,
|
39 |
-
repo_type="dataset",
|
40 |
-
local_dir=INDEX_DIR_PATH,
|
41 |
-
)
|
42 |
-
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
|
43 |
-
abstract_retriever.search("LLM")
|
44 |
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
# Scheduler for updating abstract index every hour
|
47 |
-
scheduler_abstract
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
|
|
|
57 |
|
58 |
def get_df() -> pd.DataFrame:
|
59 |
# Load and merge datasets
|
@@ -154,7 +144,6 @@ class PaperList:
|
|
154 |
start_date: datetime.datetime,
|
155 |
end_date: datetime.datetime,
|
156 |
title_search_query: str,
|
157 |
-
abstract_search_query: str,
|
158 |
max_num_to_retrieve: int,
|
159 |
) -> pd.DataFrame:
|
160 |
df = self.df_raw.copy()
|
@@ -168,21 +157,7 @@ class PaperList:
|
|
168 |
if title_search_query:
|
169 |
df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
|
170 |
|
171 |
-
#
|
172 |
-
if abstract_search_query:
|
173 |
-
results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
|
174 |
-
remaining_ids = set(df["arxiv_id"])
|
175 |
-
found_id_set = set()
|
176 |
-
found_ids = []
|
177 |
-
for x in results:
|
178 |
-
arxiv_id = x["document_id"]
|
179 |
-
if arxiv_id not in remaining_ids:
|
180 |
-
continue
|
181 |
-
if arxiv_id in found_id_set:
|
182 |
-
continue
|
183 |
-
found_id_set.add(arxiv_id)
|
184 |
-
found_ids.append(arxiv_id)
|
185 |
-
df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
|
186 |
|
187 |
# Prettify the DataFrame
|
188 |
df_prettified = self._prettifier(df).loc[:, self.column_names]
|
@@ -205,7 +180,7 @@ class PaperManager:
|
|
205 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
206 |
"""
|
207 |
upvotes = row.get('👍', 0)
|
208 |
-
published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
|
209 |
try:
|
210 |
published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
211 |
except ValueError:
|
@@ -226,7 +201,7 @@ class PaperManager:
|
|
226 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
227 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
228 |
elif self.sort_method == "new":
|
229 |
-
df_sorted = df.sort_values(by='published_at', ascending=False)
|
230 |
else:
|
231 |
df_sorted = df
|
232 |
|
|
|
6 |
import tqdm.auto
|
7 |
from apscheduler.schedulers.background import BackgroundScheduler
|
8 |
from huggingface_hub import HfApi
|
|
|
9 |
|
10 |
import gradio as gr
|
11 |
from gradio_calendar import Calendar
|
|
|
20 |
|
21 |
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
22 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
# Removed ragatouille and abstract_retriever initialization
|
25 |
+
# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
|
26 |
|
27 |
+
# Removed abstract_retriever initialization and search
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
def update_abstract_index() -> None:
|
30 |
+
"""
|
31 |
+
Removed abstract_retriever update functionality since ragatouille is no longer used.
|
32 |
+
"""
|
33 |
+
pass # No operation needed
|
34 |
|
35 |
# Scheduler for updating abstract index every hour
|
36 |
+
# Removed scheduler_abstract as it's no longer necessary
|
37 |
+
# If INDEX_REPO_ID is not used elsewhere, consider removing the download
|
38 |
+
|
39 |
+
# Optionally, remove the snapshot_download if the index is not needed
|
40 |
+
# api.snapshot_download(
|
41 |
+
# repo_id=INDEX_REPO_ID,
|
42 |
+
# repo_type="dataset",
|
43 |
+
# local_dir=INDEX_DIR_PATH,
|
44 |
+
# )
|
45 |
|
46 |
+
# --- DataFrame Preparation ---
|
47 |
|
48 |
def get_df() -> pd.DataFrame:
|
49 |
# Load and merge datasets
|
|
|
144 |
start_date: datetime.datetime,
|
145 |
end_date: datetime.datetime,
|
146 |
title_search_query: str,
|
|
|
147 |
max_num_to_retrieve: int,
|
148 |
) -> pd.DataFrame:
|
149 |
df = self.df_raw.copy()
|
|
|
157 |
if title_search_query:
|
158 |
df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
|
159 |
|
160 |
+
# Removed abstract_search_query filtering since ragatouille is no longer used
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
# Prettify the DataFrame
|
163 |
df_prettified = self._prettifier(df).loc[:, self.column_names]
|
|
|
180 |
This mimics the "hotness" algorithm used by platforms like Hacker News.
|
181 |
"""
|
182 |
upvotes = row.get('👍', 0)
|
183 |
+
published_at_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")) # **FIX** Changed from 'published_at' to 'date'
|
184 |
try:
|
185 |
published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
186 |
except ValueError:
|
|
|
201 |
df['score'] = df.apply(self.calculate_score, axis=1)
|
202 |
df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
|
203 |
elif self.sort_method == "new":
|
204 |
+
df_sorted = df.sort_values(by='date', ascending=False) # **FIX** Changed from 'published_at' to 'date'
|
205 |
else:
|
206 |
df_sorted = df
|
207 |
|