update
app.py CHANGED
```diff
@@ -18,33 +18,6 @@ from datetime import timezone  # Ensure timezone is imported
 
 api = HfApi()
 
-INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
-INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
-
-# Removed ragatouille and abstract_retriever initialization
-# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
-
-# Removed abstract_retriever initialization and search
-
-def update_abstract_index() -> None:
-    """
-    Removed abstract_retriever update functionality since ragatouille is no longer used.
-    """
-    pass  # No operation needed
-
-# Scheduler for updating abstract index every hour
-# Removed scheduler_abstract as it's no longer necessary
-# If INDEX_REPO_ID is not used elsewhere, consider removing the download
-
-# Optionally, remove the snapshot_download if the index is not needed
-# api.snapshot_download(
-#     repo_id=INDEX_REPO_ID,
-#     repo_type="dataset",
-#     local_dir=INDEX_DIR_PATH,
-# )
-
-# --- DataFrame Preparation ---
-
 def get_df() -> pd.DataFrame:
     # Load and merge datasets
     df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
```
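For context, the loading pattern `get_df` keeps after this cleanup is a single `datasets.load_dataset(...).to_pandas()` chain. A minimal standalone sketch (the column printout is illustrative only):

```python
import datasets

# Sketch of the loading pattern get_df() uses: fetch the Hub dataset's
# train split and convert it to a pandas DataFrame in one chain.
df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
print(df_papers.columns.tolist())  # the paper metadata columns
```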
```diff
@@ -58,7 +31,7 @@ def get_df() -> pd.DataFrame:
     df["date"] = pd.to_datetime(df["date"], errors='coerce')
     df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
 
-    # Prepare the DataFrame by removing 'abstract'
+    # Prepare the DataFrame by removing 'abstract'
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         info = row.copy()
```
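The two context lines above implement a coerce-then-fill normalization: unparseable dates become `NaT` under `errors='coerce'`, and the formatted column then falls back to today's UTC date. A minimal standalone sketch (the sample frame is made up):

```python
import datetime
from datetime import timezone

import pandas as pd

df = pd.DataFrame({"date": ["2024-05-01", "not-a-date", None]})  # hypothetical sample
df["date"] = pd.to_datetime(df["date"], errors="coerce")         # bad values -> NaT
today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)    # NaT -> today's date
print(df["date"].tolist())  # ['2024-05-01', <today>, <today>]
```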
```diff
@@ -87,16 +60,13 @@ class Prettifier:
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
-            # Handle
-            published_at = row["date"]  # Already formatted as "%Y-%m-%d"
-
-            # Handle date link
+            # Handle date_display as a clickable link
             date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
 
             new_row = {
                 "arxiv_id": row["arxiv_id"],  # Include arxiv_id
                 "date_display": date_display,  # For display
-                "
+                "date": row["date"],  # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
                 "github": Prettifier.get_github_link(row.get("github", "")),
```
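`Prettifier.create_link` is not shown in this diff; given that it is called with a label and a URL and its output lands in columns typed as `markdown`, it presumably builds a markdown link. A sketch under that assumption:

```python
def create_link(text: str, url: str) -> str:
    # Assumed behavior of Prettifier.create_link: markdown link markup
    # that Gradio renders in columns declared as "markdown".
    return f"[{text}]({url})"

print(create_link("2024-05-01", "https://huggingface.co/papers?date=2024-05-01"))
# [2024-05-01](https://huggingface.co/papers?date=2024-05-01)
```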
```diff
@@ -111,7 +81,7 @@ class PaperList:
     COLUMN_INFO = [
         ["arxiv_id", "str"],  # Added arxiv_id
         ["date_display", "markdown"],  # For display
-        ["
+        ["date", "str"],  # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
         ["github", "markdown"],
```
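The `COLUMN_INFO` pairs line up with the `headers`/`datatype` arguments of `gr.Dataframe`. How `PaperList` actually constructs the component is outside this diff, so the wiring below is an assumption:

```python
import gradio as gr

COLUMN_INFO = [
    ["arxiv_id", "str"],
    ["date_display", "markdown"],
    ["date", "str"],
    ["paper_page", "markdown"],
    ["title", "str"],
    ["github", "markdown"],
]

# Split the (name, type) pairs into the two parallel lists gr.Dataframe expects.
headers = [name for name, _ in COLUMN_INFO]
datatypes = [dtype for _, dtype in COLUMN_INFO]
table = gr.Dataframe(headers=headers, datatype=datatypes, interactive=False)
```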
```diff
@@ -169,9 +139,9 @@ class PaperManager:
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
-
+        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(
+            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             # If parsing fails, use current time to minimize the impact on sorting
             published_time = datetime.datetime.now(timezone.utc)
```
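The body of `calculate_score` is mostly outside this hunk. A Hacker News-style ranking consistent with the surrounding code divides upvotes by a power of the paper's age; the gravity constant below is an assumption, not the app's actual value:

```python
import datetime
from datetime import timezone

def hotness(upvotes: int, published_time: datetime.datetime, gravity: float = 1.8) -> float:
    # Hacker News-style decay: newer papers need fewer upvotes to rank high.
    age_hours = (datetime.datetime.now(timezone.utc) - published_time).total_seconds() / 3600
    return upvotes / ((age_hours + 2) ** gravity)

yesterday = datetime.datetime.now(timezone.utc) - datetime.timedelta(days=1)
print(hotness(100, yesterday))
```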
```diff
@@ -194,7 +164,7 @@ class PaperManager:
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='
+            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date' instead of 'published_at'
         else:
             df_sorted = df
 
```
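Sorting the `date` column works even though it now holds strings: zero-padded `%Y-%m-%d` strings sort lexicographically in the same order as the dates they represent, as this sketch illustrates:

```python
import pandas as pd

df = pd.DataFrame({"date": ["2024-05-02", "2023-12-31", "2024-05-10"]})
# Lexicographic order of "%Y-%m-%d" strings equals chronological order.
print(df.sort_values(by="date", ascending=False)["date"].tolist())
# ['2024-05-10', '2024-05-02', '2023-12-31']
```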
```diff
@@ -238,9 +208,9 @@ class PaperManager:
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
-
+        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(
+            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             published_time = datetime.datetime.now(timezone.utc)
         time_diff = datetime.datetime.now(timezone.utc) - published_time
```
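The `.replace(tzinfo=timezone.utc)` in the added lines is load-bearing: `strptime` returns a naive datetime, and subtracting a naive value from the timezone-aware `datetime.datetime.now(timezone.utc)` raises `TypeError`. A minimal demonstration:

```python
import datetime
from datetime import timezone

naive = datetime.datetime.strptime("2024-05-01", "%Y-%m-%d")
aware = naive.replace(tzinfo=timezone.utc)  # attach UTC so aware arithmetic works
age = datetime.datetime.now(timezone.utc) - aware
print(age.days)
# datetime.datetime.now(timezone.utc) - naive  # would raise TypeError
```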
```diff
@@ -574,14 +544,6 @@ with demo:
         outputs=[paper_list]
     )
 
-    # Footer
-    gr.Markdown("""
-    Related useful Spaces:
-    - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
-    - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
-    - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
-    """)
-
 
 # --- Launch the App ---
 
```