Update app.py
app.py CHANGED

```diff
@@ -13,7 +13,7 @@ from gradio_calendar import Calendar
 import datasets
 import requests
 
-from datetime import timezone #
+from datetime import timezone # Ensure timezone is imported
 
 # --- Data Loading and Processing ---
 
```

```diff
@@ -27,7 +27,7 @@ api.snapshot_download(
     local_dir=INDEX_DIR_PATH,
 )
 abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-#
+# Initialize the retriever
 abstract_retriever.search("LLM")
 
 
```

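The bare `abstract_retriever.search("LLM")` call right after loading the index is a warm-up: it forces the model and index to load at startup so the first real user query does not pay that one-time cost. A minimal sketch of the same pattern, assuming RAGatouille's public API (the index path and queries below are placeholders, not values from this commit):

```python
# Warm-up pattern for a RAGatouille retriever; index path and queries are
# placeholders for illustration.
from ragatouille import RAGPretrainedModel

retriever = RAGPretrainedModel.from_index("path/to/colbert/index")
retriever.search("LLM")  # throwaway query: loads weights before real traffic

# Each hit is a dict that includes at least 'content' and 'score'
hits = retriever.search("efficient attention", k=5)
for hit in hits:
    print(hit["score"], hit["content"][:80])
```
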
```diff
@@ -56,18 +56,24 @@ scheduler_abstract.start()
 
 
 def get_df() -> pd.DataFrame:
-
-
-
-
-
+    # Load and merge datasets
+    df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
+    df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
+    df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
+
+    # Reverse the DataFrame to have the latest papers first
     df = df[::-1].reset_index(drop=True)
-
-
+
+    # Ensure 'date' is in datetime format and handle missing dates
+    df["date"] = pd.to_datetime(df["date"], errors='coerce')
+    df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
+
+    # Prepare the DataFrame by removing 'abstract' and adding 'paper_page'
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         info = row.copy()
-
+        if "abstract" in info:
+            del info["abstract"]
         info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
         paper_info.append(info)
     return pd.DataFrame(paper_info)
```

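The `errors='coerce'` / `fillna` pair added to `get_df` is a common pandas pattern for tolerating bad dates: unparseable values become `NaT` instead of raising, `strftime` then leaves them as `NaN`, and `fillna` substitutes today's UTC date. A minimal sketch with made-up data:

```python
import datetime
from datetime import timezone

import pandas as pd

df = pd.DataFrame({"date": ["2024-05-01", "not-a-date", None]})

# Unparseable values become NaT instead of raising
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# strftime leaves NaT as NaN; fillna swaps in today's UTC date
today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)

print(df["date"].tolist())  # ['2024-05-01', <today>, <today>]
```
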
```diff
@@ -84,22 +90,32 @@ class Prettifier:
     def create_link(text: str, url: str) -> str:
         return f'<a href="{url}" target="_blank">{text}</a>'
 
-    @staticmethod
-    def to_div(text: str | None, category_name: str) -> str:
-        if text is None:
-            text = ""
-        class_name = f"{category_name}-{text.lower()}"
-        return f'<div class="{class_name}">{text}</div>'
-
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
+            # Handle authors: list of dicts or list of strings
+            if "authors" in row and isinstance(row["authors"], list):
+                authors = ', '.join([
+                    author.get('name', '') if isinstance(author, dict) else str(author)
+                    for author in row["authors"]
+                ])
+            else:
+                authors = 'Unknown'
+
+            # Handle published_at: original date
+            published_at = row["date"] # Already formatted as "%Y-%m-%d"
+
+            # Handle date link
+            date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
+
             new_row = {
                 "arxiv_id": row["arxiv_id"], # Include arxiv_id
-                "
+                "date_display": date_display, # For display
+                "published_at": published_at, # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
-                "
+                "authors": authors, # Include authors
+                "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row["upvotes"],
                 "💬": row["num_comments"],
             }
```

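The new author-handling branch accepts either a list of dicts with a `name` key or a list of plain strings, and falls back to `'Unknown'` otherwise. The same logic, extracted into a standalone sketch with made-up inputs:

```python
def format_authors(authors) -> str:
    # Accepts a list of {'name': ...} dicts, a list of strings, or anything else
    if isinstance(authors, list):
        return ', '.join(
            author.get('name', '') if isinstance(author, dict) else str(author)
            for author in authors
        )
    return 'Unknown'

print(format_authors([{"name": "Ada Lovelace"}, {"name": "Alan Turing"}]))  # Ada Lovelace, Alan Turing
print(format_authors(["Grace Hopper"]))  # Grace Hopper
print(format_authors(None))  # Unknown
```
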
```diff
@@ -109,10 +125,12 @@ class Prettifier:
 
 class PaperList:
     COLUMN_INFO = [
-        ["arxiv_id", "str"],
-        ["
+        ["arxiv_id", "str"], # Added arxiv_id
+        ["date_display", "markdown"], # For display
+        ["published_at", "str"], # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
+        ["authors", "str"], # Added authors
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
```

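The `[name, datatype]` pairs in `COLUMN_INFO` line up with the `headers` and `datatype` arguments of a Gradio `Dataframe`. The actual wiring is outside this diff, but a plausible sketch of how such a table is constructed:

```python
# Assumed wiring, for illustration only; the real component setup is not shown
# in this commit.
import gradio as gr

COLUMN_INFO = [
    ["arxiv_id", "str"],
    ["date_display", "markdown"],
    ["published_at", "str"],
    ["paper_page", "markdown"],
    ["title", "str"],
    ["authors", "str"],
    ["github", "markdown"],
    ["👍", "number"],
    ["💬", "number"],
]

headers = [name for name, _ in COLUMN_INFO]
datatypes = [dtype for _, dtype in COLUMN_INFO]
table = gr.Dataframe(headers=headers, datatype=datatypes, interactive=False)
```
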
```diff
@@ -140,17 +158,17 @@ class PaperList:
         max_num_to_retrieve: int,
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
-        df["date"] = pd.to_datetime(df["date"])
+        df["date"] = pd.to_datetime(df["date"], errors='coerce')
 
         # Filter by date
         df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
-        df["date"] = df["date"].dt.strftime("%Y-%m-%d")
+        df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
 
         # Filter by title
         if title_search_query:
-            df = df[df["title"].str.contains(title_search_query, case=False)]
+            df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
 
-        # Filter by abstract
+        # Filter by abstract using RAG
         if abstract_search_query:
             results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
             remaining_ids = set(df["arxiv_id"])
```

```diff
@@ -166,6 +184,7 @@ class PaperList:
                 found_ids.append(arxiv_id)
             df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
 
+        # Prettify the DataFrame
         df_prettified = self._prettifier(df).loc[:, self.column_names]
         return df_prettified
 
```

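The `set_index`/`reindex` chain in the context lines above is what keeps abstract-search results in retrieval order rather than the DataFrame's original row order. A small self-contained demonstration with made-up IDs:

```python
import pandas as pd

df = pd.DataFrame({"arxiv_id": ["a", "b", "c"], "title": ["A", "B", "C"]})
found_ids = ["c", "a"]  # order returned by the retriever, best match first

out = (
    df[df["arxiv_id"].isin(found_ids)]
    .set_index("arxiv_id")
    .reindex(index=found_ids)  # reorder rows to match retrieval ranking
    .reset_index()
)
print(out["arxiv_id"].tolist())  # ['c', 'a'], i.e. retrieval order, not ['a', 'c']
```
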
```diff
@@ -176,10 +195,9 @@ class PaperManager:
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
-        self.current_page = 1
         self.sort_method = "hot" # Default sort method
         self.sort_papers()
-
+        # 'current_page' and 'total_pages' are set in 'sort_papers()'
 
     def calculate_score(self, row):
         """
```

```diff
@@ -187,10 +205,9 @@ class PaperManager:
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
-        published_at_str = row.get('
+        published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d")
-            published_time = published_time.replace(tzinfo=timezone.utc)
+            published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             # If parsing fails, use current time to minimize the impact on sorting
             published_time = datetime.datetime.now(timezone.utc)
```

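`strptime` returns a naive datetime, so the `.replace(tzinfo=timezone.utc)` tag matters: subtracting a naive datetime from the timezone-aware `datetime.datetime.now(timezone.utc)` would raise a `TypeError`. A minimal sketch of the parse-then-age computation:

```python
import datetime
from datetime import timezone

published = datetime.datetime.strptime("2024-05-01", "%Y-%m-%d").replace(tzinfo=timezone.utc)
age_hours = (datetime.datetime.now(timezone.utc) - published).total_seconds() / 3600
print(age_hours)  # age in hours, the input to the hotness formula
```
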
```diff
@@ -199,7 +216,7 @@
         time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
 
         # Avoid division by zero and apply the hotness formula
-        score = upvotes / ((time_diff_hours + 2) ** 1.5)
+        score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0
         return score
 
     def sort_papers(self):
```

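For intuition, the gravity-style formula `upvotes / ((hours + 2) ** 1.5)` decays quickly with age, so a day-old paper needs roughly ten times the upvotes to outrank a fresh one. (The added `> 0` guard is purely defensive: for non-negative ages the denominator is always positive.) A worked example with made-up numbers:

```python
def hotness(upvotes: int, age_hours: float) -> float:
    return upvotes / ((age_hours + 2) ** 1.5)

print(round(hotness(10, 1), 3))    # 1.925: 10 upvotes, 1 hour old
print(round(hotness(10, 24), 3))   # 0.075: same upvotes, a day old
print(round(hotness(100, 24), 3))  # 0.754: needs ~10x the upvotes to compete
```
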
```diff
@@ -209,7 +226,7 @@
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='
+            df_sorted = df.sort_values(by='published_at', ascending=False)
         else:
             df_sorted = df
 
```

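Sorting `published_at` as a plain string is safe here because `"%Y-%m-%d"` strings order lexicographically exactly as they do chronologically:

```python
dates = ["2024-05-01", "2023-12-31", "2024-01-15"]
print(sorted(dates, reverse=True))  # ['2024-05-01', '2024-01-15', '2023-12-31']
```
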
```diff
@@ -245,10 +262,10 @@
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
-        authors = 'Unknown'
+        authors = row.get('authors', 'Unknown')
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
-        published_time_str = row.get('
+        published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
```

```diff
@@ -572,6 +589,7 @@ with demo:
     - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
     """)
 
+
 # --- Launch the App ---
 
 if __name__ == "__main__":
```