akhaliq HF staff commited on
Commit
3b04ee1
·
verified ·
1 Parent(s): 361761b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -34
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_calendar import Calendar
13
  import datasets
14
  import requests
15
 
16
- from datetime import timezone # Added import to fix the NameError
17
 
18
  # --- Data Loading and Processing ---
19
 
@@ -27,7 +27,7 @@ api.snapshot_download(
27
  local_dir=INDEX_DIR_PATH,
28
  )
29
  abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
30
- # Run once to initialize the retriever
31
  abstract_retriever.search("LLM")
32
 
33
 
@@ -56,18 +56,24 @@ scheduler_abstract.start()
56
 
57
 
58
  def get_df() -> pd.DataFrame:
59
- df = pd.merge(
60
- left=datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
61
- right=datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
62
- on="arxiv_id",
63
- )
 
64
  df = df[::-1].reset_index(drop=True)
65
- df["date"] = pd.to_datetime(df["date"]).dt.strftime("%Y-%m-%d")
66
-
 
 
 
 
67
  paper_info = []
68
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
69
  info = row.copy()
70
- del info["abstract"]
 
71
  info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
72
  paper_info.append(info)
73
  return pd.DataFrame(paper_info)
@@ -84,22 +90,32 @@ class Prettifier:
84
  def create_link(text: str, url: str) -> str:
85
  return f'<a href="{url}" target="_blank">{text}</a>'
86
 
87
- @staticmethod
88
- def to_div(text: str | None, category_name: str) -> str:
89
- if text is None:
90
- text = ""
91
- class_name = f"{category_name}-{text.lower()}"
92
- return f'<div class="{class_name}">{text}</div>'
93
-
94
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
95
  new_rows = []
96
  for _, row in df.iterrows():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  new_row = {
98
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
99
- "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
 
100
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
101
  "title": row["title"],
102
- "github": self.get_github_link(row.github),
 
103
  "πŸ‘": row["upvotes"],
104
  "πŸ’¬": row["num_comments"],
105
  }
@@ -109,10 +125,12 @@ class Prettifier:
109
 
110
  class PaperList:
111
  COLUMN_INFO = [
112
- ["arxiv_id", "str"], # Added arxiv_id
113
- ["date", "markdown"],
 
114
  ["paper_page", "markdown"],
115
  ["title", "str"],
 
116
  ["github", "markdown"],
117
  ["πŸ‘", "number"],
118
  ["πŸ’¬", "number"],
@@ -140,17 +158,17 @@ class PaperList:
140
  max_num_to_retrieve: int,
141
  ) -> pd.DataFrame:
142
  df = self.df_raw.copy()
143
- df["date"] = pd.to_datetime(df["date"])
144
 
145
  # Filter by date
146
  df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
147
- df["date"] = df["date"].dt.strftime("%Y-%m-%d")
148
 
149
  # Filter by title
150
  if title_search_query:
151
- df = df[df["title"].str.contains(title_search_query, case=False)]
152
 
153
- # Filter by abstract
154
  if abstract_search_query:
155
  results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
156
  remaining_ids = set(df["arxiv_id"])
@@ -166,6 +184,7 @@ class PaperList:
166
  found_ids.append(arxiv_id)
167
  df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
168
 
 
169
  df_prettified = self._prettifier(df).loc[:, self.column_names]
170
  return df_prettified
171
 
@@ -176,10 +195,9 @@ class PaperManager:
176
  def __init__(self, paper_list: PaperList, papers_per_page=30):
177
  self.paper_list = paper_list
178
  self.papers_per_page = papers_per_page
179
- self.current_page = 1
180
  self.sort_method = "hot" # Default sort method
181
  self.sort_papers()
182
- self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
183
 
184
  def calculate_score(self, row):
185
  """
@@ -187,10 +205,9 @@ class PaperManager:
187
  This mimics the "hotness" algorithm used by platforms like Hacker News.
188
  """
189
  upvotes = row.get('πŸ‘', 0)
190
- published_at_str = row.get('date', datetime.datetime.now(timezone.utc).isoformat())
191
  try:
192
- published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d")
193
- published_time = published_time.replace(tzinfo=timezone.utc)
194
  except ValueError:
195
  # If parsing fails, use current time to minimize the impact on sorting
196
  published_time = datetime.datetime.now(timezone.utc)
@@ -199,7 +216,7 @@ class PaperManager:
199
  time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
200
 
201
  # Avoid division by zero and apply the hotness formula
202
- score = upvotes / ((time_diff_hours + 2) ** 1.5)
203
  return score
204
 
205
  def sort_papers(self):
@@ -209,7 +226,7 @@ class PaperManager:
209
  df['score'] = df.apply(self.calculate_score, axis=1)
210
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
211
  elif self.sort_method == "new":
212
- df_sorted = df.sort_values(by='date', ascending=False)
213
  else:
214
  df_sorted = df
215
 
@@ -245,10 +262,10 @@ class PaperManager:
245
  title = row.get('title', 'No title')
246
  paper_id = row.get('arxiv_id', '')
247
  url = f"https://huggingface.co/papers/{paper_id}"
248
- authors = 'Unknown' # Assuming authors are not present in the current dataset
249
  upvotes = row.get('πŸ‘', 0)
250
  comments = row.get('πŸ’¬', 0)
251
- published_time_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
252
  try:
253
  published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
254
  except ValueError:
@@ -572,6 +589,7 @@ with demo:
572
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
573
  """)
574
 
 
575
  # --- Launch the App ---
576
 
577
  if __name__ == "__main__":
 
13
  import datasets
14
  import requests
15
 
16
+ from datetime import timezone # Ensure timezone is imported
17
 
18
  # --- Data Loading and Processing ---
19
 
 
27
  local_dir=INDEX_DIR_PATH,
28
  )
29
  abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
30
+ # Initialize the retriever
31
  abstract_retriever.search("LLM")
32
 
33
 
 
56
 
57
 
58
  def get_df() -> pd.DataFrame:
59
+ # Load and merge datasets
60
+ df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
61
+ df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
62
+ df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
63
+
64
+ # Reverse the DataFrame to have the latest papers first
65
  df = df[::-1].reset_index(drop=True)
66
+
67
+ # Ensure 'date' is in datetime format and handle missing dates
68
+ df["date"] = pd.to_datetime(df["date"], errors='coerce')
69
+ df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
70
+
71
+ # Prepare the DataFrame by removing 'abstract' and adding 'paper_page'
72
  paper_info = []
73
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
74
  info = row.copy()
75
+ if "abstract" in info:
76
+ del info["abstract"]
77
  info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
78
  paper_info.append(info)
79
  return pd.DataFrame(paper_info)
 
90
  def create_link(text: str, url: str) -> str:
91
  return f'<a href="{url}" target="_blank">{text}</a>'
92
 
 
 
 
 
 
 
 
93
  def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
94
  new_rows = []
95
  for _, row in df.iterrows():
96
+ # Handle authors: list of dicts or list of strings
97
+ if "authors" in row and isinstance(row["authors"], list):
98
+ authors = ', '.join([
99
+ author.get('name', '') if isinstance(author, dict) else str(author)
100
+ for author in row["authors"]
101
+ ])
102
+ else:
103
+ authors = 'Unknown'
104
+
105
+ # Handle published_at: original date
106
+ published_at = row["date"] # Already formatted as "%Y-%m-%d"
107
+
108
+ # Handle date link
109
+ date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
110
+
111
  new_row = {
112
  "arxiv_id": row["arxiv_id"], # Include arxiv_id
113
+ "date_display": date_display, # For display
114
+ "published_at": published_at, # For internal calculations
115
  "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
116
  "title": row["title"],
117
+ "authors": authors, # Include authors
118
+ "github": Prettifier.get_github_link(row.get("github", "")),
119
  "πŸ‘": row["upvotes"],
120
  "πŸ’¬": row["num_comments"],
121
  }
 
125
 
126
  class PaperList:
127
  COLUMN_INFO = [
128
+ ["arxiv_id", "str"], # Added arxiv_id
129
+ ["date_display", "markdown"],# For display
130
+ ["published_at", "str"], # For internal use
131
  ["paper_page", "markdown"],
132
  ["title", "str"],
133
+ ["authors", "str"], # Added authors
134
  ["github", "markdown"],
135
  ["πŸ‘", "number"],
136
  ["πŸ’¬", "number"],
 
158
  max_num_to_retrieve: int,
159
  ) -> pd.DataFrame:
160
  df = self.df_raw.copy()
161
+ df["date"] = pd.to_datetime(df["date"], errors='coerce')
162
 
163
  # Filter by date
164
  df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
165
+ df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
166
 
167
  # Filter by title
168
  if title_search_query:
169
+ df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
170
 
171
+ # Filter by abstract using RAG
172
  if abstract_search_query:
173
  results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
174
  remaining_ids = set(df["arxiv_id"])
 
184
  found_ids.append(arxiv_id)
185
  df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
186
 
187
+ # Prettify the DataFrame
188
  df_prettified = self._prettifier(df).loc[:, self.column_names]
189
  return df_prettified
190
 
 
195
  def __init__(self, paper_list: PaperList, papers_per_page=30):
196
  self.paper_list = paper_list
197
  self.papers_per_page = papers_per_page
 
198
  self.sort_method = "hot" # Default sort method
199
  self.sort_papers()
200
+ # 'current_page' and 'total_pages' are set in 'sort_papers()'
201
 
202
  def calculate_score(self, row):
203
  """
 
205
  This mimics the "hotness" algorithm used by platforms like Hacker News.
206
  """
207
  upvotes = row.get('πŸ‘', 0)
208
+ published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
209
  try:
210
+ published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
 
211
  except ValueError:
212
  # If parsing fails, use current time to minimize the impact on sorting
213
  published_time = datetime.datetime.now(timezone.utc)
 
216
  time_diff_hours = time_diff.total_seconds() / 3600 # Convert time difference to hours
217
 
218
  # Avoid division by zero and apply the hotness formula
219
+ score = upvotes / ((time_diff_hours + 2) ** 1.5) if (time_diff_hours + 2) > 0 else 0
220
  return score
221
 
222
  def sort_papers(self):
 
226
  df['score'] = df.apply(self.calculate_score, axis=1)
227
  df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
228
  elif self.sort_method == "new":
229
+ df_sorted = df.sort_values(by='published_at', ascending=False)
230
  else:
231
  df_sorted = df
232
 
 
262
  title = row.get('title', 'No title')
263
  paper_id = row.get('arxiv_id', '')
264
  url = f"https://huggingface.co/papers/{paper_id}"
265
+ authors = row.get('authors', 'Unknown')
266
  upvotes = row.get('πŸ‘', 0)
267
  comments = row.get('πŸ’¬', 0)
268
+ published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
269
  try:
270
  published_time = datetime.datetime.strptime(published_time_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
271
  except ValueError:
 
589
  - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
590
  """)
591
 
592
+
593
  # --- Launch the App ---
594
 
595
  if __name__ == "__main__":