update
app.py CHANGED
```diff
@@ -18,33 +18,6 @@ from datetime import timezone  # Ensure timezone is imported
 
 api = HfApi()
 
-INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
-INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
-
-# Removed ragatouille and abstract_retriever initialization
-# If INDEX_REPO_ID is not used elsewhere, consider removing related lines
-
-# Removed abstract_retriever initialization and search
-
-def update_abstract_index() -> None:
-    """
-    Removed abstract_retriever update functionality since ragatouille is no longer used.
-    """
-    pass  # No operation needed
-
-# Scheduler for updating abstract index every hour
-# Removed scheduler_abstract as it's no longer necessary
-# If INDEX_REPO_ID is not used elsewhere, consider removing the download
-
-# Optionally, remove the snapshot_download if the index is not needed
-# api.snapshot_download(
-#     repo_id=INDEX_REPO_ID,
-#     repo_type="dataset",
-#     local_dir=INDEX_DIR_PATH,
-# )
-
-# --- DataFrame Preparation ---
-
 def get_df() -> pd.DataFrame:
     # Load and merge datasets
     df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
```
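For context, the loading pattern `get_df` keeps after this cleanup is a single `datasets.load_dataset(...).to_pandas()` chain. A minimal standalone sketch (the column printout is illustrative only):

```python
import datasets

# Sketch of the loading pattern get_df() uses: fetch the Hub dataset's
# train split and convert it to a pandas DataFrame in one chain.
df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
print(df_papers.columns.tolist())  # the paper metadata columns
```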
```diff
@@ -58,7 +31,7 @@ def get_df() -> pd.DataFrame:
     df["date"] = pd.to_datetime(df["date"], errors='coerce')
     df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
 
-    # Prepare the DataFrame by removing 'abstract'
+    # Prepare the DataFrame by removing 'abstract'
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         info = row.copy()
```
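The two context lines above implement a coerce-then-fill normalization: unparseable dates become `NaT` under `errors='coerce'`, and the formatted column then falls back to today's UTC date. A minimal standalone sketch (the sample frame is made up):

```python
import datetime
from datetime import timezone

import pandas as pd

df = pd.DataFrame({"date": ["2024-05-01", "not-a-date", None]})  # hypothetical sample
df["date"] = pd.to_datetime(df["date"], errors="coerce")         # bad values -> NaT
today = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(today)    # NaT -> today's date
print(df["date"].tolist())  # ['2024-05-01', <today>, <today>]
```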
```diff
@@ -87,16 +60,13 @@ class Prettifier:
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
-            # Handle
-            published_at = row["date"]  # Already formatted as "%Y-%m-%d"
-
-            # Handle date link
+            # Handle date_display as a clickable link
             date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
 
             new_row = {
                 "arxiv_id": row["arxiv_id"],  # Include arxiv_id
                 "date_display": date_display,  # For display
-                "
+                "date": row["date"],  # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
                 "github": Prettifier.get_github_link(row.get("github", "")),
```
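`Prettifier.create_link` is not shown in this diff; given that it is called with a label and a URL and its output lands in columns typed as `markdown`, it presumably builds a markdown link. A sketch under that assumption:

```python
def create_link(text: str, url: str) -> str:
    # Assumed behavior of Prettifier.create_link: markdown link markup
    # that Gradio renders in columns declared as "markdown".
    return f"[{text}]({url})"

print(create_link("2024-05-01", "https://huggingface.co/papers?date=2024-05-01"))
# [2024-05-01](https://huggingface.co/papers?date=2024-05-01)
```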
```diff
@@ -111,7 +81,7 @@ class PaperList:
     COLUMN_INFO = [
         ["arxiv_id", "str"],  # Added arxiv_id
         ["date_display", "markdown"],  # For display
-        ["
+        ["date", "str"],  # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
         ["github", "markdown"],
```
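The `COLUMN_INFO` pairs line up with the `headers`/`datatype` arguments of `gr.Dataframe`. How `PaperList` actually constructs the component is outside this diff, so the wiring below is an assumption:

```python
import gradio as gr

COLUMN_INFO = [
    ["arxiv_id", "str"],
    ["date_display", "markdown"],
    ["date", "str"],
    ["paper_page", "markdown"],
    ["title", "str"],
    ["github", "markdown"],
]

# Split the (name, type) pairs into the two parallel lists gr.Dataframe expects.
headers = [name for name, _ in COLUMN_INFO]
datatypes = [dtype for _, dtype in COLUMN_INFO]
table = gr.Dataframe(headers=headers, datatype=datatypes, interactive=False)
```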
```diff
@@ -169,9 +139,9 @@ class PaperManager:
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
-
+        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(
+            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             # If parsing fails, use current time to minimize the impact on sorting
             published_time = datetime.datetime.now(timezone.utc)
```
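The body of `calculate_score` is mostly outside this hunk. A Hacker News-style ranking consistent with the surrounding code divides upvotes by a power of the paper's age; the gravity constant below is an assumption, not the app's actual value:

```python
import datetime
from datetime import timezone

def hotness(upvotes: int, published_time: datetime.datetime, gravity: float = 1.8) -> float:
    # Hacker News-style decay: newer papers need fewer upvotes to rank high.
    age_hours = (datetime.datetime.now(timezone.utc) - published_time).total_seconds() / 3600
    return upvotes / ((age_hours + 2) ** gravity)

yesterday = datetime.datetime.now(timezone.utc) - datetime.timedelta(days=1)
print(hotness(100, yesterday))
```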
```diff
@@ -194,7 +164,7 @@ class PaperManager:
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='
+            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date' instead of 'published_at'
         else:
             df_sorted = df
 
```
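Sorting the `date` column works even though it now holds strings: zero-padded `%Y-%m-%d` strings sort lexicographically in the same order as the dates they represent, as this sketch illustrates:

```python
import pandas as pd

df = pd.DataFrame({"date": ["2024-05-02", "2023-12-31", "2024-05-10"]})
# Lexicographic order of "%Y-%m-%d" strings equals chronological order.
print(df.sort_values(by="date", ascending=False)["date"].tolist())
# ['2024-05-10', '2024-05-02', '2023-12-31']
```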
```diff
@@ -238,9 +208,9 @@ class PaperManager:
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
-
+        date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
-            published_time = datetime.datetime.strptime(
+            published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
             published_time = datetime.datetime.now(timezone.utc)
         time_diff = datetime.datetime.now(timezone.utc) - published_time
```
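The `.replace(tzinfo=timezone.utc)` in the added lines is load-bearing: `strptime` returns a naive datetime, and subtracting a naive value from the timezone-aware `datetime.datetime.now(timezone.utc)` raises `TypeError`. A minimal demonstration:

```python
import datetime
from datetime import timezone

naive = datetime.datetime.strptime("2024-05-01", "%Y-%m-%d")
aware = naive.replace(tzinfo=timezone.utc)  # attach UTC so aware arithmetic works
age = datetime.datetime.now(timezone.utc) - aware
print(age.days)
# datetime.datetime.now(timezone.utc) - naive  # would raise TypeError
```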
```diff
@@ -574,14 +544,6 @@ with demo:
         outputs=[paper_list]
     )
 
-    # Footer
-    gr.Markdown("""
-    Related useful Spaces:
-    - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
-    - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
-    - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
-    """)
-
 
 # --- Launch the App ---
 
```