dailypapershackernews-dev

Running

App Files Community

akhaliq HF staff committed on Sep 20, 2024

Commit

a7e2292

verified ·

1 Parent(s): dea1fc4

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -81

app.py CHANGED Viewed

@@ -8,7 +8,6 @@ from huggingface_hub import HfApi
 import gradio as gr
 import datasets  # Ensure the datasets library is imported
-import requests   # For making API calls
 from datetime import timezone
 import atexit  # To gracefully shut down the scheduler
@@ -22,37 +21,10 @@ logger = logging.getLogger(__name__)
 api = HfApi()
-def get_repo_counts(arxiv_id: str) -> dict:
-    """
-    Fetches the number of models, datasets, and Spaces linked to a given arxiv_id using Hugging Face API.
-    """
-    url = f"https://huggingface.co/api/arxiv/{arxiv_id}/repos"
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        data = response.json()
-        models = data.get('models', [])
-        datasets_list = data.get('datasets', [])
-        spaces = data.get('spaces', [])
-        return {
-            'models_count': len(models),
-            'datasets_count': len(datasets_list),
-            'spaces_count': len(spaces)
-        }
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching repo counts for {arxiv_id}: {e}")
-        return {
-            'models_count': 0,
-            'datasets_count': 0,
-            'spaces_count': 0
-        }
 def get_df() -> pd.DataFrame:
     """
     Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
-    adds a 'paper_page' link for each paper, and fetches counts of models, datasets, and Spaces linked to each paper.
     """
     try:
         # Load datasets
@@ -80,17 +52,6 @@ def get_df() -> pd.DataFrame:
             info = row.copy()
             if "abstract" in info:
                 del info["abstract"]
-            # Fetch repo counts
-            arxiv_id = info.get("arxiv_id", "")
-            if arxiv_id:
-                counts = get_repo_counts(arxiv_id)
-                info.update(counts)
-            else:
-                info.update({
-                    'models_count': 0,
-                    'datasets_count': 0,
-                    'spaces_count': 0
-                })
             paper_info.append(info)
         df_prepared = pd.DataFrame(paper_info)
@@ -98,6 +59,11 @@ def get_df() -> pd.DataFrame:
         logger.info("Adding 'paper_page' links.")
         df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
         logger.info("DataFrame preparation complete.")
         return df_prepared
     except Exception as e:
@@ -109,11 +75,7 @@ class Prettifier:
     """
     Converts raw DataFrame rows into a prettified format suitable for display.
     """
-    REQUIRED_COLUMNS = [
-        "arxiv_id", "date_display", "date", "paper_page",
-        "title", "github", "👍", "💬",
-        "models_count", "datasets_count", "spaces_count"
-    ]
     @staticmethod
     def get_github_link(link: str) -> str:
@@ -140,9 +102,6 @@ class Prettifier:
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row.get("upvotes", 0),
                 "💬": row.get("num_comments", 0),
-                "models_count": row.get("models_count", 0),
-                "datasets_count": row.get("datasets_count", 0),
-                "spaces_count": row.get("spaces_count", 0),
             }
             new_rows.append(new_row)
@@ -166,9 +125,6 @@ class PaperList:
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
-        ["models_count", "number"],
-        ["datasets_count", "number"],
-        ["spaces_count", "number"],
     ]
     def __init__(self, df: pd.DataFrame):
@@ -195,12 +151,13 @@ class PaperList:
 class PaperManager:
     """
-    Manages sorting, pagination, and repository-based sorting for the list of papers.
     """
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
         self.sort_method = "hot"              # Default sort method
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
@@ -239,12 +196,27 @@ class PaperManager:
                 df_sorted = df
         elif self.sort_method == "new":
             df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
-        elif self.sort_method == "most_models":
-            df_sorted = df.sort_values(by='models_count', ascending=False)
-        elif self.sort_method == "most_datasets":
-            df_sorted = df.sort_values(by='datasets_count', ascending=False)
-        elif self.sort_method == "most_spaces":
-            df_sorted = df.sort_values(by='spaces_count', ascending=False)
         else:
             df_sorted = df
@@ -256,13 +228,16 @@ class PaperManager:
     def set_sort_method(self, method, time_frame=None):
         """
-        Sets the sort method ('hot', 'new', 'most_models', 'most_datasets', 'most_spaces') and re-sorts the papers.
         """
-        valid_methods = ["hot", "new", "most_models", "most_datasets", "most_spaces"]
-        if method not in valid_methods:
             method = "hot"
         logger.info(f"Setting sort method to: {method}")
         self.sort_method = method
         self.sort_papers()
         return True  # Assume success
@@ -293,9 +268,6 @@ class PaperManager:
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
-        models = row.get('models_count', 0)
-        datasets_count = row.get('datasets_count', 0)
-        spaces = row.get('spaces_count', 0)
         date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
@@ -316,8 +288,7 @@ class PaperManager:
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
-                {time_ago} | <a href="#">{comments} comments</a><br>
-                Models: {models} | Datasets: {datasets_count} | Spaces: {spaces}
             </td>
         </tr>
         <tr style="height:5px"></tr>
@@ -422,18 +393,13 @@ logger.info("Scheduler shutdown registered.")
 def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
     """
-    Changes the sort method and, if applicable, sets additional parameters.
     """
     logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
-    if method.lower() in ["most_models", "most_datasets", "most_spaces"]:
-        paper_manager.set_sort_method(method.lower())
-    elif method.lower() == "hot":
-        paper_manager.set_sort_method(method.lower())
-    elif method.lower() == "new":
-        paper_manager.set_sort_method(method.lower())
     else:
-        # Default to 'hot' if method is unrecognized
-        paper_manager.set_sort_method("hot")
     return paper_manager.get_current_page_papers()
@@ -602,22 +568,28 @@ with demo:
                 </tr>
             </table>
             """)
-        # Sort Options (Removed "Top" and its timeframe)
         with gr.Row():
             sort_radio = gr.Radio(
-                choices=["Hot", "New", "Most Models", "Most Datasets", "Most Spaces"],
                 value="Hot",
                 label="Sort By",
                 interactive=True
             )
-            # Removed time_frame_dropdown as "Top" sort is removed
         # Paper list
         paper_list = gr.HTML()
         # Navigation Buttons
         with gr.Row():
             prev_button = gr.Button("Prev")
             next_button = gr.Button("Next")
     # Load papers on app start
     demo.load(
         fn=lambda: paper_manager.get_current_page_papers(),
@@ -628,10 +600,17 @@ with demo:
     prev_button.click(paper_manager.prev_page, outputs=[paper_list])
     next_button.click(paper_manager.next_page, outputs=[paper_list])
-    # Sort option change: Apply sorting method
     sort_radio.change(
         fn=change_sort_method_ui,
-        inputs=[sort_radio, None],  # Pass None since time_frame_dropdown is removed
         outputs=[paper_list]
     )

 import gradio as gr
 import datasets  # Ensure the datasets library is imported
 from datetime import timezone
 import atexit  # To gracefully shut down the scheduler
 api = HfApi()
 def get_df() -> pd.DataFrame:
     """
     Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
+    and adds a 'paper_page' link for each paper.
     """
     try:
         # Load datasets
             info = row.copy()
             if "abstract" in info:
                 del info["abstract"]
             paper_info.append(info)
         df_prepared = pd.DataFrame(paper_info)
         logger.info("Adding 'paper_page' links.")
         df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
+        # Verify that 'date' column exists
+        if 'date' not in df_prepared.columns:
+            logger.error("'date' column is missing from the DataFrame.")
+            df_prepared["date"] = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
         logger.info("DataFrame preparation complete.")
         return df_prepared
     except Exception as e:
     """
     Converts raw DataFrame rows into a prettified format suitable for display.
     """
+    REQUIRED_COLUMNS = ["arxiv_id", "date_display", "date", "paper_page", "title", "github", "👍", "💬"]
     @staticmethod
     def get_github_link(link: str) -> str:
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row.get("upvotes", 0),
                 "💬": row.get("num_comments", 0),
             }
             new_rows.append(new_row)
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
     ]
     def __init__(self, df: pd.DataFrame):
 class PaperManager:
     """
+    Manages sorting and pagination for the list of papers.
     """
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
         self.sort_method = "hot"              # Default sort method
+        self.top_time_frame = "all time"      # Default time frame for "Top" sorting
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
                 df_sorted = df
         elif self.sort_method == "new":
             df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
+        elif self.sort_method == "top":
+            # Filter based on the selected time frame
+            now = datetime.datetime.now(timezone.utc)
+            if self.top_time_frame == "day":
+                time_threshold = now - datetime.timedelta(days=1)
+            elif self.top_time_frame == "week":
+                time_threshold = now - datetime.timedelta(weeks=1)
+            elif self.top_time_frame == "month":
+                time_threshold = now - datetime.timedelta(days=30)
+            elif self.top_time_frame == "year":
+                time_threshold = now - datetime.timedelta(days=365)
+            elif self.top_time_frame == "all time":
+                time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
+            else:
+                time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
+            # Convert 'date' column to datetime
+            df_sorted = df.copy()
+            df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc, ambiguous='NaT', nonexistent='NaT')
+            df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
+            df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
         else:
             df_sorted = df
     def set_sort_method(self, method, time_frame=None):
         """
+        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
+        If 'top' is selected, also sets the time frame.
         """
+        if method not in ["hot", "new", "top"]:
             method = "hot"
         logger.info(f"Setting sort method to: {method}")
         self.sort_method = method
+        if method == "top" and time_frame:
+            self.top_time_frame = time_frame.lower()
+            logger.info(f"Setting top time frame to: {self.top_time_frame}")
         self.sort_papers()
         return True  # Assume success
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
         date_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
         try:
             published_time = datetime.datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
+                {time_ago} | <a href="#">{comments} comments</a>
             </td>
         </tr>
         <tr style="height:5px"></tr>
 def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
     """
+    Changes the sort method and, if 'top' is selected, sets the time frame.
     """
     logger.info(f"Changing sort method to: {method} with time frame: {time_frame}")
+    if method.lower() == "top":
+        paper_manager.set_sort_method(method.lower(), time_frame)
     else:
+        paper_manager.set_sort_method(method.lower())
     return paper_manager.get_current_page_papers()
                 </tr>
             </table>
             """)
+        # Sort Options and Time Frame (conditionally visible)
         with gr.Row():
             sort_radio = gr.Radio(
+                choices=["Hot", "New", "Top"],
                 value="Hot",
                 label="Sort By",
                 interactive=True
             )
+            time_frame_dropdown = gr.Dropdown(
+                choices=["day", "week", "month", "year", "all time"],
+                value="all time",
+                label="Time Frame for Top",
+                visible=False,
+                interactive=True
+            )
         # Paper list
         paper_list = gr.HTML()
         # Navigation Buttons
         with gr.Row():
             prev_button = gr.Button("Prev")
             next_button = gr.Button("Next")
     # Load papers on app start
     demo.load(
         fn=lambda: paper_manager.get_current_page_papers(),
     prev_button.click(paper_manager.prev_page, outputs=[paper_list])
     next_button.click(paper_manager.next_page, outputs=[paper_list])
+    # Sort option change: Toggle visibility of time_frame_dropdown based on sort method
+    sort_radio.change(
+        fn=lambda method: gr.update(visible=True) if method.lower() == "top" else gr.update(visible=False),
+        inputs=[sort_radio],
+        outputs=[time_frame_dropdown]
+    )
+    # Sort option change: Apply sorting method with time frame if applicable
     sort_radio.change(
         fn=change_sort_method_ui,
+        inputs=[sort_radio, time_frame_dropdown],
         outputs=[paper_list]
     )