dailypapershackernews-dev

Sleeping

App Files Community

akhaliq HF Staff committed on Sep 20, 2024

Commit

6790790

verified ·

1 Parent(s): 3e6fd58

Update app.py

Browse files

Files changed (1) hide show

app.py +158 -22

app.py CHANGED Viewed

@@ -19,9 +19,15 @@ from datetime import timezone  # Ensure timezone is imported
 api = HfApi()
 def get_df() -> pd.DataFrame:
-    # Load and merge datasets
     df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
     df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
     df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
     # Reverse the DataFrame to have the latest papers first
@@ -47,6 +53,9 @@ def get_df() -> pd.DataFrame:
 class Prettifier:
     @staticmethod
     def get_github_link(link: str) -> str:
         if not link:
@@ -64,24 +73,27 @@ class Prettifier:
             date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
             new_row = {
-                "arxiv_id": row["arxiv_id"],  # Include arxiv_id
-                "date_display": date_display,  # For display
-                "date": row["date"],            # For internal calculations
-                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
-                "title": row["title"],
                 "github": Prettifier.get_github_link(row.get("github", "")),
-                "👍": row["upvotes"],
-                "💬": row["num_comments"],
             }
             new_rows.append(new_row)
         return pd.DataFrame(new_rows)
 class PaperList:
     COLUMN_INFO = [
-        ["arxiv_id", "str"],         # Added arxiv_id
-        ["date_display", "markdown"],# For display
-        ["date", "str"],             # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
         ["github", "markdown"],
@@ -107,6 +119,9 @@ class PaperList:
         title_search_query: str,
         max_num_to_retrieve: int = 1000,  # Set a high default to include all if not specified
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
         # Filter by title if search query is provided
@@ -125,11 +140,15 @@ class PaperList:
 # --- Sorting and Pagination Management ---
 class PaperManager:
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
-        self.sort_method = "hot"  # Default sort method
-        self.current_search_query = ""  # Initialize with no search query
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
@@ -154,6 +173,9 @@ class PaperManager:
         return score
     def sort_papers(self):
         df = self.paper_list.df_raw.copy()
         # Apply search filter if a search query exists
@@ -164,7 +186,28 @@ class PaperManager:
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date' instead of 'published_at'
         else:
             df_sorted = df
@@ -173,21 +216,34 @@ class PaperManager:
         self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
         self.current_page = 1
-    def set_sort_method(self, method):
-        if method not in ["hot", "new"]:
             method = "hot"
         print(f"Setting sort method to: {method}")
         self.sort_method = method
         self.sort_papers()
         return True  # Assume success
     def set_search_query(self, query: str):
         print(f"Setting search query to: {query}")
         self.current_search_query = query
         self.sort_papers()
         return True  # Assume success
     def get_current_page_papers(self) -> str:
         start = (self.current_page - 1) * self.papers_per_page
         end = start + self.papers_per_page
         current_papers = self.paper_list.df_prettified.iloc[start:end]
@@ -203,6 +259,9 @@ class PaperManager:
         """
     def format_paper(self, row, rank):
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
@@ -235,22 +294,34 @@ class PaperManager:
         """
     def next_page(self) -> str:
         if self.current_page < self.total_pages:
             self.current_page += 1
         return self.get_current_page_papers()
     def prev_page(self) -> str:
         if self.current_page > 1:
             self.current_page -= 1
         return self.get_current_page_papers()
     def refresh(self) -> str:
         self.sort_papers()
         return self.get_current_page_papers()
 # Initialize PaperList and PaperManager
 def initialize_paper_manager() -> str:
     df = get_df()
     paper_list = PaperList(df)
     manager = PaperManager(paper_list)
@@ -261,6 +332,9 @@ paper_manager = None  # Initialize globally
 def setup_paper_manager():
     global paper_manager
     df = get_df()
     paper_list = PaperList(df)
@@ -272,6 +346,9 @@ setup_paper_manager()
 def update_paper_manager() -> str:
     global paper_manager
     df = get_df()
     paper_manager.paper_list = PaperList(df)
@@ -293,20 +370,40 @@ scheduler_data.start()
 # --- Gradio Interface Functions ---
-def change_sort_method_ui(method: str) -> str:
-    paper_manager.set_sort_method(method.lower())
     return paper_manager.get_current_page_papers()
 def refresh_papers_ui() -> str:
     return paper_manager.refresh()
 def search_papers_ui(query: str) -> str:
     paper_manager.set_search_query(query)
     return paper_manager.get_current_page_papers()
 # --- CSS Styling ---
 css = """
@@ -453,7 +550,6 @@ table {
 }
 """
 # --- Initialize Gradio Blocks ---
 demo = gr.Blocks(css=css)
@@ -484,7 +580,7 @@ with demo:
                 </tr>
             </table>
             """)
-        # Search Bar and Sort Options
         with gr.Row():
             search_box = gr.Textbox(
                 label="Search Papers by Title",
@@ -493,13 +589,22 @@ with demo:
                 interactive=True
             )
             search_button = gr.Button("Search")
         with gr.Row():
             sort_radio = gr.Radio(
-                choices=["Hot", "New"],
                 value="Hot",
                 label="Sort By",
                 interactive=True
             )
         # Paper list
         paper_list = gr.HTML()
         # Navigation Buttons
@@ -532,8 +637,24 @@ with demo:
     # Sort option change
     sort_radio.change(
-        fn=change_sort_method_ui,
         inputs=[sort_radio],
         outputs=[paper_list]
     )
@@ -544,6 +665,21 @@ with demo:
         outputs=[paper_list]
     )
 # --- Launch the App ---

 api = HfApi()
 def get_df() -> pd.DataFrame:
+    """
+    Loads and merges the papers and stats datasets, preprocesses the data by removing unnecessary columns,
+    and adds a 'paper_page' link for each paper.
+    """
+    # Load datasets
     df_papers = datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas()
     df_stats = datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas()
+    # Merge datasets on 'arxiv_id'
     df = pd.merge(left=df_papers, right=df_stats, on="arxiv_id")
     # Reverse the DataFrame to have the latest papers first
 class Prettifier:
+    """
+    Converts raw DataFrame rows into a prettified format suitable for display.
+    """
     @staticmethod
     def get_github_link(link: str) -> str:
         if not link:
             date_display = Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}")
             new_row = {
+                "arxiv_id": row.get("arxiv_id", ""),                        # Include arxiv_id
+                "date_display": date_display,                               # For display
+                "date": row.get("date", datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")),  # For internal calculations
+                "paper_page": Prettifier.create_link(row.get("arxiv_id", ""), row.get("paper_page", "#")),
+                "title": row.get("title", "No title"),
                 "github": Prettifier.get_github_link(row.get("github", "")),
+                "👍": row.get("upvotes", 0),
+                "💬": row.get("num_comments", 0),
             }
             new_rows.append(new_row)
         return pd.DataFrame(new_rows)
 class PaperList:
+    """
+    Manages the list of papers, including search functionality.
+    """
     COLUMN_INFO = [
+        ["arxiv_id", "str"],          # Added arxiv_id
+        ["date_display", "markdown"], # For display
+        ["date", "str"],              # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
         ["github", "markdown"],
         title_search_query: str,
         max_num_to_retrieve: int = 1000,  # Set a high default to include all if not specified
     ) -> pd.DataFrame:
+        """
+        Filters the DataFrame based on the title search query and limits the number of results.
+        """
         df = self.df_raw.copy()
         # Filter by title if search query is provided
 # --- Sorting and Pagination Management ---
 class PaperManager:
+    """
+    Manages sorting, pagination, and search queries for the list of papers.
+    """
     def __init__(self, paper_list: PaperList, papers_per_page=30):
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
+        self.sort_method = "hot"              # Default sort method
+        self.current_search_query = ""        # Initialize with no search query
+        self.top_time_frame = "all time"      # Default time frame for "Top" sorting
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
         return score
     def sort_papers(self):
+        """
+        Sorts the papers based on the current sort method and search query.
+        """
         df = self.paper_list.df_raw.copy()
         # Apply search filter if a search query exists
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
+            df_sorted = df.sort_values(by='date', ascending=False)  # Sort by 'date'
+        elif self.sort_method == "top":
+            # Filter based on the selected time frame
+            now = datetime.datetime.now(timezone.utc)
+            if self.top_time_frame == "day":
+                time_threshold = now - datetime.timedelta(days=1)
+            elif self.top_time_frame == "week":
+                time_threshold = now - datetime.timedelta(weeks=1)
+            elif self.top_time_frame == "month":
+                time_threshold = now - datetime.timedelta(days=30)
+            elif self.top_time_frame == "year":
+                time_threshold = now - datetime.timedelta(days=365)
+            elif self.top_time_frame == "all time":
+                time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
+            else:
+                time_threshold = datetime.datetime.min.replace(tzinfo=timezone.utc)
+            # Convert 'date' column to datetime
+            df_sorted = df.copy()
+            df_sorted['date_parsed'] = pd.to_datetime(df_sorted['date'], errors='coerce').dt.tz_localize(timezone.utc)
+            df_sorted = df_sorted[df_sorted['date_parsed'] >= time_threshold]
+            df_sorted = df_sorted.sort_values(by='upvotes', ascending=False).drop(columns=['date_parsed'])
         else:
             df_sorted = df
         self.total_pages = max((len(self.paper_list.df_raw) + self.papers_per_page - 1) // self.papers_per_page, 1)
         self.current_page = 1
+    def set_sort_method(self, method, time_frame=None):
+        """
+        Sets the sort method ('hot', 'new', 'top') and re-sorts the papers.
+        If 'top' is selected, also sets the time frame.
+        """
+        if method not in ["hot", "new", "top"]:
             method = "hot"
         print(f"Setting sort method to: {method}")
         self.sort_method = method
+        if method == "top" and time_frame:
+            self.top_time_frame = time_frame.lower()
+            print(f"Setting top time frame to: {self.top_time_frame}")
         self.sort_papers()
         return True  # Assume success
     def set_search_query(self, query: str):
+        """
+        Sets the current search query and re-sorts the papers.
+        """
         print(f"Setting search query to: {query}")
         self.current_search_query = query
         self.sort_papers()
         return True  # Assume success
     def get_current_page_papers(self) -> str:
+        """
+        Retrieves the HTML string of the current page's papers.
+        """
         start = (self.current_page - 1) * self.papers_per_page
         end = start + self.papers_per_page
         current_papers = self.paper_list.df_prettified.iloc[start:end]
         """
     def format_paper(self, row, rank):
+        """
+        Formats a single paper entry into HTML.
+        """
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
         """
     def next_page(self) -> str:
+        """
+        Navigates to the next page if possible.
+        """
         if self.current_page < self.total_pages:
             self.current_page += 1
         return self.get_current_page_papers()
     def prev_page(self) -> str:
+        """
+        Navigates to the previous page if possible.
+        """
         if self.current_page > 1:
             self.current_page -= 1
         return self.get_current_page_papers()
     def refresh(self) -> str:
+        """
+        Refreshes the current list of papers.
+        """
         self.sort_papers()
         return self.get_current_page_papers()
 # Initialize PaperList and PaperManager
 def initialize_paper_manager() -> str:
+    """
+    Initializes the PaperList and PaperManager with the current DataFrame.
+    """
     df = get_df()
     paper_list = PaperList(df)
     manager = PaperManager(paper_list)
 def setup_paper_manager():
+    """
+    Sets up the global PaperManager instance.
+    """
     global paper_manager
     df = get_df()
     paper_list = PaperList(df)
 def update_paper_manager() -> str:
+    """
+    Updates the global PaperManager with the latest DataFrame.
+    """
     global paper_manager
     df = get_df()
     paper_manager.paper_list = PaperList(df)
 # --- Gradio Interface Functions ---
+def change_sort_method_ui(method: str, time_frame: str = "all time") -> str:
+    """
+    Changes the sort method and, if 'top' is selected, sets the time frame.
+    """
+    if method.lower() == "top":
+        paper_manager.set_sort_method(method.lower(), time_frame)
+    else:
+        paper_manager.set_sort_method(method.lower())
     return paper_manager.get_current_page_papers()
 def refresh_papers_ui() -> str:
+    """
+    Refreshes the paper list.
+    """
     return paper_manager.refresh()
 def search_papers_ui(query: str) -> str:
+    """
+    Searches for papers based on the title search query.
+    """
     paper_manager.set_search_query(query)
     return paper_manager.get_current_page_papers()
+def clear_search_ui() -> str:
+    """
+    Clears the current search query and refreshes the paper list.
+    """
+    paper_manager.set_search_query("")
+    return paper_manager.get_current_page_papers()
 # --- CSS Styling ---
 css = """
 }
 """
 # --- Initialize Gradio Blocks ---
 demo = gr.Blocks(css=css)
                 </tr>
             </table>
             """)
+        # Search Bar and Clear Search Button
         with gr.Row():
             search_box = gr.Textbox(
                 label="Search Papers by Title",
                 interactive=True
             )
             search_button = gr.Button("Search")
+            clear_search_button = gr.Button("Clear Search")
+        # Sort Options and Time Frame (conditionally visible)
         with gr.Row():
             sort_radio = gr.Radio(
+                choices=["Hot", "New", "Top"],
                 value="Hot",
                 label="Sort By",
                 interactive=True
             )
+            time_frame_dropdown = gr.Dropdown(
+                choices=["day", "week", "month", "year", "all time"],
+                value="all time",
+                label="Time Frame for Top",
+                visible=False,
+                interactive=True
+            )
         # Paper list
         paper_list = gr.HTML()
         # Navigation Buttons
     # Sort option change
     sort_radio.change(
+        fn=lambda method: method.lower(),
         inputs=[sort_radio],
+        outputs=None,
+        _js="""
+        (method) => {
+            if (method === 'top') {
+                document.querySelector('[label="Time Frame for Top"]').style.display = 'block';
+            } else {
+                document.querySelector('[label="Time Frame for Top"]').style.display = 'none';
+            }
+            return method;
+        }
+        """
+    )
+    sort_radio.change(
+        fn=change_sort_method_ui,
+        inputs=[sort_radio, time_frame_dropdown],
         outputs=[paper_list]
     )
         outputs=[paper_list]
     )
+    # Clear search functionality
+    clear_search_button.click(
+        fn=clear_search_ui,
+        inputs=None,
+        outputs=[paper_list]
+    )
+    # Footer
+    gr.Markdown("""
+    Related useful Spaces:
+    - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
+    - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
+    - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
+    """)
 # --- Launch the App ---