dailypapershackernews-dev

Running

App Files Files Community

akhaliq HF Staff committed on Sep 20, 2024

Commit

45195c8

verified ·

1 Parent(s): ba4e64e

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -29

app.py CHANGED Viewed

@@ -64,9 +64,13 @@ def get_df() -> pd.DataFrame:
         info = row.copy()
         if "abstract" in info:
             del info["abstract"]
-        info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
         paper_info.append(info)
-    return pd.DataFrame(paper_info)
 class Prettifier:
@@ -83,15 +87,6 @@ class Prettifier:
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
-            # Handle authors: list of dicts or list of strings
-            if "authors" in row and isinstance(row["authors"], list):
-                authors = ', '.join([
-                    author.get('name', '') if isinstance(author, dict) else str(author)
-                    for author in row["authors"]
-                ])
-            else:
-                authors = 'Unknown'
             # Handle published_at: original date
             published_at = row["date"]  # Already formatted as "%Y-%m-%d"
@@ -104,7 +99,6 @@ class Prettifier:
                 "published_at": published_at,  # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
-                "authors": authors,  # Include authors
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row["upvotes"],
                 "💬": row["num_comments"],
@@ -120,7 +114,6 @@ class PaperList:
         ["published_at", "str"],     # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
-        ["authors", "str"],          # Added authors
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
@@ -141,23 +134,18 @@ class PaperList:
     def search(
         self,
-        start_date: datetime.datetime,
-        end_date: datetime.datetime,
         title_search_query: str,
-        max_num_to_retrieve: int,
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
-        df["date"] = pd.to_datetime(df["date"], errors='coerce')
-        # Filter by date
-        df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
-        df["date"] = df["date"].dt.strftime("%Y-%m-%d").fillna(datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
-        # Filter by title
         if title_search_query:
             df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
-        # Removed abstract_search_query filtering since ragatouille is no longer used
         # Prettify the DataFrame
         df_prettified = self._prettifier(df).loc[:, self.column_names]
@@ -171,6 +159,7 @@ class PaperManager:
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
         self.sort_method = "hot"  # Default sort method
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
@@ -180,7 +169,7 @@ class PaperManager:
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
-        published_at_str = row.get('date', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))  # **FIX** Changed from 'published_at' to 'date'
         try:
             published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
@@ -197,11 +186,15 @@ class PaperManager:
     def sort_papers(self):
         df = self.paper_list.df_raw.copy()
         if self.sort_method == "hot":
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
-            df_sorted = df.sort_values(by='date', ascending=False)  # **FIX** Changed from 'published_at' to 'date'
         else:
             df_sorted = df
@@ -218,6 +211,12 @@ class PaperManager:
         self.sort_papers()
         return True  # Assume success
     def get_current_page_papers(self) -> str:
         start = (self.current_page - 1) * self.papers_per_page
         end = start + self.papers_per_page
@@ -237,7 +236,6 @@ class PaperManager:
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
-        authors = row.get('authors', 'Unknown')
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
         published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
@@ -260,7 +258,7 @@ class PaperManager:
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
-                authors: {authors} | {time_ago} | <a href="#">{comments} comments</a>
             </td>
         </tr>
         <tr style="height:5px"></tr>
@@ -334,6 +332,11 @@ def refresh_papers_ui() -> str:
     return paper_manager.refresh()
 # --- CSS Styling ---
 css = """
@@ -511,7 +514,15 @@ with demo:
                 </tr>
             </table>
             """)
-        # Sort Options
         with gr.Row():
             sort_radio = gr.Radio(
                 choices=["Hot", "New"],
@@ -556,6 +567,13 @@ with demo:
         outputs=[paper_list]
     )
     # Footer
     gr.Markdown("""
     Related useful Spaces:

         info = row.copy()
         if "abstract" in info:
             del info["abstract"]
         paper_info.append(info)
+    df_prepared = pd.DataFrame(paper_info)
+    # Add 'paper_page' links
+    df_prepared["paper_page"] = df_prepared["arxiv_id"].apply(lambda x: f"https://huggingface.co/papers/{x}")
+    return df_prepared
 class Prettifier:
     def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
         new_rows = []
         for _, row in df.iterrows():
             # Handle published_at: original date
             published_at = row["date"]  # Already formatted as "%Y-%m-%d"
                 "published_at": published_at,  # For internal calculations
                 "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                 "title": row["title"],
                 "github": Prettifier.get_github_link(row.get("github", "")),
                 "👍": row["upvotes"],
                 "💬": row["num_comments"],
         ["published_at", "str"],     # For internal use
         ["paper_page", "markdown"],
         ["title", "str"],
         ["github", "markdown"],
         ["👍", "number"],
         ["💬", "number"],
     def search(
         self,
         title_search_query: str,
+        max_num_to_retrieve: int = 1000,  # Set a high default to include all if not specified
     ) -> pd.DataFrame:
         df = self.df_raw.copy()
+        # Filter by title if search query is provided
         if title_search_query:
             df = df[df["title"].str.contains(title_search_query, case=False, na=False)]
+        # Limit the number of papers to retrieve if max_num_to_retrieve is set
+        if max_num_to_retrieve:
+            df = df.head(max_num_to_retrieve)
         # Prettify the DataFrame
         df_prettified = self._prettifier(df).loc[:, self.column_names]
         self.paper_list = paper_list
         self.papers_per_page = papers_per_page
         self.sort_method = "hot"  # Default sort method
+        self.current_search_query = ""  # Initialize with no search query
         self.sort_papers()
         # 'current_page' and 'total_pages' are set in 'sort_papers()'
         This mimics the "hotness" algorithm used by platforms like Hacker News.
         """
         upvotes = row.get('👍', 0)
+        published_at_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))  # **FIX** Changed from 'date' to 'published_at'
         try:
             published_time = datetime.datetime.strptime(published_at_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
         except ValueError:
     def sort_papers(self):
         df = self.paper_list.df_raw.copy()
+        # Apply search filter if a search query exists
+        if self.current_search_query:
+            df = df[df["title"].str.contains(self.current_search_query, case=False, na=False)]
         if self.sort_method == "hot":
             df['score'] = df.apply(self.calculate_score, axis=1)
             df_sorted = df.sort_values(by='score', ascending=False).drop(columns=['score'])
         elif self.sort_method == "new":
+            df_sorted = df.sort_values(by='published_at', ascending=False)  # **FIX** Changed from 'date' to 'published_at'
         else:
             df_sorted = df
         self.sort_papers()
         return True  # Assume success
+    def set_search_query(self, query: str):
+        print(f"Setting search query to: {query}")
+        self.current_search_query = query
+        self.sort_papers()
+        return True  # Assume success
     def get_current_page_papers(self) -> str:
         start = (self.current_page - 1) * self.papers_per_page
         end = start + self.papers_per_page
         title = row.get('title', 'No title')
         paper_id = row.get('arxiv_id', '')
         url = f"https://huggingface.co/papers/{paper_id}"
         upvotes = row.get('👍', 0)
         comments = row.get('💬', 0)
         published_time_str = row.get('published_at', datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d"))
             <td colspan="1"></td>
             <td class="subtext">
                 <span class="score">{upvotes} upvotes</span><br>
+                {time_ago} | <a href="#">{comments} comments</a>
             </td>
         </tr>
         <tr style="height:5px"></tr>
     return paper_manager.refresh()
+def search_papers_ui(query: str) -> str:
+    paper_manager.set_search_query(query)
+    return paper_manager.get_current_page_papers()
 # --- CSS Styling ---
 css = """
                 </tr>
             </table>
             """)
+        # Search Bar and Sort Options
+        with gr.Row():
+            search_box = gr.Textbox(
+                label="Search Papers by Title",
+                placeholder="Enter keywords to search...",
+                lines=1,
+                interactive=True
+            )
+            search_button = gr.Button("Search")
         with gr.Row():
             sort_radio = gr.Radio(
                 choices=["Hot", "New"],
         outputs=[paper_list]
     )
+    # Search functionality
+    search_button.click(
+        fn=search_papers_ui,
+        inputs=[search_box],
+        outputs=[paper_list]
+    )
     # Footer
     gr.Markdown("""
     Related useful Spaces: