siddhartharya committed on
Commit
44e9a1d
·
verified ·
1 Parent(s): 2e66ec2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -106
app.py CHANGED
@@ -15,6 +15,7 @@ import concurrent.futures
15
  from concurrent.futures import ThreadPoolExecutor
16
  import threading
17
  from queue import Queue, Empty
 
18
 
19
  # Import OpenAI library
20
  import openai
@@ -145,7 +146,9 @@ def llm_worker():
145
  try:
146
  # Rate Limiting
147
  rpm_bucket.wait_for_token()
148
- tpm_bucket.wait_for_token(tokens=150) # Assuming max_tokens=150 per request
 
 
149
 
150
  html_content = bookmark.get('html_content', '')
151
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -186,8 +189,11 @@ Provide:
186
  Categories:
187
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
188
  Format:
189
- Summary: [Your summary]
190
- Category: [One category]
 
 
 
191
  """
192
  else:
193
  prompt = f"""
@@ -200,24 +206,19 @@ Provide:
200
  Categories:
201
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
202
  Format:
203
- Summary: [Your summary]
204
- Category: [One category]
 
 
 
205
  """
206
 
207
- def estimate_tokens(text):
208
- return len(text) / 4 # Approximation
209
-
210
- prompt_tokens = estimate_tokens(prompt)
211
- max_tokens = 150
212
- total_tokens = prompt_tokens + max_tokens
213
-
214
- # Prepare the prompt with token estimation
215
  response = openai.ChatCompletion.create(
216
  model='llama-3.1-70b-versatile',
217
  messages=[
218
  {"role": "user", "content": prompt}
219
  ],
220
- max_tokens=int(max_tokens),
221
  temperature=0.5,
222
  )
223
 
@@ -225,41 +226,31 @@ Category: [One category]
225
  if not content:
226
  raise ValueError("Empty response received from the model.")
227
 
228
- summary_match = re.search(r"Summary:\s*(.*)", content, re.IGNORECASE)
229
- category_match = re.search(r"Category:\s*(.*)", content, re.IGNORECASE)
 
 
 
230
 
231
- # Extract summary
232
- if summary_match:
233
- summary = summary_match.group(1).strip()
234
- if summary:
235
- bookmark['summary'] = summary
236
- else:
237
- # For dead links, only set summary if it's a slow link
238
- if bookmark.get('slow_link', False):
239
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
240
- else:
241
- # For dead links without summary, do not set 'summary'
242
- bookmark['summary'] = ''
243
- else:
244
- if bookmark.get('slow_link', False):
245
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
246
  else:
247
- bookmark['summary'] = ''
 
248
 
249
- # Extract category
250
- if category_match:
251
- category = category_match.group(1).strip().strip('"')
252
- bookmark['category'] = category if category in CATEGORIES else 'Uncategorized'
253
- else:
254
- bookmark['category'] = 'Uncategorized'
255
 
256
- # Simple keyword-based validation
257
- summary_lower = bookmark.get('summary', '').lower()
258
- url_lower = bookmark['url'].lower()
259
- if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
260
- bookmark['category'] = 'Social Media'
261
- elif 'wikipedia' in url_lower:
262
- bookmark['category'] = 'Reference and Knowledge Bases'
263
 
264
  logger.info("Successfully generated summary and assigned category")
265
  except openai.error.RateLimitError as e:
@@ -269,21 +260,47 @@ Category: [One category]
269
  time.sleep(60) # Wait before retrying
270
  except Exception as e:
271
  logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
272
- # For slow links, provide a summary from metadata or title
273
- if bookmark.get('slow_link', False):
274
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
275
- # For dead links, attempt to set summary; if not possible, leave it unset
276
- elif bookmark.get('dead_link', False):
277
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or ''
278
- else:
279
- bookmark['summary'] = 'No summary available.'
280
  bookmark['category'] = 'Uncategorized'
281
  finally:
282
  llm_queue.task_done()
283
 
284
- # Start the LLM worker thread
285
- llm_thread = threading.Thread(target=llm_worker, daemon=True)
286
- llm_thread.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
  def extract_main_content(soup):
289
  """
@@ -356,7 +373,7 @@ def get_page_metadata(soup):
356
 
357
  def generate_summary_and_assign_category(bookmark):
358
  """
359
- Generate a concise summary and assign a category.
360
  This function decides whether to use metadata or enqueue an LLM call.
361
  """
362
  # Check if metadata can provide a summary
@@ -367,32 +384,19 @@ def generate_summary_and_assign_category(bookmark):
367
  # Use description as summary
368
  bookmark['summary'] = description
369
  # Assign category based on description or title
370
- assign_category_based_on_summary(bookmark)
371
  logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
372
  elif title:
373
  # Use title as summary
374
  bookmark['summary'] = title
375
  # Assign category based on title
376
- assign_category_based_on_summary(bookmark)
377
  logger.info(f"Summary derived from title for {bookmark.get('url')}")
378
  else:
379
  # Enqueue for LLM processing
380
  logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
381
  llm_queue.put(bookmark)
382
 
383
- def assign_category_based_on_summary(bookmark):
384
- """
385
- Assign category based on simple keyword matching in the summary.
386
- """
387
- summary_lower = bookmark.get('summary', '').lower()
388
- url_lower = bookmark['url'].lower()
389
- if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
390
- bookmark['category'] = 'Social Media'
391
- elif 'wikipedia' in url_lower:
392
- bookmark['category'] = 'Reference and Knowledge Bases'
393
- else:
394
- bookmark['category'] = 'Uncategorized'
395
-
396
  def parse_bookmarks(file_content):
397
  """
398
  Parse bookmarks from HTML file.
@@ -440,17 +444,20 @@ def fetch_url_info(bookmark):
440
 
441
  if response.status_code >= 500:
442
  bookmark['dead_link'] = True
443
- bookmark['html_content'] = content # Keep content to extract metadata if possible
 
444
  logger.warning(f"Dead link detected: {url} with status {response.status_code}")
445
  else:
446
  bookmark['dead_link'] = False
447
  bookmark['html_content'] = content
 
448
  logger.info(f"Fetched information for {url}")
449
 
450
  except requests.exceptions.Timeout:
451
  bookmark['dead_link'] = False
452
  bookmark['etag'] = 'N/A'
453
  bookmark['status_code'] = 'Timeout'
 
454
  bookmark['html_content'] = ''
455
  bookmark['slow_link'] = True
456
  logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
@@ -458,22 +465,10 @@ def fetch_url_info(bookmark):
458
  bookmark['dead_link'] = True
459
  bookmark['etag'] = 'N/A'
460
  bookmark['status_code'] = 'Error'
 
461
  bookmark['html_content'] = ''
462
  logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
463
  finally:
464
- # Extract meta description for dead links if content is available
465
- if bookmark.get('dead_link', False) and bookmark.get('html_content'):
466
- soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
467
- metadata = get_page_metadata(soup)
468
- bookmark['description'] = metadata.get('description', '')
469
- elif not bookmark.get('dead_link', False):
470
- # For active and slow links, attempt to extract description
471
- soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
472
- metadata = get_page_metadata(soup)
473
- bookmark['description'] = metadata.get('description', '')
474
- else:
475
- bookmark['description'] = ''
476
-
477
  with lock:
478
  fetch_cache[url] = {
479
  'etag': bookmark.get('etag'),
@@ -491,8 +486,7 @@ def vectorize_and_index(bookmarks_list):
491
  global faiss_index
492
  logger.info("Vectorizing summaries and building FAISS index")
493
  try:
494
- # Safely access 'summary' using .get() to avoid KeyError
495
- summaries = [bookmark.get('summary', '') for bookmark in bookmarks_list]
496
  embeddings = embedding_model.encode(summaries)
497
  dimension = embeddings.shape[1]
498
  index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
@@ -517,26 +511,19 @@ def display_bookmarks():
517
  status = "❌ Dead Link"
518
  card_style = "border: 2px solid red;"
519
  text_style = "color: white;"
520
- # For dead links, use 'summary' if available
521
- summary = bookmark.get('summary', '')
522
- if not summary:
523
- # Provide a default message or leave it empty
524
- summary = 'No summary available.'
525
  elif bookmark.get('slow_link'):
526
  status = "⏳ Slow Response"
527
  card_style = "border: 2px solid orange;"
528
  text_style = "color: white;"
529
- # For slow links, always provide a summary
530
- summary = bookmark.get('summary', 'No summary available.')
531
  else:
532
  status = "✅ Active"
533
  card_style = "border: 2px solid green;"
534
  text_style = "color: white;"
535
- summary = bookmark.get('summary', 'No summary available.')
536
 
537
  title = bookmark['title']
538
  url = bookmark['url']
539
  etag = bookmark.get('etag', 'N/A')
 
540
  category = bookmark.get('category', 'Uncategorized')
541
 
542
  # Escape HTML content to prevent XSS attacks
@@ -762,19 +749,12 @@ Bookmarks:
762
  Provide a concise and helpful response.
763
  """
764
 
765
- def estimate_tokens(text):
766
- return len(text) / 4 # Approximation
767
-
768
- prompt_tokens = estimate_tokens(prompt)
769
- max_tokens = 300
770
- total_tokens = prompt_tokens + max_tokens
771
-
772
  response = openai.ChatCompletion.create(
773
  model='llama-3.1-70b-versatile',
774
  messages=[
775
  {"role": "user", "content": prompt}
776
  ],
777
- max_tokens=int(max_tokens),
778
  temperature=0.7,
779
  )
780
 
@@ -971,8 +951,12 @@ Navigate through the tabs to explore each feature in detail.
971
  logger.info("Launching Gradio app")
972
  demo.launch(debug=True)
973
  except Exception as e:
974
- logger.error(f"Error building the app: {e}", exc_info=True)
975
- print(f"Error building the app: {e}")
976
 
977
  if __name__ == "__main__":
 
 
 
 
978
  build_app()
 
15
  from concurrent.futures import ThreadPoolExecutor
16
  import threading
17
  from queue import Queue, Empty
18
+ import json
19
 
20
  # Import OpenAI library
21
  import openai
 
146
  try:
147
  # Rate Limiting
148
  rpm_bucket.wait_for_token()
149
+ # Estimate tokens: prompt + max_tokens
150
+ # Here, we assume max_tokens=150
151
+ tpm_bucket.wait_for_token(tokens=150)
152
 
153
  html_content = bookmark.get('html_content', '')
154
  soup = BeautifulSoup(html_content, 'html.parser')
 
189
  Categories:
190
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
191
  Format:
192
+ Please provide your response in the following JSON format:
193
+ {{
194
+ "summary": "Your summary here.",
195
+ "category": "One category from the list."
196
+ }}
197
  """
198
  else:
199
  prompt = f"""
 
206
  Categories:
207
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
208
  Format:
209
+ Please provide your response in the following JSON format:
210
+ {{
211
+ "summary": "Your summary here.",
212
+ "category": "One category from the list."
213
+ }}
214
  """
215
 
 
 
 
 
 
 
 
 
216
  response = openai.ChatCompletion.create(
217
  model='llama-3.1-70b-versatile',
218
  messages=[
219
  {"role": "user", "content": prompt}
220
  ],
221
+ max_tokens=150,
222
  temperature=0.5,
223
  )
224
 
 
226
  if not content:
227
  raise ValueError("Empty response received from the model.")
228
 
229
+ # Parse JSON response
230
+ try:
231
+ json_response = json.loads(content)
232
+ summary = json_response.get('summary', '').strip()
233
+ category = json_response.get('category', '').strip()
234
 
235
+ # Validate and assign
236
+ if not summary:
237
+ summary = metadata.get('description') or metadata.get('title') or 'No summary available.'
238
+ bookmark['summary'] = summary
239
+
240
+ if category in CATEGORIES:
241
+ bookmark['category'] = category
 
 
 
 
 
 
 
 
242
  else:
243
+ # Fallback to keyword-based categorization
244
+ bookmark['category'] = categorize_based_on_summary(summary, bookmark['url'])
245
 
246
+ except json.JSONDecodeError:
247
+ logger.error(f"Failed to parse JSON response for {bookmark.get('url')}. Using fallback methods.")
248
+ # Fallback methods
249
+ bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
250
+ bookmark['category'] = categorize_based_on_summary(bookmark['summary'], bookmark['url'])
 
251
 
252
+ # Additional keyword-based validation
253
+ bookmark['category'] = validate_category(bookmark)
 
 
 
 
 
254
 
255
  logger.info("Successfully generated summary and assigned category")
256
  except openai.error.RateLimitError as e:
 
260
  time.sleep(60) # Wait before retrying
261
  except Exception as e:
262
  logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
263
+ # Assign default values on failure
264
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
265
  bookmark['category'] = 'Uncategorized'
266
  finally:
267
  llm_queue.task_done()
268
 
269
def categorize_based_on_summary(summary, url):
    """Pick a bookmark category from keyword matches in the summary or URL.

    Rules are evaluated in priority order and the first match wins; when
    nothing matches, 'Uncategorized' is returned.
    """
    summary_lc = summary.lower()
    url_lc = url.lower()
    # (category, keywords searched in summary, keywords searched in URL),
    # checked top to bottom — extend this table to add more rules.
    rules = (
        ('Social Media', ('social media', 'twitter'), ('x.com',)),
        ('Reference and Knowledge Bases', (), ('wikipedia',)),
        ('Technology', ('cloud computing', 'aws'), ()),
        ('News and Media', ('news', 'media'), ()),
        ('Education and Learning', ('education', 'learning'), ()),
    )
    for category, summary_keys, url_keys in rules:
        if any(k in summary_lc for k in summary_keys) or any(k in url_lc for k in url_keys):
            return category
    return 'Uncategorized'
288
+
289
def validate_category(bookmark):
    """Override the assigned category for well-known URLs.

    The bookmark URL is checked against a small table of host substrings;
    the first hit wins. When no override applies, the bookmark's existing
    category is kept unchanged.
    """
    url_lc = bookmark['url'].lower()
    # (URL substrings, forced category) — extend with more specific cases as needed.
    overrides = (
        (('facebook', 'x.com'), 'Social Media'),
        (('wikipedia',), 'Reference and Knowledge Bases'),
        (('aws.amazon.com',), 'Technology'),
    )
    for needles, category in overrides:
        if any(needle in url_lc for needle in needles):
            return category
    return bookmark['category']
304
 
305
  def extract_main_content(soup):
306
  """
 
373
 
374
  def generate_summary_and_assign_category(bookmark):
375
  """
376
+ Generate a concise summary and assign a category using a single LLM call.
377
  This function decides whether to use metadata or enqueue an LLM call.
378
  """
379
  # Check if metadata can provide a summary
 
384
  # Use description as summary
385
  bookmark['summary'] = description
386
  # Assign category based on description or title
387
+ bookmark['category'] = categorize_based_on_summary(description, bookmark['url'])
388
  logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
389
  elif title:
390
  # Use title as summary
391
  bookmark['summary'] = title
392
  # Assign category based on title
393
+ bookmark['category'] = categorize_based_on_summary(title, bookmark['url'])
394
  logger.info(f"Summary derived from title for {bookmark.get('url')}")
395
  else:
396
  # Enqueue for LLM processing
397
  logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
398
  llm_queue.put(bookmark)
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  def parse_bookmarks(file_content):
401
  """
402
  Parse bookmarks from HTML file.
 
444
 
445
  if response.status_code >= 500:
446
  bookmark['dead_link'] = True
447
+ bookmark['description'] = ''
448
+ bookmark['html_content'] = ''
449
  logger.warning(f"Dead link detected: {url} with status {response.status_code}")
450
  else:
451
  bookmark['dead_link'] = False
452
  bookmark['html_content'] = content
453
+ bookmark['description'] = ''
454
  logger.info(f"Fetched information for {url}")
455
 
456
  except requests.exceptions.Timeout:
457
  bookmark['dead_link'] = False
458
  bookmark['etag'] = 'N/A'
459
  bookmark['status_code'] = 'Timeout'
460
+ bookmark['description'] = ''
461
  bookmark['html_content'] = ''
462
  bookmark['slow_link'] = True
463
  logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
 
465
  bookmark['dead_link'] = True
466
  bookmark['etag'] = 'N/A'
467
  bookmark['status_code'] = 'Error'
468
+ bookmark['description'] = ''
469
  bookmark['html_content'] = ''
470
  logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
471
  finally:
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  with lock:
473
  fetch_cache[url] = {
474
  'etag': bookmark.get('etag'),
 
486
  global faiss_index
487
  logger.info("Vectorizing summaries and building FAISS index")
488
  try:
489
+ summaries = [bookmark['summary'] for bookmark in bookmarks_list]
 
490
  embeddings = embedding_model.encode(summaries)
491
  dimension = embeddings.shape[1]
492
  index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
 
511
  status = "❌ Dead Link"
512
  card_style = "border: 2px solid red;"
513
  text_style = "color: white;"
 
 
 
 
 
514
  elif bookmark.get('slow_link'):
515
  status = "⏳ Slow Response"
516
  card_style = "border: 2px solid orange;"
517
  text_style = "color: white;"
 
 
518
  else:
519
  status = "✅ Active"
520
  card_style = "border: 2px solid green;"
521
  text_style = "color: white;"
 
522
 
523
  title = bookmark['title']
524
  url = bookmark['url']
525
  etag = bookmark.get('etag', 'N/A')
526
+ summary = bookmark.get('summary', '')
527
  category = bookmark.get('category', 'Uncategorized')
528
 
529
  # Escape HTML content to prevent XSS attacks
 
749
  Provide a concise and helpful response.
750
  """
751
 
 
 
 
 
 
 
 
752
  response = openai.ChatCompletion.create(
753
  model='llama-3.1-70b-versatile',
754
  messages=[
755
  {"role": "user", "content": prompt}
756
  ],
757
+ max_tokens=300,
758
  temperature=0.7,
759
  )
760
 
 
951
  logger.info("Launching Gradio app")
952
  demo.launch(debug=True)
953
  except Exception as e:
954
+ logger.error(f"Error building Gradio app: {e}", exc_info=True)
955
+ print(f"Error building Gradio app: {e}")
956
 
957
  if __name__ == "__main__":
958
+ # Start the LLM worker thread before launching the app
959
+ llm_thread = threading.Thread(target=llm_worker, daemon=True)
960
+ llm_thread.start()
961
+
962
  build_app()