Update app.py
app.py CHANGED
@@ -11,8 +11,6 @@ import re
 import logging
 import os
 import sys
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import threading
 from queue import Queue, Empty
 import json
@@ -75,14 +73,14 @@ CATEGORIES = [
     "Uncategorized",
 ]
 
-# Set up
-
+# Set up OpenAI API key and base URL
+OPENAI_API_KEY = os.getenv('GROQ_API_KEY')  # Ensure this environment variable is set correctly
 
-if not
+if not OPENAI_API_KEY:
     logger.error("GROQ_API_KEY environment variable not set.")
 
-openai.api_key =
-openai.api_base = "https://api.groq.com/openai/v1"
+openai.api_key = OPENAI_API_KEY
+openai.api_base = "https://api.groq.com/openai/v1"  # Ensure this is the correct base URL for your API
 
 # Initialize global variables for rate limiting
 api_lock = threading.Lock()
@@ -128,6 +126,111 @@ tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
 # Queue for LLM tasks
 llm_queue = Queue()
 
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
+
+def extract_main_content(soup):
+    """
+    Extract the main content from a webpage while filtering out boilerplate content.
+    """
+    if not soup:
+        return ""
+
+    # Remove unwanted elements
+    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
+        element.decompose()
+
+    # Extract text from <p> tags
+    p_tags = soup.find_all('p')
+    if p_tags:
+        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
+    else:
+        # Fallback to body content
+        content = soup.get_text(separator=' ', strip=True)
+
+    # Clean up the text
+    content = re.sub(r'\s+', ' ', content)
+
+    # Truncate content to a reasonable length (e.g., 1500 words)
+    words = content.split()
+    if len(words) > 1500:
+        content = ' '.join(words[:1500])
+
+    return content
+
+def get_page_metadata(soup):
+    """
+    Extract metadata from the webpage including title, description, and keywords.
+    """
+    metadata = {
+        'title': '',
+        'description': '',
+        'keywords': ''
+    }
+
+    if not soup:
+        return metadata
+
+    # Get title
+    title_tag = soup.find('title')
+    if title_tag and title_tag.string:
+        metadata['title'] = title_tag.string.strip()
+
+    # Get meta description
+    meta_desc = (
+        soup.find('meta', attrs={'name': 'description'}) or
+        soup.find('meta', attrs={'property': 'og:description'}) or
+        soup.find('meta', attrs={'name': 'twitter:description'})
+    )
+    if meta_desc:
+        metadata['description'] = meta_desc.get('content', '').strip()
+
+    # Get meta keywords
+    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
+    if meta_keywords:
+        metadata['keywords'] = meta_keywords.get('content', '').strip()
+
+    # Get OG title if main title is empty
+    if not metadata['title']:
+        og_title = soup.find('meta', attrs={'property': 'og:title'})
+        if og_title:
+            metadata['title'] = og_title.get('content', '').strip()
+
+    return metadata
+
 def llm_worker():
     """
     Worker thread to process LLM tasks from the queue while respecting rate limits.
@@ -214,7 +317,7 @@ Please provide your response in the following JSON format:
 """
 
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
@@ -266,136 +369,12 @@ Please provide your response in the following JSON format:
     finally:
         llm_queue.task_done()
 
-def categorize_based_on_summary(summary, url):
-    """
-    Assign category based on keywords in the summary or URL.
-    """
-    summary_lower = summary.lower()
-    url_lower = url.lower()
-    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
-        return 'Technology'
-    elif 'news' in summary_lower or 'media' in summary_lower:
-        return 'News and Media'
-    elif 'education' in summary_lower or 'learning' in summary_lower:
-        return 'Education and Learning'
-    # Add more conditions as needed
-    else:
-        return 'Uncategorized'
-
-def validate_category(bookmark):
-    """
-    Further validate and adjust the category if needed.
-    """
-    # Example: Specific cases based on URL
-    url_lower = bookmark['url'].lower()
-    if 'facebook' in url_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'aws.amazon.com' in url_lower:
-        return 'Technology'
-    # Add more specific cases as needed
-    else:
-        return bookmark['category']
-
-def extract_main_content(soup):
-    """
-    Extract the main content from a webpage while filtering out boilerplate content.
-    """
-    if not soup:
-        return ""
-
-    # Remove unwanted elements
-    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
-        element.decompose()
-
-    # Extract text from <p> tags
-    p_tags = soup.find_all('p')
-    if p_tags:
-        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
-    else:
-        # Fallback to body content
-        content = soup.get_text(separator=' ', strip=True)
-
-    # Clean up the text
-    content = re.sub(r'\s+', ' ', content)
-
-    # Truncate content to a reasonable length (e.g., 1500 words)
-    words = content.split()
-    if len(words) > 1500:
-        content = ' '.join(words[:1500])
-
-    return content
-
-def get_page_metadata(soup):
-    """
-    Extract metadata from the webpage including title, description, and keywords.
-    """
-    metadata = {
-        'title': '',
-        'description': '',
-        'keywords': ''
-    }
-
-    if not soup:
-        return metadata
-
-    # Get title
-    title_tag = soup.find('title')
-    if title_tag and title_tag.string:
-        metadata['title'] = title_tag.string.strip()
-
-    # Get meta description
-    meta_desc = (
-        soup.find('meta', attrs={'name': 'description'}) or
-        soup.find('meta', attrs={'property': 'og:description'}) or
-        soup.find('meta', attrs={'name': 'twitter:description'})
-    )
-    if meta_desc:
-        metadata['description'] = meta_desc.get('content', '').strip()
-
-    # Get meta keywords
-    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
-    if meta_keywords:
-        metadata['keywords'] = meta_keywords.get('content', '').strip()
-
-    # Get OG title if main title is empty
-    if not metadata['title']:
-        og_title = soup.find('meta', attrs={'property': 'og:title'})
-        if og_title:
-            metadata['title'] = og_title.get('content', '').strip()
-
-    return metadata
-
 def generate_summary_and_assign_category(bookmark):
     """
-
-
-    """
-
-    description = bookmark.get('description', '').strip()
-    title = bookmark.get('title', '').strip()
-
-    if description:
-        # Use description as summary
-        bookmark['summary'] = description
-        # Assign category based on description or title
-        bookmark['category'] = categorize_based_on_summary(description, bookmark['url'])
-        logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
-    elif title:
-        # Use title as summary
-        bookmark['summary'] = title
-        # Assign category based on title
-        bookmark['category'] = categorize_based_on_summary(title, bookmark['url'])
-        logger.info(f"Summary derived from title for {bookmark.get('url')}")
-    else:
-        # Enqueue for LLM processing
-        logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
-        llm_queue.put(bookmark)
+    Enqueue bookmarks for LLM processing.
+    """
+    logger.info(f"Enqueuing bookmark for LLM processing: {bookmark.get('url')}")
+    llm_queue.put(bookmark)
 
 def parse_bookmarks(file_content):
     """
@@ -444,15 +423,17 @@ def fetch_url_info(bookmark):
 
         if response.status_code >= 500:
             bookmark['dead_link'] = True
-            bookmark['description'] = ''
             bookmark['html_content'] = ''
+            bookmark['description'] = ''
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
-
+            # Extract description from metadata
+            soup = BeautifulSoup(content, 'html.parser')
+            metadata = get_page_metadata(soup)
+            bookmark['description'] = metadata.get('description', '')
             logger.info(f"Fetched information for {url}")
-
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
@@ -511,19 +492,21 @@ def display_bookmarks():
             status = "❌ Dead Link"
             card_style = "border: 2px solid red;"
             text_style = "color: white;"
+            summary = 'No summary available.'
         elif bookmark.get('slow_link'):
             status = "⏳ Slow Response"
             card_style = "border: 2px solid orange;"
             text_style = "color: white;"
+            summary = bookmark.get('summary', 'No summary available.')
         else:
             status = "✅ Active"
             card_style = "border: 2px solid green;"
             text_style = "color: white;"
+            summary = bookmark.get('summary', 'No summary available.')
 
         title = bookmark['title']
         url = bookmark['url']
         etag = bookmark.get('etag', 'N/A')
-        summary = bookmark.get('summary', '')
         category = bookmark.get('category', 'Uncategorized')
 
         # Escape HTML content to prevent XSS attacks
@@ -585,7 +568,7 @@ def process_uploaded_file(file, state_bookmarks):
         executor.map(fetch_url_info, bookmarks)
 
     # Process bookmarks for summary and category
-    logger.info("
+    logger.info("Enqueuing bookmarks for LLM processing")
     for bookmark in bookmarks:
         generate_summary_and_assign_category(bookmark)
 
@@ -750,7 +733,7 @@ Provide a concise and helpful response.
 """
 
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
@@ -765,8 +748,8 @@ Provide a concise and helpful response.
         return chat_history
 
     except openai.error.RateLimitError as e:
-        wait_time = int(e.headers.get("Retry-After",
-        logger.warning(f"
+        wait_time = int(e.headers.get("Retry-After", 5))
+        logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
         time.sleep(wait_time)
         return chatbot_response(user_query, chat_history)
     except Exception as e: