Spaces:

siddhartharya
/

Bookmark-Manager

Running

App Files Files Community

siddhartharya committed on Nov 26, 2024

Commit

2e66ec2

verified ·

1 Parent(s): 2ff005a

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -111

app.py CHANGED Viewed

@@ -8,14 +8,13 @@ import numpy as np
 import requests
 import time
 import re
-import base64
 import logging
 import os
 import sys
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import threading
-from collections import deque
 # Import OpenAI library
 import openai
@@ -105,7 +104,6 @@ class TokenBucket:
         with self.lock:
             now = time.time()
             elapsed = now - self.timestamp
-            # Refill tokens
             refill = elapsed * self.rate
             self.tokens = min(self.capacity, self.tokens + refill)
             self.timestamp = now
@@ -126,93 +124,28 @@ tpm_rate = TPM_LIMIT / 60  # tokens per second
 rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
 tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
-def extract_main_content(soup):
-    """
-    Extract the main content from a webpage while filtering out boilerplate content.
-    """
-    if not soup:
-        return ""
-    # Remove unwanted elements
-    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
-        element.decompose()
-    # Extract text from <p> tags
-    p_tags = soup.find_all('p')
-    if p_tags:
-        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
-    else:
-        # Fallback to body content
-        content = soup.get_text(separator=' ', strip=True)
-    # Clean up the text
-    content = re.sub(r'\s+', ' ', content)
-    # Truncate content to a reasonable length (e.g., 1500 words)
-    words = content.split()
-    if len(words) > 1500:
-        content = ' '.join(words[:1500])
-    return content
-def get_page_metadata(soup):
     """
-    Extract metadata from the webpage including title, description, and keywords.
     """
-    metadata = {
-        'title': '',
-        'description': '',
-        'keywords': ''
-    }
-    if not soup:
-        return metadata
-    # Get title
-    title_tag = soup.find('title')
-    if title_tag and title_tag.string:
-        metadata['title'] = title_tag.string.strip()
-    # Get meta description
-    meta_desc = (
-        soup.find('meta', attrs={'name': 'description'}) or
-        soup.find('meta', attrs={'property': 'og:description'}) or
-        soup.find('meta', attrs={'name': 'twitter:description'})
-    )
-    if meta_desc:
-        metadata['description'] = meta_desc.get('content', '').strip()
-    # Get meta keywords
-    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
-    if meta_keywords:
-        metadata['keywords'] = meta_keywords.get('content', '').strip()
-    # Get OG title if main title is empty
-    if not metadata['title']:
-        og_title = soup.find('meta', attrs={'property': 'og:title'})
-        if og_title:
-            metadata['title'] = og_title.get('content', '').strip()
-    return metadata
-def generate_summary_and_assign_category(bookmark):
-    """
-    Generate a concise summary and assign a category using a single LLM call.
-    For slow links, always provide a summary.
-    For dead links, provide a summary if possible; otherwise, ignore.
-    """
-    logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
-    max_retries = 3
-    retry_count = 0
-    while retry_count < max_retries:
         try:
             # Rate Limiting
             rpm_bucket.wait_for_token()
-            # Estimate tokens: prompt + max_tokens
-            # Here, we assume max_tokens=150
-            tpm_bucket.wait_for_token(tokens=150)
             html_content = bookmark.get('html_content', '')
             soup = BeautifulSoup(html_content, 'html.parser')
@@ -272,17 +205,13 @@ Category: [One category]
 """
             def estimate_tokens(text):
-                return len(text) / 4
             prompt_tokens = estimate_tokens(prompt)
             max_tokens = 150
             total_tokens = prompt_tokens + max_tokens
-            tokens_per_minute = 40000
-            tokens_per_second = tokens_per_minute / 60
-            required_delay = total_tokens / tokens_per_second
-            sleep_time = max(required_delay, 2)
             response = openai.ChatCompletion.create(
                 model='llama-3.1-70b-versatile',
                 messages=[
@@ -333,16 +262,13 @@ Category: [One category]
                 bookmark['category'] = 'Reference and Knowledge Bases'
             logger.info("Successfully generated summary and assigned category")
-            time.sleep(sleep_time)
-            break
         except openai.error.RateLimitError as e:
-            retry_count += 1
-            wait_time = int(e.headers.get("Retry-After", 5))
-            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
-            time.sleep(wait_time)
         except Exception as e:
-            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
             # For slow links, provide a summary from metadata or title
             if bookmark.get('slow_link', False):
                 bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
@@ -352,7 +278,120 @@ Category: [One category]
             else:
                 bookmark['summary'] = 'No summary available.'
             bookmark['category'] = 'Uncategorized'
-            break
 def parse_bookmarks(file_content):
     """
@@ -558,10 +597,14 @@ def process_uploaded_file(file, state_bookmarks):
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
-    # Process bookmarks concurrently with LLM calls
-    logger.info("Processing bookmarks with LLM concurrently")
-    with ThreadPoolExecutor(max_workers=5) as executor:
-        executor.map(generate_summary_and_assign_category, bookmarks)
     try:
         faiss_index = vectorize_and_index(bookmarks)
@@ -690,7 +733,7 @@ def chatbot_response(user_query, chat_history):
         # Rate Limiting
         rpm_bucket.wait_for_token()
-        tpm_bucket.wait_for_token(tokens=300)  # Assuming max_tokens=300
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
@@ -720,17 +763,12 @@ Provide a concise and helpful response.
 """
         def estimate_tokens(text):
-            return len(text) / 4
         prompt_tokens = estimate_tokens(prompt)
         max_tokens = 300
         total_tokens = prompt_tokens + max_tokens
-        tokens_per_minute = 40000
-        tokens_per_second = tokens_per_minute / 60
-        required_delay = total_tokens / tokens_per_second
-        sleep_time = max(required_delay, 2)
         response = openai.ChatCompletion.create(
             model='llama-3.1-70b-versatile',
             messages=[
@@ -742,14 +780,13 @@ Provide a concise and helpful response.
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
-        time.sleep(sleep_time)
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
     except openai.error.RateLimitError as e:
-        wait_time = int(e.headers.get("Retry-After", 5))
-        logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
         time.sleep(wait_time)
         return chatbot_response(user_query, chat_history)
     except Exception as e:

 import requests
 import time
 import re
 import logging
 import os
 import sys
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 import threading
+from queue import Queue, Empty
 # Import OpenAI library
 import openai
         with self.lock:
             now = time.time()
             elapsed = now - self.timestamp
             refill = elapsed * self.rate
             self.tokens = min(self.capacity, self.tokens + refill)
             self.timestamp = now
 rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
 tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
+# Queue for LLM tasks
+llm_queue = Queue()
+def llm_worker():
     """
+    Worker thread to process LLM tasks from the queue while respecting rate limits.
     """
+    logger.info("LLM worker started.")
+    while True:
+        try:
+            bookmark = llm_queue.get(timeout=60)  # Wait for a task
+        except Empty:
+            continue  # No task, continue waiting
+        if bookmark is None:
+            logger.info("LLM worker shutting down.")
+            break  # Exit signal
         try:
             # Rate Limiting
             rpm_bucket.wait_for_token()
+            tpm_bucket.wait_for_token(tokens=150)  # Assuming max_tokens=150 per request
             html_content = bookmark.get('html_content', '')
             soup = BeautifulSoup(html_content, 'html.parser')
 """
             def estimate_tokens(text):
+                return len(text) / 4  # Approximation
             prompt_tokens = estimate_tokens(prompt)
             max_tokens = 150
             total_tokens = prompt_tokens + max_tokens
+            # Prepare the prompt with token estimation
             response = openai.ChatCompletion.create(
                 model='llama-3.1-70b-versatile',
                 messages=[
                 bookmark['category'] = 'Reference and Knowledge Bases'
             logger.info("Successfully generated summary and assigned category")
         except openai.error.RateLimitError as e:
+            logger.warning(f"LLM Rate limit reached while processing {bookmark.get('url')}. Retrying later...")
+            # Re-enqueue the bookmark for retry
+            llm_queue.put(bookmark)
+            time.sleep(60)  # Wait before retrying
         except Exception as e:
+            logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
             # For slow links, provide a summary from metadata or title
             if bookmark.get('slow_link', False):
                 bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
             else:
                 bookmark['summary'] = 'No summary available.'
             bookmark['category'] = 'Uncategorized'
+        finally:
+            llm_queue.task_done()
+# Start the LLM worker thread
+llm_thread = threading.Thread(target=llm_worker, daemon=True)
+llm_thread.start()
+def extract_main_content(soup):
+    """
+    Extract the main content from a webpage while filtering out boilerplate content.
+    """
+    if not soup:
+        return ""
+    # Remove unwanted elements
+    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
+        element.decompose()
+    # Extract text from <p> tags
+    p_tags = soup.find_all('p')
+    if p_tags:
+        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
+    else:
+        # Fallback to body content
+        content = soup.get_text(separator=' ', strip=True)
+    # Clean up the text
+    content = re.sub(r'\s+', ' ', content)
+    # Truncate content to a reasonable length (e.g., 1500 words)
+    words = content.split()
+    if len(words) > 1500:
+        content = ' '.join(words[:1500])
+    return content
+def get_page_metadata(soup):
+    """
+    Extract metadata from the webpage including title, description, and keywords.
+    """
+    metadata = {
+        'title': '',
+        'description': '',
+        'keywords': ''
+    }
+    if not soup:
+        return metadata
+    # Get title
+    title_tag = soup.find('title')
+    if title_tag and title_tag.string:
+        metadata['title'] = title_tag.string.strip()
+    # Get meta description
+    meta_desc = (
+        soup.find('meta', attrs={'name': 'description'}) or
+        soup.find('meta', attrs={'property': 'og:description'}) or
+        soup.find('meta', attrs={'name': 'twitter:description'})
+    )
+    if meta_desc:
+        metadata['description'] = meta_desc.get('content', '').strip()
+    # Get meta keywords
+    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
+    if meta_keywords:
+        metadata['keywords'] = meta_keywords.get('content', '').strip()
+    # Get OG title if main title is empty
+    if not metadata['title']:
+        og_title = soup.find('meta', attrs={'property': 'og:title'})
+        if og_title:
+            metadata['title'] = og_title.get('content', '').strip()
+    return metadata
+def generate_summary_and_assign_category(bookmark):
+    """
+    Generate a concise summary and assign a category.
+    This function decides whether to use metadata or enqueue an LLM call.
+    """
+    # Check if metadata can provide a summary
+    description = bookmark.get('description', '').strip()
+    title = bookmark.get('title', '').strip()
+    if description:
+        # Use description as summary
+        bookmark['summary'] = description
+        # Assign category based on description or title
+        assign_category_based_on_summary(bookmark)
+        logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
+    elif title:
+        # Use title as summary
+        bookmark['summary'] = title
+        # Assign category based on title
+        assign_category_based_on_summary(bookmark)
+        logger.info(f"Summary derived from title for {bookmark.get('url')}")
+    else:
+        # Enqueue for LLM processing
+        logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
+        llm_queue.put(bookmark)
+def assign_category_based_on_summary(bookmark):
+    """
+    Assign category based on simple keyword matching in the summary.
+    """
+    summary_lower = bookmark.get('summary', '').lower()
+    url_lower = bookmark['url'].lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        bookmark['category'] = 'Social Media'
+    elif 'wikipedia' in url_lower:
+        bookmark['category'] = 'Reference and Knowledge Bases'
+    else:
+        bookmark['category'] = 'Uncategorized'
 def parse_bookmarks(file_content):
     """
     with ThreadPoolExecutor(max_workers=10) as executor:
         executor.map(fetch_url_info, bookmarks)
+    # Process bookmarks for summary and category
+    logger.info("Processing bookmarks for summaries and categories")
+    for bookmark in bookmarks:
+        generate_summary_and_assign_category(bookmark)
+    # Wait until all LLM tasks are completed
+    llm_queue.join()
+    logger.info("All LLM tasks have been processed")
     try:
         faiss_index = vectorize_and_index(bookmarks)
         # Rate Limiting
         rpm_bucket.wait_for_token()
+        tpm_bucket.wait_for_token(tokens=300)  # Assuming max_tokens=300 per request
         query_vector = embedding_model.encode([user_query]).astype('float32')
         k = 5
 """
         def estimate_tokens(text):
+            return len(text) / 4  # Approximation
         prompt_tokens = estimate_tokens(prompt)
         max_tokens = 300
         total_tokens = prompt_tokens + max_tokens
         response = openai.ChatCompletion.create(
             model='llama-3.1-70b-versatile',
             messages=[
         answer = response['choices'][0]['message']['content'].strip()
         logger.info("Chatbot response generated")
         chat_history.append({"role": "assistant", "content": answer})
         return chat_history
     except openai.error.RateLimitError as e:
+        wait_time = int(e.headers.get("Retry-After", 60))
+        logger.warning(f"Chatbot Rate limit reached. Waiting for {wait_time} seconds before retrying...")
         time.sleep(wait_time)
         return chatbot_response(user_query, chat_history)
     except Exception as e: