siddhartharya committed on
Commit
3b1a6a1
·
verified ·
1 Parent(s): b8183dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -103
app.py CHANGED
@@ -153,26 +153,18 @@ def generate_summary(bookmark):
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
155
  try:
156
- # Get the HTML soup object from the bookmark
157
- soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
-
159
- # Extract metadata and main content
160
- metadata = get_page_metadata(soup)
161
- main_content = extract_main_content(soup)
162
-
163
- # Prepare content for the prompt
164
- available_content = []
165
- if metadata['title']:
166
- available_content.append(f"Title: {metadata['title']}")
167
- if metadata['description']:
168
- available_content.append(f"Description: {metadata['description']}")
169
- if metadata['keywords']:
170
- available_content.append(f"Keywords: {metadata['keywords']}")
171
- if main_content:
172
- available_content.append(f"Main Content: {main_content}")
173
-
174
- # If content is insufficient, instruct the LLM to use prior knowledge
175
- if not available_content or len(' '.join(available_content).split()) < 50:
176
  prompt = f"""
177
  You are a knowledgeable assistant.
178
 
@@ -188,19 +180,23 @@ Focus on:
188
  Be factual and objective.
189
  """
190
  else:
191
- # Estimate token count and trim content if necessary
192
- max_total_tokens = 8000 # Adjust based on model's maximum context length
193
- prompt_tokens_estimate = len(' '.join(available_content).split()) + 200 # 200 tokens reserved for response
194
- if prompt_tokens_estimate > max_total_tokens:
195
- # Trim main content
196
- allowable_content_tokens = max_total_tokens - 200 # Reserve 200 tokens for response
197
- main_content_tokens = len(main_content.split())
198
- if main_content_tokens > allowable_content_tokens:
199
- main_content = ' '.join(main_content.split()[:allowable_content_tokens])
200
- logger.info("Trimmed main content to fit within token limits.")
201
-
202
- # Update available content
203
- available_content[-1] = f"Main Content: {main_content}"
 
 
 
 
204
 
205
  # Construct the prompt
206
  prompt = f"""
@@ -218,12 +214,12 @@ Be factual and objective.
218
 
219
  # Call the LLM via Groq Cloud API
220
  response = openai.ChatCompletion.create(
221
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
222
  messages=[
223
  {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
224
  {"role": "user", "content": prompt}
225
  ],
226
- max_tokens=200, # Adjust as necessary to accommodate longer summaries
227
  temperature=0.5,
228
  )
229
 
@@ -234,48 +230,7 @@ Be factual and objective.
234
 
235
  except Exception as e:
236
  logger.error(f"Error generating summary: {e}", exc_info=True)
237
- # Fallback mechanisms
238
- if metadata['description']:
239
- logger.info("Falling back to meta description")
240
- bookmark['summary'] = metadata['description']
241
- elif main_content:
242
- logger.info("Falling back to main content")
243
- bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
244
- elif metadata['title']:
245
- logger.info("Falling back to title")
246
- bookmark['summary'] = metadata['title']
247
- else:
248
- # If all else fails, prompt the LLM to use prior knowledge
249
- prompt = f"""
250
- You are a knowledgeable assistant.
251
-
252
- The user provided a URL: {bookmark.get('url')}
253
-
254
- Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
255
-
256
- Focus on:
257
- - The main purpose or topic of the website.
258
- - Key information or features.
259
- - Target audience or use case (if apparent).
260
-
261
- Be factual and objective.
262
- """
263
- try:
264
- response = openai.ChatCompletion.create(
265
- model='llama3-8b-8192',
266
- messages=[
267
- {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
268
- {"role": "user", "content": prompt}
269
- ],
270
- max_tokens=200,
271
- temperature=0.5,
272
- )
273
- summary = response['choices'][0]['message']['content'].strip()
274
- logger.info("Successfully generated LLM summary using prior knowledge")
275
- bookmark['summary'] = summary
276
- except Exception as e:
277
- logger.error(f"Error generating summary using prior knowledge: {e}", exc_info=True)
278
- bookmark['summary'] = 'No summary available.'
279
  return bookmark
280
 
281
  def parse_bookmarks(file_content):
@@ -318,39 +273,20 @@ async def fetch_url_info(session, bookmark):
318
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
319
  bookmark['status_code'] = response.status
320
 
 
 
321
  if response.status >= 500:
322
  # Server error, consider as dead link
323
  bookmark['dead_link'] = True
324
  bookmark['description'] = ''
325
  bookmark['html_content'] = ''
326
  logger.warning(f"Dead link detected: {url} with status {response.status}")
327
- elif response.status == 403:
328
- # Forbidden, but may be accessible with proper headers
329
- logger.info(f"Received 403 for {url}, retrying with different headers")
330
- # Try with different headers or methods if necessary
331
- # For now, we'll proceed to read the content
332
- content = await response.text()
333
- bookmark['dead_link'] = False
334
- bookmark['html_content'] = content
335
- bookmark['description'] = ''
336
- elif response.status == 400:
337
- # Bad request, may be due to missing parameters
338
- bookmark['dead_link'] = False
339
- content = await response.text()
340
- bookmark['html_content'] = content
341
- bookmark['description'] = ''
342
- elif response.status >= 400:
343
- # Other client errors
344
- bookmark['dead_link'] = True
345
- bookmark['description'] = ''
346
- bookmark['html_content'] = ''
347
- logger.warning(f"Dead link detected: {url} with status {response.status}")
348
  else:
349
  bookmark['dead_link'] = False
350
- content = await response.text()
351
- bookmark['html_content'] = content # Store full HTML for summary generation
352
  bookmark['description'] = ''
353
  logger.info(f"Fetched information for {url}")
 
354
  except Exception as e:
355
  bookmark['dead_link'] = True
356
  bookmark['etag'] = 'N/A'
@@ -364,7 +300,7 @@ async def fetch_url_info(session, bookmark):
364
  'status_code': bookmark.get('status_code'),
365
  'dead_link': bookmark.get('dead_link'),
366
  'description': bookmark.get('description'),
367
- 'html_content': bookmark.get('html_content', '')
368
  }
369
  return bookmark
370
 
@@ -417,7 +353,7 @@ Respond with only the category name.
417
 
418
  try:
419
  response = openai.ChatCompletion.create(
420
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
421
  messages=[
422
  {"role": "system", "content": "You categorize webpages based on their content."},
423
  {"role": "user", "content": prompt}
@@ -695,7 +631,7 @@ Provide a concise and helpful response.
695
  """
696
 
697
  response = openai.ChatCompletion.create(
698
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
699
  messages=[
700
  {"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
701
  {"role": "user", "content": prompt}
 
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
155
  try:
156
+ html_content = bookmark.get('html_content', '')
157
+
158
+ # Check for insufficient or error content
159
+ error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
160
+ if not html_content or len(html_content) < 500 or any(keyword.lower() in html_content.lower() for keyword in error_keywords):
161
+ logger.info(f"Content for {bookmark.get('url')} is insufficient or contains errors. Using prior knowledge.")
162
+ use_prior_knowledge = True
163
+ else:
164
+ use_prior_knowledge = False
165
+
166
+ if use_prior_knowledge:
167
+ # Construct prompt to use prior knowledge
 
 
 
 
 
 
 
 
168
  prompt = f"""
169
  You are a knowledgeable assistant.
170
 
 
180
  Be factual and objective.
181
  """
182
  else:
183
+ # Get the HTML soup object from the bookmark
184
+ soup = BeautifulSoup(html_content, 'html.parser')
185
+
186
+ # Extract metadata and main content
187
+ metadata = get_page_metadata(soup)
188
+ main_content = extract_main_content(soup)
189
+
190
+ # Prepare content for the prompt
191
+ available_content = []
192
+ if metadata['title']:
193
+ available_content.append(f"Title: {metadata['title']}")
194
+ if metadata['description']:
195
+ available_content.append(f"Description: {metadata['description']}")
196
+ if metadata['keywords']:
197
+ available_content.append(f"Keywords: {metadata['keywords']}")
198
+ if main_content:
199
+ available_content.append(f"Main Content: {main_content}")
200
 
201
  # Construct the prompt
202
  prompt = f"""
 
214
 
215
  # Call the LLM via Groq Cloud API
216
  response = openai.ChatCompletion.create(
217
+ model='llama3-8b-8192',
218
  messages=[
219
  {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
220
  {"role": "user", "content": prompt}
221
  ],
222
+ max_tokens=200,
223
  temperature=0.5,
224
  )
225
 
 
230
 
231
  except Exception as e:
232
  logger.error(f"Error generating summary: {e}", exc_info=True)
233
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  return bookmark
235
 
236
  def parse_bookmarks(file_content):
 
273
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
274
  bookmark['status_code'] = response.status
275
 
276
+ content = await response.text()
277
+
278
  if response.status >= 500:
279
  # Server error, consider as dead link
280
  bookmark['dead_link'] = True
281
  bookmark['description'] = ''
282
  bookmark['html_content'] = ''
283
  logger.warning(f"Dead link detected: {url} with status {response.status}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  else:
285
  bookmark['dead_link'] = False
286
+ bookmark['html_content'] = content
 
287
  bookmark['description'] = ''
288
  logger.info(f"Fetched information for {url}")
289
+
290
  except Exception as e:
291
  bookmark['dead_link'] = True
292
  bookmark['etag'] = 'N/A'
 
300
  'status_code': bookmark.get('status_code'),
301
  'dead_link': bookmark.get('dead_link'),
302
  'description': bookmark.get('description'),
303
+ 'html_content': bookmark.get('html_content', ''),
304
  }
305
  return bookmark
306
 
 
353
 
354
  try:
355
  response = openai.ChatCompletion.create(
356
+ model='llama3-8b-8192',
357
  messages=[
358
  {"role": "system", "content": "You categorize webpages based on their content."},
359
  {"role": "user", "content": prompt}
 
631
  """
632
 
633
  response = openai.ChatCompletion.create(
634
+ model='llama3-8b-8192',
635
  messages=[
636
  {"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
637
  {"role": "user", "content": prompt}