Update app.py
app.py CHANGED
@@ -11,8 +11,6 @@ import re
 import logging
 import os
 import sys
-import concurrent.futures
-from concurrent.futures import ThreadPoolExecutor
 import threading
 from queue import Queue, Empty
 import json
@@ -75,14 +73,14 @@ CATEGORIES = [
     "Uncategorized",
 ]
 
-# Set up
-
+# Set up OpenAI API key and base URL
+OPENAI_API_KEY = os.getenv('GROQ_API_KEY')  # Ensure this environment variable is set correctly
 
-if not
+if not OPENAI_API_KEY:
     logger.error("GROQ_API_KEY environment variable not set.")
 
-openai.api_key =
-openai.api_base = "https://api.groq.com/openai/v1"
+openai.api_key = OPENAI_API_KEY
+openai.api_base = "https://api.groq.com/openai/v1"  # Ensure this is the correct base URL for your API
 
 # Initialize global variables for rate limiting
 api_lock = threading.Lock()
@@ -128,6 +126,111 @@ tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
 # Queue for LLM tasks
 llm_queue = Queue()
 
+def categorize_based_on_summary(summary, url):
+    """
+    Assign category based on keywords in the summary or URL.
+    """
+    summary_lower = summary.lower()
+    url_lower = url.lower()
+    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+        return 'Technology'
+    elif 'news' in summary_lower or 'media' in summary_lower:
+        return 'News and Media'
+    elif 'education' in summary_lower or 'learning' in summary_lower:
+        return 'Education and Learning'
+    # Add more conditions as needed
+    else:
+        return 'Uncategorized'
+
+def validate_category(bookmark):
+    """
+    Further validate and adjust the category if needed.
+    """
+    # Example: Specific cases based on URL
+    url_lower = bookmark['url'].lower()
+    if 'facebook' in url_lower or 'x.com' in url_lower:
+        return 'Social Media'
+    elif 'wikipedia' in url_lower:
+        return 'Reference and Knowledge Bases'
+    elif 'aws.amazon.com' in url_lower:
+        return 'Technology'
+    # Add more specific cases as needed
+    else:
+        return bookmark['category']
+
+def extract_main_content(soup):
+    """
+    Extract the main content from a webpage while filtering out boilerplate content.
+    """
+    if not soup:
+        return ""
+
+    # Remove unwanted elements
+    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
+        element.decompose()
+
+    # Extract text from <p> tags
+    p_tags = soup.find_all('p')
+    if p_tags:
+        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
+    else:
+        # Fallback to body content
+        content = soup.get_text(separator=' ', strip=True)
+
+    # Clean up the text
+    content = re.sub(r'\s+', ' ', content)
+
+    # Truncate content to a reasonable length (e.g., 1500 words)
+    words = content.split()
+    if len(words) > 1500:
+        content = ' '.join(words[:1500])
+
+    return content
+
+def get_page_metadata(soup):
+    """
+    Extract metadata from the webpage including title, description, and keywords.
+    """
+    metadata = {
+        'title': '',
+        'description': '',
+        'keywords': ''
+    }
+
+    if not soup:
+        return metadata
+
+    # Get title
+    title_tag = soup.find('title')
+    if title_tag and title_tag.string:
+        metadata['title'] = title_tag.string.strip()
+
+    # Get meta description
+    meta_desc = (
+        soup.find('meta', attrs={'name': 'description'}) or
+        soup.find('meta', attrs={'property': 'og:description'}) or
+        soup.find('meta', attrs={'name': 'twitter:description'})
+    )
+    if meta_desc:
+        metadata['description'] = meta_desc.get('content', '').strip()
+
+    # Get meta keywords
+    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
+    if meta_keywords:
+        metadata['keywords'] = meta_keywords.get('content', '').strip()
+
+    # Get OG title if main title is empty
+    if not metadata['title']:
+        og_title = soup.find('meta', attrs={'property': 'og:title'})
+        if og_title:
+            metadata['title'] = og_title.get('content', '').strip()
+
+    return metadata
+
 def llm_worker():
     """
     Worker thread to process LLM tasks from the queue while respecting rate limits.
@@ -214,7 +317,7 @@ Please provide your response in the following JSON format:
 """
 
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
@@ -266,136 +369,12 @@ Please provide your response in the following JSON format:
     finally:
         llm_queue.task_done()
 
-def categorize_based_on_summary(summary, url):
-    """
-    Assign category based on keywords in the summary or URL.
-    """
-    summary_lower = summary.lower()
-    url_lower = url.lower()
-    if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
-        return 'Technology'
-    elif 'news' in summary_lower or 'media' in summary_lower:
-        return 'News and Media'
-    elif 'education' in summary_lower or 'learning' in summary_lower:
-        return 'Education and Learning'
-    # Add more conditions as needed
-    else:
-        return 'Uncategorized'
-
-def validate_category(bookmark):
-    """
-    Further validate and adjust the category if needed.
-    """
-    # Example: Specific cases based on URL
-    url_lower = bookmark['url'].lower()
-    if 'facebook' in url_lower or 'x.com' in url_lower:
-        return 'Social Media'
-    elif 'wikipedia' in url_lower:
-        return 'Reference and Knowledge Bases'
-    elif 'aws.amazon.com' in url_lower:
-        return 'Technology'
-    # Add more specific cases as needed
-    else:
-        return bookmark['category']
-
-def extract_main_content(soup):
-    """
-    Extract the main content from a webpage while filtering out boilerplate content.
-    """
-    if not soup:
-        return ""
-
-    # Remove unwanted elements
-    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
-        element.decompose()
-
-    # Extract text from <p> tags
-    p_tags = soup.find_all('p')
-    if p_tags:
-        content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
-    else:
-        # Fallback to body content
-        content = soup.get_text(separator=' ', strip=True)
-
-    # Clean up the text
-    content = re.sub(r'\s+', ' ', content)
-
-    # Truncate content to a reasonable length (e.g., 1500 words)
-    words = content.split()
-    if len(words) > 1500:
-        content = ' '.join(words[:1500])
-
-    return content
-
-def get_page_metadata(soup):
-    """
-    Extract metadata from the webpage including title, description, and keywords.
-    """
-    metadata = {
-        'title': '',
-        'description': '',
-        'keywords': ''
-    }
-
-    if not soup:
-        return metadata
-
-    # Get title
-    title_tag = soup.find('title')
-    if title_tag and title_tag.string:
-        metadata['title'] = title_tag.string.strip()
-
-    # Get meta description
-    meta_desc = (
-        soup.find('meta', attrs={'name': 'description'}) or
-        soup.find('meta', attrs={'property': 'og:description'}) or
-        soup.find('meta', attrs={'name': 'twitter:description'})
-    )
-    if meta_desc:
-        metadata['description'] = meta_desc.get('content', '').strip()
-
-    # Get meta keywords
-    meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
-    if meta_keywords:
-        metadata['keywords'] = meta_keywords.get('content', '').strip()
-
-    # Get OG title if main title is empty
-    if not metadata['title']:
-        og_title = soup.find('meta', attrs={'property': 'og:title'})
-        if og_title:
-            metadata['title'] = og_title.get('content', '').strip()
-
-    return metadata
-
 def generate_summary_and_assign_category(bookmark):
     """
-
-
-    """
-
-    description = bookmark.get('description', '').strip()
-    title = bookmark.get('title', '').strip()
-
-    if description:
-        # Use description as summary
-        bookmark['summary'] = description
-        # Assign category based on description or title
-        bookmark['category'] = categorize_based_on_summary(description, bookmark['url'])
-        logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
-    elif title:
-        # Use title as summary
-        bookmark['summary'] = title
-        # Assign category based on title
-        bookmark['category'] = categorize_based_on_summary(title, bookmark['url'])
-        logger.info(f"Summary derived from title for {bookmark.get('url')}")
-    else:
-        # Enqueue for LLM processing
-        logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
-        llm_queue.put(bookmark)
+    Enqueue bookmarks for LLM processing.
+    """
+    logger.info(f"Enqueuing bookmark for LLM processing: {bookmark.get('url')}")
+    llm_queue.put(bookmark)
 
 def parse_bookmarks(file_content):
     """
@@ -444,15 +423,17 @@ def fetch_url_info(bookmark):
 
         if response.status_code >= 500:
             bookmark['dead_link'] = True
-            bookmark['description'] = ''
             bookmark['html_content'] = ''
+            bookmark['description'] = ''
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
-
+            # Extract description from metadata
+            soup = BeautifulSoup(content, 'html.parser')
+            metadata = get_page_metadata(soup)
+            bookmark['description'] = metadata.get('description', '')
             logger.info(f"Fetched information for {url}")
-
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
@@ -511,19 +492,21 @@ def display_bookmarks():
             status = "❌ Dead Link"
             card_style = "border: 2px solid red;"
             text_style = "color: white;"
+            summary = 'No summary available.'
         elif bookmark.get('slow_link'):
             status = "⏳ Slow Response"
             card_style = "border: 2px solid orange;"
             text_style = "color: white;"
+            summary = bookmark.get('summary', 'No summary available.')
         else:
             status = "✅ Active"
             card_style = "border: 2px solid green;"
             text_style = "color: white;"
+            summary = bookmark.get('summary', 'No summary available.')
 
         title = bookmark['title']
         url = bookmark['url']
         etag = bookmark.get('etag', 'N/A')
-        summary = bookmark.get('summary', '')
         category = bookmark.get('category', 'Uncategorized')
 
         # Escape HTML content to prevent XSS attacks
@@ -585,7 +568,7 @@ def process_uploaded_file(file, state_bookmarks):
         executor.map(fetch_url_info, bookmarks)
 
     # Process bookmarks for summary and category
-    logger.info("
+    logger.info("Enqueuing bookmarks for LLM processing")
     for bookmark in bookmarks:
         generate_summary_and_assign_category(bookmark)
 
@@ -750,7 +733,7 @@ Provide a concise and helpful response.
 """
 
         response = openai.ChatCompletion.create(
-            model='llama-3.1-70b-versatile',
+            model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
             messages=[
                 {"role": "user", "content": prompt}
             ],
@@ -765,8 +748,8 @@ Provide a concise and helpful response.
         return chat_history
 
     except openai.error.RateLimitError as e:
-        wait_time = int(e.headers.get("Retry-After",
-        logger.warning(f"
+        wait_time = int(e.headers.get("Retry-After", 5))
+        logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
         time.sleep(wait_time)
         return chatbot_response(user_query, chat_history)
     except Exception as e: