siddhartharya committed
Commit ff33c96 · verified · 1 Parent(s): 44e9a1d

Update app.py

Files changed (1):
  1. app.py (+127 −144)
app.py CHANGED
@@ -11,8 +11,6 @@ import re
import logging
import os
import sys
- import concurrent.futures
- from concurrent.futures import ThreadPoolExecutor
import threading
from queue import Queue, Empty
import json
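Review note: `ThreadPoolExecutor` is still referenced further down in `process_uploaded_file` (the `executor.map(fetch_url_info, bookmarks)` context line in a later hunk), so this deletion is only safe if the executor is created from an import not shown in this diff; otherwise the upload path would raise a NameError.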
@@ -75,14 +73,14 @@ CATEGORIES = [
    "Uncategorized",
]

- # Set up Groq Cloud API key and base URL
- GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+ # Set up OpenAI API key and base URL
+ OPENAI_API_KEY = os.getenv('GROQ_API_KEY')  # Ensure this environment variable is set correctly

- if not GROQ_API_KEY:
+ if not OPENAI_API_KEY:
    logger.error("GROQ_API_KEY environment variable not set.")

- openai.api_key = GROQ_API_KEY
- openai.api_base = "https://api.groq.com/openai/v1"
+ openai.api_key = OPENAI_API_KEY
+ openai.api_base = "https://api.groq.com/openai/v1"  # Ensure this is the correct base URL for your API

# Initialize global variables for rate limiting
api_lock = threading.Lock()
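Review note: the app keeps the pre-1.0 `openai` SDK interface (`openai.ChatCompletion`) and simply points it at Groq's OpenAI-compatible endpoint. A minimal standalone sketch of that configuration, assuming `openai<1.0` is installed and `GROQ_API_KEY` is set; the smoke-test call is illustrative, not part of app.py:

    import os
    import openai  # assumes openai < 1.0, which still ships openai.ChatCompletion

    openai.api_key = os.getenv('GROQ_API_KEY')
    openai.api_base = "https://api.groq.com/openai/v1"  # Groq's OpenAI-compatible base URL

    # Illustrative smoke test (not in app.py):
    response = openai.ChatCompletion.create(
        model='llama-3.1-70b-versatile',
        messages=[{"role": "user", "content": "ping"}],
    )
    print(response.choices[0].message.content)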
@@ -128,6 +126,111 @@ tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
# Queue for LLM tasks
llm_queue = Queue()

+ def categorize_based_on_summary(summary, url):
+     """
+     Assign category based on keywords in the summary or URL.
+     """
+     summary_lower = summary.lower()
+     url_lower = url.lower()
+     if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+         return 'Social Media'
+     elif 'wikipedia' in url_lower:
+         return 'Reference and Knowledge Bases'
+     elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
+         return 'Technology'
+     elif 'news' in summary_lower or 'media' in summary_lower:
+         return 'News and Media'
+     elif 'education' in summary_lower or 'learning' in summary_lower:
+         return 'Education and Learning'
+     # Add more conditions as needed
+     else:
+         return 'Uncategorized'
+
+ def validate_category(bookmark):
+     """
+     Further validate and adjust the category if needed.
+     """
+     # Example: Specific cases based on URL
+     url_lower = bookmark['url'].lower()
+     if 'facebook' in url_lower or 'x.com' in url_lower:
+         return 'Social Media'
+     elif 'wikipedia' in url_lower:
+         return 'Reference and Knowledge Bases'
+     elif 'aws.amazon.com' in url_lower:
+         return 'Technology'
+     # Add more specific cases as needed
+     else:
+         return bookmark['category']
+
+ def extract_main_content(soup):
+     """
+     Extract the main content from a webpage while filtering out boilerplate content.
+     """
+     if not soup:
+         return ""
+
+     # Remove unwanted elements
+     for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
+         element.decompose()
+
+     # Extract text from <p> tags
+     p_tags = soup.find_all('p')
+     if p_tags:
+         content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
+     else:
+         # Fallback to body content
+         content = soup.get_text(separator=' ', strip=True)
+
+     # Clean up the text
+     content = re.sub(r'\s+', ' ', content)
+
+     # Truncate content to a reasonable length (e.g., 1500 words)
+     words = content.split()
+     if len(words) > 1500:
+         content = ' '.join(words[:1500])
+
+     return content
+
+ def get_page_metadata(soup):
+     """
+     Extract metadata from the webpage including title, description, and keywords.
+     """
+     metadata = {
+         'title': '',
+         'description': '',
+         'keywords': ''
+     }
+
+     if not soup:
+         return metadata
+
+     # Get title
+     title_tag = soup.find('title')
+     if title_tag and title_tag.string:
+         metadata['title'] = title_tag.string.strip()
+
+     # Get meta description
+     meta_desc = (
+         soup.find('meta', attrs={'name': 'description'}) or
+         soup.find('meta', attrs={'property': 'og:description'}) or
+         soup.find('meta', attrs={'name': 'twitter:description'})
+     )
+     if meta_desc:
+         metadata['description'] = meta_desc.get('content', '').strip()
+
+     # Get meta keywords
+     meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
+     if meta_keywords:
+         metadata['keywords'] = meta_keywords.get('content', '').strip()
+
+     # Get OG title if main title is empty
+     if not metadata['title']:
+         og_title = soup.find('meta', attrs={'property': 'og:title'})
+         if og_title:
+             metadata['title'] = og_title.get('content', '').strip()
+
+     return metadata
+
def llm_worker():
    """
    Worker thread to process LLM tasks from the queue while respecting rate limits.
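Review note: these helpers (moved above `llm_worker` by this commit) operate on a plain BeautifulSoup tree and can be exercised standalone. One caveat: `extract_main_content` mutates the tree in place via `element.decompose()`, so it is safest to read metadata first. A usage sketch, assuming `requests` and `beautifulsoup4` (both implied by the rest of app.py) and a placeholder URL:

    import requests
    from bs4 import BeautifulSoup

    resp = requests.get('https://example.com', timeout=10)
    soup = BeautifulSoup(resp.text, 'html.parser')

    metadata = get_page_metadata(soup)      # {'title': ..., 'description': ..., 'keywords': ...}
    content = extract_main_content(soup)    # boilerplate-stripped text, capped at 1500 words
    category = categorize_based_on_summary(metadata['description'], resp.url)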
@@ -214,7 +317,7 @@ Please provide your response in the following JSON format:
"""

        response = openai.ChatCompletion.create(
-             model='llama-3.1-70b-versatile',
+             model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
            messages=[
                {"role": "user", "content": prompt}
            ],
@@ -266,136 +369,12 @@ Please provide your response in the following JSON format:
        finally:
            llm_queue.task_done()

- def categorize_based_on_summary(summary, url):
-     """
-     Assign category based on keywords in the summary or URL.
-     """
-     summary_lower = summary.lower()
-     url_lower = url.lower()
-     if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
-         return 'Social Media'
-     elif 'wikipedia' in url_lower:
-         return 'Reference and Knowledge Bases'
-     elif 'cloud computing' in summary_lower or 'aws' in summary_lower:
-         return 'Technology'
-     elif 'news' in summary_lower or 'media' in summary_lower:
-         return 'News and Media'
-     elif 'education' in summary_lower or 'learning' in summary_lower:
-         return 'Education and Learning'
-     # Add more conditions as needed
-     else:
-         return 'Uncategorized'
-
- def validate_category(bookmark):
-     """
-     Further validate and adjust the category if needed.
-     """
-     # Example: Specific cases based on URL
-     url_lower = bookmark['url'].lower()
-     if 'facebook' in url_lower or 'x.com' in url_lower:
-         return 'Social Media'
-     elif 'wikipedia' in url_lower:
-         return 'Reference and Knowledge Bases'
-     elif 'aws.amazon.com' in url_lower:
-         return 'Technology'
-     # Add more specific cases as needed
-     else:
-         return bookmark['category']
-
- def extract_main_content(soup):
-     """
-     Extract the main content from a webpage while filtering out boilerplate content.
-     """
-     if not soup:
-         return ""
-
-     # Remove unwanted elements
-     for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
-         element.decompose()
-
-     # Extract text from <p> tags
-     p_tags = soup.find_all('p')
-     if p_tags:
-         content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
-     else:
-         # Fallback to body content
-         content = soup.get_text(separator=' ', strip=True)
-
-     # Clean up the text
-     content = re.sub(r'\s+', ' ', content)
-
-     # Truncate content to a reasonable length (e.g., 1500 words)
-     words = content.split()
-     if len(words) > 1500:
-         content = ' '.join(words[:1500])
-
-     return content
-
- def get_page_metadata(soup):
-     """
-     Extract metadata from the webpage including title, description, and keywords.
-     """
-     metadata = {
-         'title': '',
-         'description': '',
-         'keywords': ''
-     }
-
-     if not soup:
-         return metadata
-
-     # Get title
-     title_tag = soup.find('title')
-     if title_tag and title_tag.string:
-         metadata['title'] = title_tag.string.strip()
-
-     # Get meta description
-     meta_desc = (
-         soup.find('meta', attrs={'name': 'description'}) or
-         soup.find('meta', attrs={'property': 'og:description'}) or
-         soup.find('meta', attrs={'name': 'twitter:description'})
-     )
-     if meta_desc:
-         metadata['description'] = meta_desc.get('content', '').strip()
-
-     # Get meta keywords
-     meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
-     if meta_keywords:
-         metadata['keywords'] = meta_keywords.get('content', '').strip()
-
-     # Get OG title if main title is empty
-     if not metadata['title']:
-         og_title = soup.find('meta', attrs={'property': 'og:title'})
-         if og_title:
-             metadata['title'] = og_title.get('content', '').strip()
-
-     return metadata
-
def generate_summary_and_assign_category(bookmark):
    """
-     Generate a concise summary and assign a category using a single LLM call.
-     This function decides whether to use metadata or enqueue an LLM call.
-     """
-     # Check if metadata can provide a summary
-     description = bookmark.get('description', '').strip()
-     title = bookmark.get('title', '').strip()
-
-     if description:
-         # Use description as summary
-         bookmark['summary'] = description
-         # Assign category based on description or title
-         bookmark['category'] = categorize_based_on_summary(description, bookmark['url'])
-         logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
-     elif title:
-         # Use title as summary
-         bookmark['summary'] = title
-         # Assign category based on title
-         bookmark['category'] = categorize_based_on_summary(title, bookmark['url'])
-         logger.info(f"Summary derived from title for {bookmark.get('url')}")
-     else:
-         # Enqueue for LLM processing
-         logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
-         llm_queue.put(bookmark)
+     Enqueue bookmarks for LLM processing.
+     """
+     logger.info(f"Enqueuing bookmark for LLM processing: {bookmark.get('url')}")
+     llm_queue.put(bookmark)

def parse_bookmarks(file_content):
    """
@@ -444,15 +423,17 @@ def fetch_url_info(bookmark):

        if response.status_code >= 500:
            bookmark['dead_link'] = True
-             bookmark['description'] = ''
            bookmark['html_content'] = ''
+             bookmark['description'] = ''
            logger.warning(f"Dead link detected: {url} with status {response.status_code}")
        else:
            bookmark['dead_link'] = False
            bookmark['html_content'] = content
-             bookmark['description'] = ''
+             # Extract description from metadata
+             soup = BeautifulSoup(content, 'html.parser')
+             metadata = get_page_metadata(soup)
+             bookmark['description'] = metadata.get('description', '')
            logger.info(f"Fetched information for {url}")
-
    except requests.exceptions.Timeout:
        bookmark['dead_link'] = False
        bookmark['etag'] = 'N/A'
@@ -511,19 +492,21 @@ def display_bookmarks():
            status = "❌ Dead Link"
            card_style = "border: 2px solid red;"
            text_style = "color: white;"
+             summary = 'No summary available.'
        elif bookmark.get('slow_link'):
            status = "⏳ Slow Response"
            card_style = "border: 2px solid orange;"
            text_style = "color: white;"
+             summary = bookmark.get('summary', 'No summary available.')
        else:
            status = "✅ Active"
            card_style = "border: 2px solid green;"
            text_style = "color: white;"
+             summary = bookmark.get('summary', 'No summary available.')

        title = bookmark['title']
        url = bookmark['url']
        etag = bookmark.get('etag', 'N/A')
-         summary = bookmark.get('summary', '')
        category = bookmark.get('category', 'Uncategorized')

        # Escape HTML content to prevent XSS attacks
@@ -585,7 +568,7 @@ def process_uploaded_file(file, state_bookmarks):
        executor.map(fetch_url_info, bookmarks)

    # Process bookmarks for summary and category
-     logger.info("Processing bookmarks for summaries and categories")
+     logger.info("Enqueuing bookmarks for LLM processing")
    for bookmark in bookmarks:
        generate_summary_and_assign_category(bookmark)

@@ -750,7 +733,7 @@ Provide a concise and helpful response.
"""

        response = openai.ChatCompletion.create(
-             model='llama-3.1-70b-versatile',
+             model='llama-3.1-70b-versatile',  # Ensure this model is correct and available
            messages=[
                {"role": "user", "content": prompt}
            ],
@@ -765,8 +748,8 @@ Provide a concise and helpful response.
        return chat_history

    except openai.error.RateLimitError as e:
-         wait_time = int(e.headers.get("Retry-After", 60))
-         logger.warning(f"Chatbot Rate limit reached. Waiting for {wait_time} seconds before retrying...")
+         wait_time = int(e.headers.get("Retry-After", 5))
+         logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
        time.sleep(wait_time)
        return chatbot_response(user_query, chat_history)
    except Exception as e:
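Review note: lowering the `Retry-After` fallback from 60 s to 5 s makes retries snappier, but the recursive retry above has no depth limit: if the header is absent and the rate limit persists, `chatbot_response` keeps calling itself. A bounded-retry sketch against the same pre-1.0 `openai` error API the diff uses (retry count and fallback value are illustrative):

    import time
    import openai

    def create_with_retry(max_retries=3, **create_kwargs):
        for _ in range(max_retries):
            try:
                return openai.ChatCompletion.create(**create_kwargs)
            except openai.error.RateLimitError as e:
                wait_time = int(e.headers.get("Retry-After", 5))  # fall back to 5 s, as in the diff
                time.sleep(wait_time)
        raise RuntimeError(f"Still rate-limited after {max_retries} retries")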