siddhartharya committed on
Commit
44e9a1d
·
verified ·
1 Parent(s): 2e66ec2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -106
app.py CHANGED
@@ -15,6 +15,7 @@ import concurrent.futures
15
  from concurrent.futures import ThreadPoolExecutor
16
  import threading
17
  from queue import Queue, Empty
 
18
 
19
  # Import OpenAI library
20
  import openai
@@ -145,7 +146,9 @@ def llm_worker():
145
  try:
146
  # Rate Limiting
147
  rpm_bucket.wait_for_token()
148
- tpm_bucket.wait_for_token(tokens=150) # Assuming max_tokens=150 per request
 
 
149
 
150
  html_content = bookmark.get('html_content', '')
151
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -186,8 +189,11 @@ Provide:
186
  Categories:
187
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
188
  Format:
189
- Summary: [Your summary]
190
- Category: [One category]
 
 
 
191
  """
192
  else:
193
  prompt = f"""
@@ -200,24 +206,19 @@ Provide:
200
  Categories:
201
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
202
  Format:
203
- Summary: [Your summary]
204
- Category: [One category]
 
 
 
205
  """
206
 
207
- def estimate_tokens(text):
208
- return len(text) / 4 # Approximation
209
-
210
- prompt_tokens = estimate_tokens(prompt)
211
- max_tokens = 150
212
- total_tokens = prompt_tokens + max_tokens
213
-
214
- # Prepare the prompt with token estimation
215
  response = openai.ChatCompletion.create(
216
  model='llama-3.1-70b-versatile',
217
  messages=[
218
  {"role": "user", "content": prompt}
219
  ],
220
- max_tokens=int(max_tokens),
221
  temperature=0.5,
222
  )
223
 
@@ -225,41 +226,31 @@ Category: [One category]
225
  if not content:
226
  raise ValueError("Empty response received from the model.")
227
 
228
- summary_match = re.search(r"Summary:\s*(.*)", content, re.IGNORECASE)
229
- category_match = re.search(r"Category:\s*(.*)", content, re.IGNORECASE)
 
 
 
230
 
231
- # Extract summary
232
- if summary_match:
233
- summary = summary_match.group(1).strip()
234
- if summary:
235
- bookmark['summary'] = summary
236
- else:
237
- # For dead links, only set summary if it's a slow link
238
- if bookmark.get('slow_link', False):
239
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
240
- else:
241
- # For dead links without summary, do not set 'summary'
242
- bookmark['summary'] = ''
243
- else:
244
- if bookmark.get('slow_link', False):
245
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
246
  else:
247
- bookmark['summary'] = ''
 
248
 
249
- # Extract category
250
- if category_match:
251
- category = category_match.group(1).strip().strip('"')
252
- bookmark['category'] = category if category in CATEGORIES else 'Uncategorized'
253
- else:
254
- bookmark['category'] = 'Uncategorized'
255
 
256
- # Simple keyword-based validation
257
- summary_lower = bookmark.get('summary', '').lower()
258
- url_lower = bookmark['url'].lower()
259
- if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
260
- bookmark['category'] = 'Social Media'
261
- elif 'wikipedia' in url_lower:
262
- bookmark['category'] = 'Reference and Knowledge Bases'
263
 
264
  logger.info("Successfully generated summary and assigned category")
265
  except openai.error.RateLimitError as e:
@@ -269,21 +260,47 @@ Category: [One category]
269
  time.sleep(60) # Wait before retrying
270
  except Exception as e:
271
  logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
272
- # For slow links, provide a summary from metadata or title
273
- if bookmark.get('slow_link', False):
274
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
275
- # For dead links, attempt to set summary; if not possible, leave it unset
276
- elif bookmark.get('dead_link', False):
277
- bookmark['summary'] = metadata.get('description') or metadata.get('title') or ''
278
- else:
279
- bookmark['summary'] = 'No summary available.'
280
  bookmark['category'] = 'Uncategorized'
281
  finally:
282
  llm_queue.task_done()
283
 
284
- # Start the LLM worker thread
285
- llm_thread = threading.Thread(target=llm_worker, daemon=True)
286
- llm_thread.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
  def extract_main_content(soup):
289
  """
@@ -356,7 +373,7 @@ def get_page_metadata(soup):
356
 
357
  def generate_summary_and_assign_category(bookmark):
358
  """
359
- Generate a concise summary and assign a category.
360
  This function decides whether to use metadata or enqueue an LLM call.
361
  """
362
  # Check if metadata can provide a summary
@@ -367,32 +384,19 @@ def generate_summary_and_assign_category(bookmark):
367
  # Use description as summary
368
  bookmark['summary'] = description
369
  # Assign category based on description or title
370
- assign_category_based_on_summary(bookmark)
371
  logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
372
  elif title:
373
  # Use title as summary
374
  bookmark['summary'] = title
375
  # Assign category based on title
376
- assign_category_based_on_summary(bookmark)
377
  logger.info(f"Summary derived from title for {bookmark.get('url')}")
378
  else:
379
  # Enqueue for LLM processing
380
  logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
381
  llm_queue.put(bookmark)
382
 
383
- def assign_category_based_on_summary(bookmark):
384
- """
385
- Assign category based on simple keyword matching in the summary.
386
- """
387
- summary_lower = bookmark.get('summary', '').lower()
388
- url_lower = bookmark['url'].lower()
389
- if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
390
- bookmark['category'] = 'Social Media'
391
- elif 'wikipedia' in url_lower:
392
- bookmark['category'] = 'Reference and Knowledge Bases'
393
- else:
394
- bookmark['category'] = 'Uncategorized'
395
-
396
  def parse_bookmarks(file_content):
397
  """
398
  Parse bookmarks from HTML file.
@@ -440,17 +444,20 @@ def fetch_url_info(bookmark):
440
 
441
  if response.status_code >= 500:
442
  bookmark['dead_link'] = True
443
- bookmark['html_content'] = content # Keep content to extract metadata if possible
 
444
  logger.warning(f"Dead link detected: {url} with status {response.status_code}")
445
  else:
446
  bookmark['dead_link'] = False
447
  bookmark['html_content'] = content
 
448
  logger.info(f"Fetched information for {url}")
449
 
450
  except requests.exceptions.Timeout:
451
  bookmark['dead_link'] = False
452
  bookmark['etag'] = 'N/A'
453
  bookmark['status_code'] = 'Timeout'
 
454
  bookmark['html_content'] = ''
455
  bookmark['slow_link'] = True
456
  logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
@@ -458,22 +465,10 @@ def fetch_url_info(bookmark):
458
  bookmark['dead_link'] = True
459
  bookmark['etag'] = 'N/A'
460
  bookmark['status_code'] = 'Error'
 
461
  bookmark['html_content'] = ''
462
  logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
463
  finally:
464
- # Extract meta description for dead links if content is available
465
- if bookmark.get('dead_link', False) and bookmark.get('html_content'):
466
- soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
467
- metadata = get_page_metadata(soup)
468
- bookmark['description'] = metadata.get('description', '')
469
- elif not bookmark.get('dead_link', False):
470
- # For active and slow links, attempt to extract description
471
- soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
472
- metadata = get_page_metadata(soup)
473
- bookmark['description'] = metadata.get('description', '')
474
- else:
475
- bookmark['description'] = ''
476
-
477
  with lock:
478
  fetch_cache[url] = {
479
  'etag': bookmark.get('etag'),
@@ -491,8 +486,7 @@ def vectorize_and_index(bookmarks_list):
491
  global faiss_index
492
  logger.info("Vectorizing summaries and building FAISS index")
493
  try:
494
- # Safely access 'summary' using .get() to avoid KeyError
495
- summaries = [bookmark.get('summary', '') for bookmark in bookmarks_list]
496
  embeddings = embedding_model.encode(summaries)
497
  dimension = embeddings.shape[1]
498
  index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
@@ -517,26 +511,19 @@ def display_bookmarks():
517
  status = "❌ Dead Link"
518
  card_style = "border: 2px solid red;"
519
  text_style = "color: white;"
520
- # For dead links, use 'summary' if available
521
- summary = bookmark.get('summary', '')
522
- if not summary:
523
- # Provide a default message or leave it empty
524
- summary = 'No summary available.'
525
  elif bookmark.get('slow_link'):
526
  status = "⏳ Slow Response"
527
  card_style = "border: 2px solid orange;"
528
  text_style = "color: white;"
529
- # For slow links, always provide a summary
530
- summary = bookmark.get('summary', 'No summary available.')
531
  else:
532
  status = "✅ Active"
533
  card_style = "border: 2px solid green;"
534
  text_style = "color: white;"
535
- summary = bookmark.get('summary', 'No summary available.')
536
 
537
  title = bookmark['title']
538
  url = bookmark['url']
539
  etag = bookmark.get('etag', 'N/A')
 
540
  category = bookmark.get('category', 'Uncategorized')
541
 
542
  # Escape HTML content to prevent XSS attacks
@@ -762,19 +749,12 @@ Bookmarks:
762
  Provide a concise and helpful response.
763
  """
764
 
765
- def estimate_tokens(text):
766
- return len(text) / 4 # Approximation
767
-
768
- prompt_tokens = estimate_tokens(prompt)
769
- max_tokens = 300
770
- total_tokens = prompt_tokens + max_tokens
771
-
772
  response = openai.ChatCompletion.create(
773
  model='llama-3.1-70b-versatile',
774
  messages=[
775
  {"role": "user", "content": prompt}
776
  ],
777
- max_tokens=int(max_tokens),
778
  temperature=0.7,
779
  )
780
 
@@ -971,8 +951,12 @@ Navigate through the tabs to explore each feature in detail.
971
  logger.info("Launching Gradio app")
972
  demo.launch(debug=True)
973
  except Exception as e:
974
- logger.error(f"Error building the app: {e}", exc_info=True)
975
- print(f"Error building the app: {e}")
976
 
977
  if __name__ == "__main__":
 
 
 
 
978
  build_app()
 
15
  from concurrent.futures import ThreadPoolExecutor
16
  import threading
17
  from queue import Queue, Empty
18
+ import json
19
 
20
  # Import OpenAI library
21
  import openai
 
146
  try:
147
  # Rate Limiting
148
  rpm_bucket.wait_for_token()
149
+ # Estimate tokens: prompt + max_tokens
150
+ # Here, we assume max_tokens=150
151
+ tpm_bucket.wait_for_token(tokens=150)
152
 
153
  html_content = bookmark.get('html_content', '')
154
  soup = BeautifulSoup(html_content, 'html.parser')
 
189
  Categories:
190
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
191
  Format:
192
+ Please provide your response in the following JSON format:
193
+ {{
194
+ "summary": "Your summary here.",
195
+ "category": "One category from the list."
196
+ }}
197
  """
198
  else:
199
  prompt = f"""
 
206
  Categories:
207
  {', '.join([f'"{cat}"' for cat in CATEGORIES])}
208
  Format:
209
+ Please provide your response in the following JSON format:
210
+ {{
211
+ "summary": "Your summary here.",
212
+ "category": "One category from the list."
213
+ }}
214
  """
215
 
 
 
 
 
 
 
 
 
216
  response = openai.ChatCompletion.create(
217
  model='llama-3.1-70b-versatile',
218
  messages=[
219
  {"role": "user", "content": prompt}
220
  ],
221
+ max_tokens=150,
222
  temperature=0.5,
223
  )
224
 
 
226
  if not content:
227
  raise ValueError("Empty response received from the model.")
228
 
229
+ # Parse JSON response
230
+ try:
231
+ json_response = json.loads(content)
232
+ summary = json_response.get('summary', '').strip()
233
+ category = json_response.get('category', '').strip()
234
 
235
+ # Validate and assign
236
+ if not summary:
237
+ summary = metadata.get('description') or metadata.get('title') or 'No summary available.'
238
+ bookmark['summary'] = summary
239
+
240
+ if category in CATEGORIES:
241
+ bookmark['category'] = category
 
 
 
 
 
 
 
 
242
  else:
243
+ # Fallback to keyword-based categorization
244
+ bookmark['category'] = categorize_based_on_summary(summary, bookmark['url'])
245
 
246
+ except json.JSONDecodeError:
247
+ logger.error(f"Failed to parse JSON response for {bookmark.get('url')}. Using fallback methods.")
248
+ # Fallback methods
249
+ bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
250
+ bookmark['category'] = categorize_based_on_summary(bookmark['summary'], bookmark['url'])
 
251
 
252
+ # Additional keyword-based validation
253
+ bookmark['category'] = validate_category(bookmark)
 
 
 
 
 
254
 
255
  logger.info("Successfully generated summary and assigned category")
256
  except openai.error.RateLimitError as e:
 
260
  time.sleep(60) # Wait before retrying
261
  except Exception as e:
262
  logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
263
+ # Assign default values on failure
264
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
265
  bookmark['category'] = 'Uncategorized'
266
  finally:
267
  llm_queue.task_done()
268
 
269
def categorize_based_on_summary(summary, url):
    """Pick a bookmark category from keyword matches in the summary or URL.

    Rules are evaluated in priority order and the first match wins; when
    nothing matches, 'Uncategorized' is returned.
    """
    summary_lc = summary.lower()
    url_lc = url.lower()
    # (category, keywords searched in summary, keywords searched in URL),
    # checked top to bottom — extend this table to add more rules.
    rules = (
        ('Social Media', ('social media', 'twitter'), ('x.com',)),
        ('Reference and Knowledge Bases', (), ('wikipedia',)),
        ('Technology', ('cloud computing', 'aws'), ()),
        ('News and Media', ('news', 'media'), ()),
        ('Education and Learning', ('education', 'learning'), ()),
    )
    for category, summary_keys, url_keys in rules:
        if any(k in summary_lc for k in summary_keys) or any(k in url_lc for k in url_keys):
            return category
    return 'Uncategorized'
288
+
289
def validate_category(bookmark):
    """Override the assigned category for well-known URLs.

    The bookmark URL is checked against a small table of host substrings;
    the first hit wins. When no override applies, the bookmark's existing
    category is kept unchanged.
    """
    url_lc = bookmark['url'].lower()
    # (URL substrings, forced category) — extend with more specific cases as needed.
    overrides = (
        (('facebook', 'x.com'), 'Social Media'),
        (('wikipedia',), 'Reference and Knowledge Bases'),
        (('aws.amazon.com',), 'Technology'),
    )
    for needles, category in overrides:
        if any(needle in url_lc for needle in needles):
            return category
    return bookmark['category']
304
 
305
  def extract_main_content(soup):
306
  """
 
373
 
374
  def generate_summary_and_assign_category(bookmark):
375
  """
376
+ Generate a concise summary and assign a category using a single LLM call.
377
  This function decides whether to use metadata or enqueue an LLM call.
378
  """
379
  # Check if metadata can provide a summary
 
384
  # Use description as summary
385
  bookmark['summary'] = description
386
  # Assign category based on description or title
387
+ bookmark['category'] = categorize_based_on_summary(description, bookmark['url'])
388
  logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
389
  elif title:
390
  # Use title as summary
391
  bookmark['summary'] = title
392
  # Assign category based on title
393
+ bookmark['category'] = categorize_based_on_summary(title, bookmark['url'])
394
  logger.info(f"Summary derived from title for {bookmark.get('url')}")
395
  else:
396
  # Enqueue for LLM processing
397
  logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
398
  llm_queue.put(bookmark)
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  def parse_bookmarks(file_content):
401
  """
402
  Parse bookmarks from HTML file.
 
444
 
445
  if response.status_code >= 500:
446
  bookmark['dead_link'] = True
447
+ bookmark['description'] = ''
448
+ bookmark['html_content'] = ''
449
  logger.warning(f"Dead link detected: {url} with status {response.status_code}")
450
  else:
451
  bookmark['dead_link'] = False
452
  bookmark['html_content'] = content
453
+ bookmark['description'] = ''
454
  logger.info(f"Fetched information for {url}")
455
 
456
  except requests.exceptions.Timeout:
457
  bookmark['dead_link'] = False
458
  bookmark['etag'] = 'N/A'
459
  bookmark['status_code'] = 'Timeout'
460
+ bookmark['description'] = ''
461
  bookmark['html_content'] = ''
462
  bookmark['slow_link'] = True
463
  logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
 
465
  bookmark['dead_link'] = True
466
  bookmark['etag'] = 'N/A'
467
  bookmark['status_code'] = 'Error'
468
+ bookmark['description'] = ''
469
  bookmark['html_content'] = ''
470
  logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
471
  finally:
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  with lock:
473
  fetch_cache[url] = {
474
  'etag': bookmark.get('etag'),
 
486
  global faiss_index
487
  logger.info("Vectorizing summaries and building FAISS index")
488
  try:
489
+ summaries = [bookmark['summary'] for bookmark in bookmarks_list]
 
490
  embeddings = embedding_model.encode(summaries)
491
  dimension = embeddings.shape[1]
492
  index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
 
511
  status = "❌ Dead Link"
512
  card_style = "border: 2px solid red;"
513
  text_style = "color: white;"
 
 
 
 
 
514
  elif bookmark.get('slow_link'):
515
  status = "⏳ Slow Response"
516
  card_style = "border: 2px solid orange;"
517
  text_style = "color: white;"
 
 
518
  else:
519
  status = "✅ Active"
520
  card_style = "border: 2px solid green;"
521
  text_style = "color: white;"
 
522
 
523
  title = bookmark['title']
524
  url = bookmark['url']
525
  etag = bookmark.get('etag', 'N/A')
526
+ summary = bookmark.get('summary', '')
527
  category = bookmark.get('category', 'Uncategorized')
528
 
529
  # Escape HTML content to prevent XSS attacks
 
749
  Provide a concise and helpful response.
750
  """
751
 
 
 
 
 
 
 
 
752
  response = openai.ChatCompletion.create(
753
  model='llama-3.1-70b-versatile',
754
  messages=[
755
  {"role": "user", "content": prompt}
756
  ],
757
+ max_tokens=300,
758
  temperature=0.7,
759
  )
760
 
 
951
  logger.info("Launching Gradio app")
952
  demo.launch(debug=True)
953
  except Exception as e:
954
+ logger.error(f"Error building Gradio app: {e}", exc_info=True)
955
+ print(f"Error building Gradio app: {e}")
956
 
957
  if __name__ == "__main__":
958
+ # Start the LLM worker thread before launching the app
959
+ llm_thread = threading.Thread(target=llm_worker, daemon=True)
960
+ llm_thread.start()
961
+
962
  build_app()