siddhartharya committed
Commit 97165e2 · verified · 1 Parent(s): dd78c27

Update app.py

Files changed (1)
  1. app.py +325 -248
app.py CHANGED
@@ -45,19 +45,8 @@ faiss_index = None
 bookmarks = []
 fetch_cache = {}
 
-# Groq API Rate Limits
-GROQ_RPM = 30  # requests per minute
-GROQ_TPM = 40000  # tokens per minute
-SECONDS_PER_MINUTE = 60
-MIN_TIME_BETWEEN_CALLS = SECONDS_PER_MINUTE / GROQ_RPM  # 2 seconds between calls
-MAX_CONCURRENT_CALLS = 3  # Keep concurrent calls limited to prevent rate limits
-TOKEN_BUFFER = 0.9  # Use 90% of token limit to be safe
-
-# Rate limiting tools
-api_lock = threading.Lock()
-request_times = []  # Track request timestamps
-token_usage = []  # Track token usage
-LLM_SEMAPHORE = threading.Semaphore(MAX_CONCURRENT_CALLS)
 
 # Define the categories
 CATEGORIES = [
@@ -94,34 +83,10 @@ if not GROQ_API_KEY:
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"
 
-def manage_rate_limits():
-    """
-    Manage both request and token rate limits.
-    Returns the time to wait (if any) before making the next request.
-    """
-    current_time = time.time()
-    minute_ago = current_time - SECONDS_PER_MINUTE
-
-    # Clean up old entries
-    global request_times, token_usage
-    request_times = [t for t in request_times if t > minute_ago]
-    token_usage = [(t, tok) for t, tok in token_usage if t > minute_ago]
-
-    # Check request rate
-    if len(request_times) >= GROQ_RPM:
-        oldest_request = request_times[0]
-        return max(0, SECONDS_PER_MINUTE - (current_time - oldest_request))
-
-    # Check token rate
-    total_tokens = sum(tokens for _, tokens in token_usage)
-    if total_tokens >= GROQ_TPM * TOKEN_BUFFER:
-        return 1.0  # Wait a second if near token limit
-
-    return 0
-
-def estimate_tokens(text):
-    """Estimate tokens in text using GPT-3 tokenizer approximation"""
-    return len(text.split()) * 1.3  # Rough estimate: 1.3 tokens per word
 def extract_main_content(soup):
     """
     Extract the main content from a webpage while filtering out boilerplate content.
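
Review note: the removed `manage_rate_limits` implemented a sliding-window limiter — timestamps older than sixty seconds are dropped, and the wait is derived from the oldest surviving request. A self-contained sketch of that pattern (illustrative names, not the app's code):

    import time
    from collections import deque

    class SlidingWindowLimiter:
        """Allow at most `rpm` calls in any rolling 60-second window."""
        def __init__(self, rpm=30, window=60.0):
            self.rpm = rpm
            self.window = window
            self.calls = deque()  # timestamps of recent calls

        def wait_time(self):
            now = time.time()
            # Drop timestamps that fell out of the window
            while self.calls and self.calls[0] <= now - self.window:
                self.calls.popleft()
            if len(self.calls) < self.rpm:
                return 0.0
            # Wait until the oldest call ages out of the window
            return self.window - (now - self.calls[0])

        def record(self):
            self.calls.append(time.time())
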
@@ -191,13 +156,199 @@ def get_page_metadata(soup):
 
     return metadata
 
 def fetch_url_info(bookmark):
     """
     Fetch information about a URL.
     """
     url = bookmark['url']
     if url in fetch_cache:
-        with api_lock:
             bookmark.update(fetch_cache[url])
         return
 
@@ -216,20 +367,17 @@ def fetch_url_info(bookmark):
 
         if response.status_code >= 500:
             bookmark['dead_link'] = True
-            bookmark['description'] = ''
-            bookmark['html_content'] = ''
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
-            bookmark['description'] = ''
             logger.info(f"Fetched information for {url}")
 
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
         bookmark['status_code'] = 'Timeout'
-        bookmark['description'] = ''
         bookmark['html_content'] = ''
         bookmark['slow_link'] = True
         logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
@@ -237,11 +385,23 @@ def fetch_url_info(bookmark):
         bookmark['dead_link'] = True
         bookmark['etag'] = 'N/A'
         bookmark['status_code'] = 'Error'
-        bookmark['description'] = ''
         bookmark['html_content'] = ''
         logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
     finally:
-        with api_lock:
             fetch_cache[url] = {
                 'etag': bookmark.get('etag'),
                 'status_code': bookmark.get('status_code'),
@@ -251,87 +411,6 @@ def fetch_url_info(bookmark):
             'slow_link': bookmark.get('slow_link', False),
         }
 
-def process_bookmarks_batch(bookmarks_batch):
-    """Process a batch of bookmarks with controlled rate limiting"""
-    for bookmark in bookmarks_batch:
-        with LLM_SEMAPHORE:
-            while True:
-                with api_lock:
-                    wait_time = manage_rate_limits()
-                if wait_time > 0:
-                    logger.info(f"Rate limiting: Waiting for {wait_time:.2f} seconds...")
-                    time.sleep(wait_time)
-                    continue
-
-                try:
-                    html_content = bookmark.get('html_content', '')
-                    soup = BeautifulSoup(html_content, 'html.parser')
-                    metadata = get_page_metadata(soup)
-                    main_content = extract_main_content(soup)
-
-                    # Prepare shortened prompt to reduce tokens
-                    content = f"Title: {metadata['title']}\nURL: {bookmark['url']}"
-                    if len(main_content) > 1000:  # Limit content length
-                        main_content = main_content[:1000] + "..."
-
-                    prompt = f"""Analyze this webpage:
-{content}
-Content: {main_content}
-Provide in format:
-Summary: [2 sentences max]
-Category: [{', '.join(CATEGORIES)}]"""
-
-                    # Estimate tokens
-                    input_tokens = estimate_tokens(prompt)
-                    max_tokens = 150
-                    total_tokens = input_tokens + max_tokens
-
-                    # Make API call
-                    response = openai.ChatCompletion.create(
-                        model='llama-3.1-70b-versatile',
-                        messages=[{"role": "user", "content": prompt}],
-                        max_tokens=max_tokens,
-                        temperature=0.5,
-                    )
-
-                    # Track rate limits
-                    with api_lock:
-                        current_time = time.time()
-                        request_times.append(current_time)
-                        token_usage.append((current_time, total_tokens))
-
-                    content = response['choices'][0]['message']['content'].strip()
-
-                    # Process response
-                    summary_match = re.search(r"Summary:\s*(.*?)(?:\n|$)", content)
-                    category_match = re.search(r"Category:\s*(.*?)(?:\n|$)", content)
-
-                    bookmark['summary'] = summary_match.group(1).strip() if summary_match else 'No summary available.'
-
-                    if category_match:
-                        category = category_match.group(1).strip().strip('"')
-                        bookmark['category'] = category if category in CATEGORIES else 'Uncategorized'
-                    else:
-                        bookmark['category'] = 'Uncategorized'
-
-                    # Quick category validation
-                    if 'social media' in bookmark['url'].lower() or 'twitter' in bookmark['url'].lower() or 'x.com' in bookmark['url'].lower():
-                        bookmark['category'] = 'Social Media'
-                    elif 'wikipedia' in bookmark['url'].lower():
-                        bookmark['category'] = 'Reference and Knowledge Bases'
-
-                    logger.info(f"Successfully processed bookmark: {bookmark['url']}")
-                    break
-
-                except openai.error.RateLimitError as e:
-                    wait_time = int(e.headers.get('Retry-After', 5))
-                    logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
-                    time.sleep(wait_time)
-                except Exception as e:
-                    logger.error(f"Error processing bookmark: {e}")
-                    bookmark['summary'] = 'Processing failed.'
-                    bookmark['category'] = 'Uncategorized'
-                    break
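
Both the removed code above and its replacement recover structured fields from free-form LLM output with line-anchored regexes. A standalone check of that parsing step (the sample text is made up):

    import re

    sample = 'Summary: A community site for Python developers.\nCategory: "Technology"'
    summary_match = re.search(r"Summary:\s*(.*?)(?:\n|$)", sample)
    category_match = re.search(r"Category:\s*(.*?)(?:\n|$)", sample)
    summary = summary_match.group(1).strip() if summary_match else 'No summary available.'
    category = category_match.group(1).strip().strip('"') if category_match else 'Uncategorized'
    print(summary)   # A community site for Python developers.
    print(category)  # Technology
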
 def vectorize_and_index(bookmarks_list):
     """
     Create vector embeddings for bookmarks and build FAISS index with ID mapping.
@@ -339,7 +418,8 @@ def vectorize_and_index(bookmarks_list):
     global faiss_index
     logger.info("Vectorizing summaries and building FAISS index")
     try:
-        summaries = [bookmark['summary'] for bookmark in bookmarks_list]
         embeddings = embedding_model.encode(summaries)
         dimension = embeddings.shape[1]
         index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
@@ -364,19 +444,26 @@ def display_bookmarks():
             status = "❌ Dead Link"
             card_style = "border: 2px solid red;"
             text_style = "color: white;"
         elif bookmark.get('slow_link'):
             status = "⏳ Slow Response"
             card_style = "border: 2px solid orange;"
             text_style = "color: white;"
         else:
             status = "✅ Active"
             card_style = "border: 2px solid green;"
             text_style = "color: white;"
 
         title = bookmark['title']
         url = bookmark['url']
         etag = bookmark.get('etag', 'N/A')
-        summary = bookmark.get('summary', '')
         category = bookmark.get('category', 'Uncategorized')
 
         # Escape HTML content to prevent XSS attacks
@@ -403,7 +490,7 @@
 
 def process_uploaded_file(file, state_bookmarks):
     """
-    Process uploaded file with optimized batch processing
     """
     global bookmarks, faiss_index
     logger.info("Processing uploaded file")
@@ -414,62 +501,52 @@ def process_uploaded_file(file, state_bookmarks):
 
     try:
         file_content = file.decode('utf-8')
         bookmarks = parse_bookmarks(file_content)
 
-        if not bookmarks:
-            return "No bookmarks found in the file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
-
-        # Assign IDs
-        for idx, bookmark in enumerate(bookmarks):
-            bookmark['id'] = idx
-
-        # First fetch all URLs concurrently
-        with ThreadPoolExecutor(max_workers=10) as executor:
-            executor.map(fetch_url_info, bookmarks)
-
-        # Process bookmarks in parallel with controlled concurrency
-        batch_size = min(MAX_CONCURRENT_CALLS, len(bookmarks))
-        batches = [bookmarks[i:i + batch_size] for i in range(0, len(bookmarks), batch_size)]
-
-        with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_CALLS) as executor:
-            executor.map(process_bookmarks_batch, batches)
-
-        # Build FAISS index
         faiss_index = vectorize_and_index(bookmarks)
 
-        # Update display and state
-        bookmark_html = display_bookmarks()
-        choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
-                   for i, bookmark in enumerate(bookmarks)]
-        state_bookmarks = bookmarks.copy()
-
-        return "✅ Processing complete!", bookmark_html, state_bookmarks, bookmark_html, gr.update(choices=choices)
-
-    except Exception as e:
-        logger.error(f"Error processing file: {e}")
-        return f"Error processing file: {str(e)}", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
-
-def parse_bookmarks(file_content):
-    """
-    Parse bookmarks from HTML file.
-    """
-    logger.info("Parsing bookmarks")
-    try:
-        soup = BeautifulSoup(file_content, 'html.parser')
-        extracted_bookmarks = []
-        for link in soup.find_all('a'):
-            url = link.get('href')
-            title = link.text.strip()
-            if url and title:
-                if url.startswith('http://') or url.startswith('https://'):
-                    extracted_bookmarks.append({'url': url, 'title': title})
-                else:
-                    logger.info(f"Skipping non-http/https URL: {url}")
-        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
-        return extracted_bookmarks
-    except Exception as e:
-        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
-        raise
 
 def delete_selected_bookmarks(selected_indices, state_bookmarks):
     """
@@ -533,6 +610,7 @@ def edit_selected_bookmarks_category(selected_indices, new_category, state_bookm
     state_bookmarks = bookmarks.copy()
 
     return message, gr.update(choices=choices), display_bookmarks(), state_bookmarks
 def export_bookmarks():
     """
     Export bookmarks to an HTML file.
@@ -576,76 +654,75 @@ def chatbot_response(user_query, chat_history):
     try:
         chat_history.append({"role": "user", "content": user_query})
 
-        with LLM_SEMAPHORE:
-            while True:
-                with api_lock:
-                    wait_time = manage_rate_limits()
-                if wait_time > 0:
-                    logger.info(f"Rate limiting: Waiting for {wait_time:.2f} seconds...")
-                    time.sleep(wait_time)
-                    continue
-
-                try:
-                    # Search for relevant bookmarks
-                    query_vector = embedding_model.encode([user_query]).astype('float32')
-                    k = 5
-                    distances, ids = faiss_index.search(query_vector, k)
-                    ids = ids.flatten()
-
-                    id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
-                    matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark]
-
-                    if not matching_bookmarks:
-                        answer = "No relevant bookmarks found for your query."
-                        chat_history.append({"role": "assistant", "content": answer})
-                        return chat_history
-
-                    # Prepare concise prompt
-                    bookmarks_info = "\n".join([
-                        f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
-                        for bookmark in matching_bookmarks
-                    ])
-
-                    prompt = f"""User Query: "{user_query}"
-Found Bookmarks:
 {bookmarks_info}
-Provide a helpful, concise response."""
-
-                    # Estimate tokens and make API call
-                    input_tokens = estimate_tokens(prompt)
-                    max_tokens = 300
-                    total_tokens = input_tokens + max_tokens
-
-                    response = openai.ChatCompletion.create(
-                        model='llama-3.1-70b-versatile',
-                        messages=[{"role": "user", "content": prompt}],
-                        max_tokens=max_tokens,
-                        temperature=0.7,
-                    )
-
-                    # Track rate limits
-                    with api_lock:
-                        current_time = time.time()
-                        request_times.append(current_time)
-                        token_usage.append((current_time, total_tokens))
-
-                    answer = response['choices'][0]['message']['content'].strip()
-                    logger.info("Chatbot response generated")
-
-                    chat_history.append({"role": "assistant", "content": answer})
-                    return chat_history
-
-                except openai.error.RateLimitError as e:
-                    wait_time = int(e.headers.get('Retry-After', 5))
-                    logger.warning(f"Rate limit hit, waiting {wait_time} seconds...")
-                    time.sleep(wait_time)
-                    continue
-                except Exception as e:
-                    error_message = f"⚠️ Error processing your query: {str(e)}"
-                    logger.error(error_message, exc_info=True)
-                    chat_history.append({"role": "assistant", "content": error_message})
-                    return chat_history
 
     except Exception as e:
         error_message = f"⚠️ Error processing your query: {str(e)}"
         logger.error(error_message, exc_info=True)
@@ -767,13 +844,13 @@ Navigate through the tabs to explore each feature in detail.
     """)
 
     manage_output = gr.Textbox(label="🔄 Status", interactive=False)
-
-    # Move bookmark_selector here
     bookmark_selector = gr.CheckboxGroup(
         label="✅ Select Bookmarks",
         choices=[]
     )
-
    new_category = gr.Dropdown(
         label="🆕 New Category",
         choices=CATEGORIES,
@@ -832,4 +909,4 @@ Navigate through the tabs to explore each feature in detail.
     print(f"Error building the app: {e}")
 
 if __name__ == "__main__":
-    build_app()
 
 bookmarks = []
 fetch_cache = {}
 
+# Lock for thread-safe operations
+lock = threading.Lock()
 
 # Define the categories
 CATEGORIES = [
 
 openai.api_key = GROQ_API_KEY
 openai.api_base = "https://api.groq.com/openai/v1"
 
+# Initialize global variables for rate limiting
+api_lock = threading.Lock()
+last_api_call_time = 0
 
 def extract_main_content(soup):
     """
     Extract the main content from a webpage while filtering out boilerplate content.
 
     return metadata
 
+def generate_summary_and_assign_category(bookmark):
+    """
+    Generate a concise summary and assign a category using a single LLM call.
+    For slow links, always provide a summary.
+    For dead links, provide a summary if possible; otherwise, ignore.
+    """
+    logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
+
+    max_retries = 3
+    retry_count = 0
+    metadata = {}  # Ensure metadata exists if an exception is raised before it is set
+
+    while retry_count < max_retries:
+        try:
+            # Rate-limiting logic
+            with api_lock:
+                global last_api_call_time
+                current_time = time.time()
+                elapsed = current_time - last_api_call_time
+                if elapsed < 2:
+                    sleep_duration = 2 - elapsed
+                    logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
+                    time.sleep(sleep_duration)
+                last_api_call_time = time.time()
+
+            html_content = bookmark.get('html_content', '')
+            soup = BeautifulSoup(html_content, 'html.parser')
+            metadata = get_page_metadata(soup)
+            main_content = extract_main_content(soup)
+
+            # Prepare content for the prompt
+            content_parts = []
+            if metadata['title']:
+                content_parts.append(f"Title: {metadata['title']}")
+            if metadata['description']:
+                content_parts.append(f"Description: {metadata['description']}")
+            if metadata['keywords']:
+                content_parts.append(f"Keywords: {metadata['keywords']}")
+            if main_content:
+                content_parts.append(f"Main Content: {main_content}")
+
+            content_text = '\n'.join(content_parts)
+
+            # Detect insufficient or erroneous content
+            error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
+            if not content_text or len(content_text.split()) < 50:
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
+            elif any(keyword.lower() in content_text.lower() for keyword in error_keywords):
+                use_prior_knowledge = True
+                logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
+            else:
+                use_prior_knowledge = False
+
+            if use_prior_knowledge:
+                prompt = f"""
+You are a knowledgeable assistant with up-to-date information as of 2023.
+URL: {bookmark.get('url')}
+Provide:
+1. A concise summary (max two sentences) about this website.
+2. Assign the most appropriate category from the list below.
+Categories:
+{', '.join([f'"{cat}"' for cat in CATEGORIES])}
+Format:
+Summary: [Your summary]
+Category: [One category]
+"""
+            else:
+                prompt = f"""
+You are an assistant that creates concise webpage summaries and assigns categories.
+Content:
+{content_text}
+Provide:
+1. A concise summary (max two sentences) focusing on the main topic.
+2. Assign the most appropriate category from the list below.
+Categories:
+{', '.join([f'"{cat}"' for cat in CATEGORIES])}
+Format:
+Summary: [Your summary]
+Category: [One category]
+"""
+
+            def estimate_tokens(text):
+                return len(text) / 4
+
+            prompt_tokens = estimate_tokens(prompt)
+            max_tokens = 150
+            total_tokens = prompt_tokens + max_tokens
+
+            tokens_per_minute = 40000
+            tokens_per_second = tokens_per_minute / 60
+            required_delay = total_tokens / tokens_per_second
+            sleep_time = max(required_delay, 2)
+
+            response = openai.ChatCompletion.create(
+                model='llama-3.1-70b-versatile',
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                max_tokens=int(max_tokens),
+                temperature=0.5,
+            )
+
+            content = response['choices'][0]['message']['content'].strip()
+            if not content:
+                raise ValueError("Empty response received from the model.")
+
+            summary_match = re.search(r"Summary:\s*(.*)", content, re.IGNORECASE)
+            category_match = re.search(r"Category:\s*(.*)", content, re.IGNORECASE)
+
+            # Extract summary
+            if summary_match:
+                summary = summary_match.group(1).strip()
+                if summary:
+                    bookmark['summary'] = summary
+                else:
+                    # For slow links, fall back to metadata; for dead links, leave the summary empty
+                    if bookmark.get('slow_link', False):
+                        bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
+                    else:
+                        bookmark['summary'] = ''
+            else:
+                if bookmark.get('slow_link', False):
+                    bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
+                else:
+                    bookmark['summary'] = ''
+
+            # Extract category
+            if category_match:
+                category = category_match.group(1).strip().strip('"')
+                bookmark['category'] = category if category in CATEGORIES else 'Uncategorized'
+            else:
+                bookmark['category'] = 'Uncategorized'
+
+            # Simple keyword-based validation
+            summary_lower = bookmark.get('summary', '').lower()
+            url_lower = bookmark['url'].lower()
+            if 'social media' in summary_lower or 'twitter' in summary_lower or 'x.com' in url_lower:
+                bookmark['category'] = 'Social Media'
+            elif 'wikipedia' in url_lower:
+                bookmark['category'] = 'Reference and Knowledge Bases'
+
+            logger.info("Successfully generated summary and assigned category")
+            time.sleep(sleep_time)
+            break
+
+        except openai.error.RateLimitError as e:
+            retry_count += 1
+            wait_time = int(e.headers.get("Retry-After", 5))
+            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
+            time.sleep(wait_time)
+        except Exception as e:
+            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
+            # For slow links, provide a summary from metadata or the title
+            if bookmark.get('slow_link', False):
+                bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
+            # For dead links, attempt to set a summary; if not possible, leave it empty
+            elif bookmark.get('dead_link', False):
+                bookmark['summary'] = metadata.get('description') or metadata.get('title') or ''
+            else:
+                bookmark['summary'] = 'No summary available.'
+            bookmark['category'] = 'Uncategorized'
+            break
+
+def parse_bookmarks(file_content):
+    """
+    Parse bookmarks from HTML file.
+    """
+    logger.info("Parsing bookmarks")
+    try:
+        soup = BeautifulSoup(file_content, 'html.parser')
+        extracted_bookmarks = []
+        for link in soup.find_all('a'):
+            url = link.get('href')
+            title = link.text.strip()
+            if url and title:
+                if url.startswith('http://') or url.startswith('https://'):
+                    extracted_bookmarks.append({'url': url, 'title': title})
+                else:
+                    logger.info(f"Skipping non-http/https URL: {url}")
+        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
+        return extracted_bookmarks
+    except Exception as e:
+        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
+        raise
+
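
The new rate limiting trades the old sliding window for a minimum spacing between calls, guarded by `api_lock`, and paces token usage by sleeping in proportion to an estimated budget of roughly four characters per token. A condensed sketch of both pieces (standalone names, not the app's globals):

    import threading
    import time

    _lock = threading.Lock()
    _last_call = 0.0

    def pace_call(min_interval=2.0):
        """Block until at least `min_interval` seconds since the previous call."""
        global _last_call
        with _lock:
            elapsed = time.time() - _last_call
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            _last_call = time.time()

    def post_call_delay(prompt, max_tokens=150, tokens_per_minute=40000):
        """Delay needed so estimated usage stays under the per-minute token budget."""
        est_tokens = len(prompt) / 4 + max_tokens  # ~4 characters per token
        return max(est_tokens / (tokens_per_minute / 60), 2.0)
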
 def fetch_url_info(bookmark):
     """
     Fetch information about a URL.
     """
     url = bookmark['url']
     if url in fetch_cache:
+        with lock:
             bookmark.update(fetch_cache[url])
         return
 
 
         if response.status_code >= 500:
             bookmark['dead_link'] = True
+            bookmark['html_content'] = content  # Keep content to extract metadata if possible
             logger.warning(f"Dead link detected: {url} with status {response.status_code}")
         else:
             bookmark['dead_link'] = False
             bookmark['html_content'] = content
             logger.info(f"Fetched information for {url}")
 
     except requests.exceptions.Timeout:
         bookmark['dead_link'] = False
         bookmark['etag'] = 'N/A'
         bookmark['status_code'] = 'Timeout'
         bookmark['html_content'] = ''
         bookmark['slow_link'] = True
         logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
 
         bookmark['dead_link'] = True
         bookmark['etag'] = 'N/A'
         bookmark['status_code'] = 'Error'
         bookmark['html_content'] = ''
         logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)
     finally:
+        # Extract the meta description for dead links if content is available
+        if bookmark.get('dead_link', False) and bookmark.get('html_content'):
+            soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
+            metadata = get_page_metadata(soup)
+            bookmark['description'] = metadata.get('description', '')
+        elif not bookmark.get('dead_link', False):
+            # For active and slow links, attempt to extract the description
+            soup = BeautifulSoup(bookmark['html_content'], 'html.parser')
+            metadata = get_page_metadata(soup)
+            bookmark['description'] = metadata.get('description', '')
+        else:
+            bookmark['description'] = ''
+
+        with lock:
             fetch_cache[url] = {
                 'etag': bookmark.get('etag'),
                 'status_code': bookmark.get('status_code'),
 
                 'slow_link': bookmark.get('slow_link', False),
             }
 
 def vectorize_and_index(bookmarks_list):
     """
     Create vector embeddings for bookmarks and build FAISS index with ID mapping.
 
     global faiss_index
     logger.info("Vectorizing summaries and building FAISS index")
     try:
+        # Safely access 'summary' using .get() to avoid KeyError
+        summaries = [bookmark.get('summary', '') for bookmark in bookmarks_list]
         embeddings = embedding_model.encode(summaries)
         dimension = embeddings.shape[1]
         index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
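
`IndexIDMap` wraps the flat L2 index so vectors are stored under the bookmark IDs assigned earlier, letting search results map straight back to bookmarks. A minimal usage sketch (random vectors stand in for the sentence embeddings):

    import faiss
    import numpy as np

    dimension = 8
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    vectors = np.random.rand(3, dimension).astype('float32')
    ids = np.array([10, 20, 30], dtype='int64')
    index.add_with_ids(vectors, ids)

    distances, found_ids = index.search(vectors[:1], 2)
    print(found_ids)  # e.g. [[10 20]] -- IDs usable as dict keys back into the bookmarks list
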
 
             status = "❌ Dead Link"
             card_style = "border: 2px solid red;"
             text_style = "color: white;"
+            # For dead links, use 'summary' if available
+            summary = bookmark.get('summary', '')
+            if not summary:
+                # Optionally, you can skip setting summary or provide a default message
+                summary = 'No summary available.'
         elif bookmark.get('slow_link'):
             status = "⏳ Slow Response"
             card_style = "border: 2px solid orange;"
             text_style = "color: white;"
+            # For slow links, always provide a summary
+            summary = bookmark.get('summary', 'No summary available.')
         else:
             status = "✅ Active"
             card_style = "border: 2px solid green;"
             text_style = "color: white;"
+            summary = bookmark.get('summary', 'No summary available.')
 
         title = bookmark['title']
         url = bookmark['url']
         etag = bookmark.get('etag', 'N/A')
         category = bookmark.get('category', 'Uncategorized')
 
         # Escape HTML content to prevent XSS attacks
 
 def process_uploaded_file(file, state_bookmarks):
     """
+    Process the uploaded bookmarks file.
     """
     global bookmarks, faiss_index
     logger.info("Processing uploaded file")
 
     try:
         file_content = file.decode('utf-8')
+    except UnicodeDecodeError as e:
+        logger.error(f"Error decoding the file: {e}", exc_info=True)
+        return "Error decoding the file. Please ensure it's a valid HTML file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
+
+    try:
         bookmarks = parse_bookmarks(file_content)
+    except Exception as e:
+        logger.error(f"Error parsing bookmarks: {e}", exc_info=True)
+        return "Error parsing the bookmarks HTML file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
 
+    if not bookmarks:
+        logger.warning("No bookmarks found in the uploaded file")
+        return "No bookmarks found in the uploaded file.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
 
+    # Assign unique IDs to bookmarks
+    for idx, bookmark in enumerate(bookmarks):
+        bookmark['id'] = idx
 
+    # Fetch bookmark info concurrently
+    logger.info("Fetching URL info concurrently")
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        executor.map(fetch_url_info, bookmarks)
 
+    # Process bookmarks with LLM calls, serialized via a single worker to respect rate limits
+    logger.info("Processing bookmarks with LLM concurrently")
+    with ThreadPoolExecutor(max_workers=1) as executor:
+        executor.map(generate_summary_and_assign_category, bookmarks)
 
+    try:
         faiss_index = vectorize_and_index(bookmarks)
+    except Exception as e:
+        logger.error(f"Error building FAISS index: {e}", exc_info=True)
+        return "Error building search index.", '', state_bookmarks, display_bookmarks(), gr.update(choices=[])
 
+    message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
+    logger.info(message)
 
+    # Generate displays and updates
+    bookmark_html = display_bookmarks()
+    choices = [f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
+               for i, bookmark in enumerate(bookmarks)]
 
+    # Update state
+    state_bookmarks = bookmarks.copy()
 
+    return message, bookmark_html, state_bookmarks, bookmark_html, gr.update(choices=choices)
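
Note the asymmetry above: URL fetching is network-bound and runs on ten workers, while the LLM stage uses `max_workers=1`, so the executor serializes those calls and the lock-based pacing can hold. The two-stage shape in isolation (hypothetical `fetch`/`summarize` callables):

    from concurrent.futures import ThreadPoolExecutor

    def process_all(items, fetch, summarize):
        # Stage 1: I/O-bound fetches run in parallel
        with ThreadPoolExecutor(max_workers=10) as pool:
            list(pool.map(fetch, items))
        # Stage 2: rate-limited LLM calls run one at a time
        with ThreadPoolExecutor(max_workers=1) as pool:
            list(pool.map(summarize, items))
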
 
 def delete_selected_bookmarks(selected_indices, state_bookmarks):
     """
 
     state_bookmarks = bookmarks.copy()
 
     return message, gr.update(choices=choices), display_bookmarks(), state_bookmarks
+
 def export_bookmarks():
     """
     Export bookmarks to an HTML file.
 
     try:
         chat_history.append({"role": "user", "content": user_query})
 
+        with api_lock:
+            global last_api_call_time
+            current_time = time.time()
+            elapsed = current_time - last_api_call_time
+            if elapsed < 2:
+                sleep_duration = 2 - elapsed
+                logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
+                time.sleep(sleep_duration)
+            last_api_call_time = time.time()
+
+        query_vector = embedding_model.encode([user_query]).astype('float32')
+        k = 5
+        distances, ids = faiss_index.search(query_vector, k)
+        ids = ids.flatten()
+
+        id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
+        matching_bookmarks = [id_to_bookmark.get(id) for id in ids if id in id_to_bookmark and id_to_bookmark.get(id).get('summary')]
+
+        if not matching_bookmarks:
+            answer = "No relevant bookmarks found for your query."
+            chat_history.append({"role": "assistant", "content": answer})
+            return chat_history
+
+        bookmarks_info = "\n".join([
+            f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark['summary']}"
+            for bookmark in matching_bookmarks
+        ])
+
+        prompt = f"""
+A user asked: "{user_query}"
+Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.
+Bookmarks:
 {bookmarks_info}
+Provide a concise and helpful response.
+"""
+
+        def estimate_tokens(text):
+            return len(text) / 4
+
+        prompt_tokens = estimate_tokens(prompt)
+        max_tokens = 300
+        total_tokens = prompt_tokens + max_tokens
+
+        tokens_per_minute = 40000
+        tokens_per_second = tokens_per_minute / 60
+        required_delay = total_tokens / tokens_per_second
+        sleep_time = max(required_delay, 2)
+
+        response = openai.ChatCompletion.create(
+            model='llama-3.1-70b-versatile',
+            messages=[
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=int(max_tokens),
+            temperature=0.7,
+        )
+
+        answer = response['choices'][0]['message']['content'].strip()
+        logger.info("Chatbot response generated")
+        time.sleep(sleep_time)
+
+        chat_history.append({"role": "assistant", "content": answer})
+        return chat_history
 
+    except openai.error.RateLimitError as e:
+        wait_time = int(e.headers.get("Retry-After", 5))
+        logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
+        time.sleep(wait_time)
+        return chatbot_response(user_query, chat_history)
     except Exception as e:
         error_message = f"⚠️ Error processing your query: {str(e)}"
         logger.error(error_message, exc_info=True)
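
One caveat in the `RateLimitError` branch above: retrying by calling `chatbot_response` recursively re-runs the whole handler (re-appending the user message) with no retry cap. An iterative variant of the same Retry-After handling, sketched under the app's `openai`-0.x assumptions:

    import time
    import logging
    import openai

    logger = logging.getLogger(__name__)

    def call_with_retries(make_request, max_retries=3):
        for attempt in range(max_retries):
            try:
                return make_request()
            except openai.error.RateLimitError as e:
                wait_time = int(e.headers.get("Retry-After", 5))
                logger.warning(f"Rate limit reached; retrying in {wait_time}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(wait_time)
        raise RuntimeError("Rate limit retries exhausted")
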
 
     """)
 
     manage_output = gr.Textbox(label="🔄 Status", interactive=False)
+
+    # CheckboxGroup for selecting bookmarks
     bookmark_selector = gr.CheckboxGroup(
         label="✅ Select Bookmarks",
         choices=[]
     )
+
     new_category = gr.Dropdown(
         label="🆕 New Category",
         choices=CATEGORIES,
 
     print(f"Error building the app: {e}")
 
 if __name__ == "__main__":
+    build_app()