siddhartharya committed on
Commit
2e66ec2
·
verified ·
1 Parent(s): 2ff005a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -111
app.py CHANGED
@@ -8,14 +8,13 @@ import numpy as np
8
  import requests
9
  import time
10
  import re
11
- import base64
12
  import logging
13
  import os
14
  import sys
15
  import concurrent.futures
16
  from concurrent.futures import ThreadPoolExecutor
17
  import threading
18
- from collections import deque
19
 
20
  # Import OpenAI library
21
  import openai
@@ -105,7 +104,6 @@ class TokenBucket:
105
  with self.lock:
106
  now = time.time()
107
  elapsed = now - self.timestamp
108
- # Refill tokens
109
  refill = elapsed * self.rate
110
  self.tokens = min(self.capacity, self.tokens + refill)
111
  self.timestamp = now
@@ -126,93 +124,28 @@ tpm_rate = TPM_LIMIT / 60 # tokens per second
126
  rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
127
  tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
128
 
129
- def extract_main_content(soup):
130
- """
131
- Extract the main content from a webpage while filtering out boilerplate content.
132
- """
133
- if not soup:
134
- return ""
135
-
136
- # Remove unwanted elements
137
- for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
138
- element.decompose()
139
 
140
- # Extract text from <p> tags
141
- p_tags = soup.find_all('p')
142
- if p_tags:
143
- content = ' '.join([p.get_text(strip=True, separator=' ') for p in p_tags])
144
- else:
145
- # Fallback to body content
146
- content = soup.get_text(separator=' ', strip=True)
147
-
148
- # Clean up the text
149
- content = re.sub(r'\s+', ' ', content)
150
-
151
- # Truncate content to a reasonable length (e.g., 1500 words)
152
- words = content.split()
153
- if len(words) > 1500:
154
- content = ' '.join(words[:1500])
155
-
156
- return content
157
-
158
- def get_page_metadata(soup):
159
  """
160
- Extract metadata from the webpage including title, description, and keywords.
161
  """
162
- metadata = {
163
- 'title': '',
164
- 'description': '',
165
- 'keywords': ''
166
- }
167
-
168
- if not soup:
169
- return metadata
170
-
171
- # Get title
172
- title_tag = soup.find('title')
173
- if title_tag and title_tag.string:
174
- metadata['title'] = title_tag.string.strip()
175
-
176
- # Get meta description
177
- meta_desc = (
178
- soup.find('meta', attrs={'name': 'description'}) or
179
- soup.find('meta', attrs={'property': 'og:description'}) or
180
- soup.find('meta', attrs={'name': 'twitter:description'})
181
- )
182
- if meta_desc:
183
- metadata['description'] = meta_desc.get('content', '').strip()
184
-
185
- # Get meta keywords
186
- meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
187
- if meta_keywords:
188
- metadata['keywords'] = meta_keywords.get('content', '').strip()
189
-
190
- # Get OG title if main title is empty
191
- if not metadata['title']:
192
- og_title = soup.find('meta', attrs={'property': 'og:title'})
193
- if og_title:
194
- metadata['title'] = og_title.get('content', '').strip()
195
-
196
- return metadata
197
-
198
- def generate_summary_and_assign_category(bookmark):
199
- """
200
- Generate a concise summary and assign a category using a single LLM call.
201
- For slow links, always provide a summary.
202
- For dead links, provide a summary if possible; otherwise, ignore.
203
- """
204
- logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")
205
 
206
- max_retries = 3
207
- retry_count = 0
 
208
 
209
- while retry_count < max_retries:
210
  try:
211
  # Rate Limiting
212
  rpm_bucket.wait_for_token()
213
- # Estimate tokens: prompt + max_tokens
214
- # Here, we assume max_tokens=150
215
- tpm_bucket.wait_for_token(tokens=150)
216
 
217
  html_content = bookmark.get('html_content', '')
218
  soup = BeautifulSoup(html_content, 'html.parser')
@@ -272,17 +205,13 @@ Category: [One category]
272
  """
273
 
274
  def estimate_tokens(text):
275
- return len(text) / 4
276
 
277
  prompt_tokens = estimate_tokens(prompt)
278
  max_tokens = 150
279
  total_tokens = prompt_tokens + max_tokens
280
 
281
- tokens_per_minute = 40000
282
- tokens_per_second = tokens_per_minute / 60
283
- required_delay = total_tokens / tokens_per_second
284
- sleep_time = max(required_delay, 2)
285
-
286
  response = openai.ChatCompletion.create(
287
  model='llama-3.1-70b-versatile',
288
  messages=[
@@ -333,16 +262,13 @@ Category: [One category]
333
  bookmark['category'] = 'Reference and Knowledge Bases'
334
 
335
  logger.info("Successfully generated summary and assigned category")
336
- time.sleep(sleep_time)
337
- break
338
-
339
  except openai.error.RateLimitError as e:
340
- retry_count += 1
341
- wait_time = int(e.headers.get("Retry-After", 5))
342
- logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {retry_count}/{max_retries})")
343
- time.sleep(wait_time)
344
  except Exception as e:
345
- logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
346
  # For slow links, provide a summary from metadata or title
347
  if bookmark.get('slow_link', False):
348
  bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
@@ -352,7 +278,120 @@ Category: [One category]
352
  else:
353
  bookmark['summary'] = 'No summary available.'
354
  bookmark['category'] = 'Uncategorized'
355
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  def parse_bookmarks(file_content):
358
  """
@@ -558,10 +597,14 @@ def process_uploaded_file(file, state_bookmarks):
558
  with ThreadPoolExecutor(max_workers=10) as executor:
559
  executor.map(fetch_url_info, bookmarks)
560
 
561
- # Process bookmarks concurrently with LLM calls
562
- logger.info("Processing bookmarks with LLM concurrently")
563
- with ThreadPoolExecutor(max_workers=5) as executor:
564
- executor.map(generate_summary_and_assign_category, bookmarks)
 
 
 
 
565
 
566
  try:
567
  faiss_index = vectorize_and_index(bookmarks)
@@ -690,7 +733,7 @@ def chatbot_response(user_query, chat_history):
690
 
691
  # Rate Limiting
692
  rpm_bucket.wait_for_token()
693
- tpm_bucket.wait_for_token(tokens=300) # Assuming max_tokens=300
694
 
695
  query_vector = embedding_model.encode([user_query]).astype('float32')
696
  k = 5
@@ -720,17 +763,12 @@ Provide a concise and helpful response.
720
  """
721
 
722
  def estimate_tokens(text):
723
- return len(text) / 4
724
 
725
  prompt_tokens = estimate_tokens(prompt)
726
  max_tokens = 300
727
  total_tokens = prompt_tokens + max_tokens
728
 
729
- tokens_per_minute = 40000
730
- tokens_per_second = tokens_per_minute / 60
731
- required_delay = total_tokens / tokens_per_second
732
- sleep_time = max(required_delay, 2)
733
-
734
  response = openai.ChatCompletion.create(
735
  model='llama-3.1-70b-versatile',
736
  messages=[
@@ -742,14 +780,13 @@ Provide a concise and helpful response.
742
 
743
  answer = response['choices'][0]['message']['content'].strip()
744
  logger.info("Chatbot response generated")
745
- time.sleep(sleep_time)
746
 
747
  chat_history.append({"role": "assistant", "content": answer})
748
  return chat_history
749
 
750
  except openai.error.RateLimitError as e:
751
- wait_time = int(e.headers.get("Retry-After", 5))
752
- logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
753
  time.sleep(wait_time)
754
  return chatbot_response(user_query, chat_history)
755
  except Exception as e:
 
8
  import requests
9
  import time
10
  import re
 
11
  import logging
12
  import os
13
  import sys
14
  import concurrent.futures
15
  from concurrent.futures import ThreadPoolExecutor
16
  import threading
17
+ from queue import Queue, Empty
18
 
19
  # Import OpenAI library
20
  import openai
 
104
  with self.lock:
105
  now = time.time()
106
  elapsed = now - self.timestamp
 
107
  refill = elapsed * self.rate
108
  self.tokens = min(self.capacity, self.tokens + refill)
109
  self.timestamp = now
 
124
  rpm_bucket = TokenBucket(rate=rpm_rate, capacity=RPM_LIMIT)
125
  tpm_bucket = TokenBucket(rate=tpm_rate, capacity=TPM_LIMIT)
126
 
127
+ # Queue for LLM tasks
128
+ llm_queue = Queue()
 
 
 
 
 
 
 
 
129
 
130
+ def llm_worker():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  """
132
+ Worker thread to process LLM tasks from the queue while respecting rate limits.
133
  """
134
+ logger.info("LLM worker started.")
135
+ while True:
136
+ try:
137
+ bookmark = llm_queue.get(timeout=60) # Wait for a task
138
+ except Empty:
139
+ continue # No task, continue waiting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ if bookmark is None:
142
+ logger.info("LLM worker shutting down.")
143
+ break # Exit signal
144
 
 
145
  try:
146
  # Rate Limiting
147
  rpm_bucket.wait_for_token()
148
+ tpm_bucket.wait_for_token(tokens=150) # Assuming max_tokens=150 per request
 
 
149
 
150
  html_content = bookmark.get('html_content', '')
151
  soup = BeautifulSoup(html_content, 'html.parser')
 
205
  """
206
 
207
  def estimate_tokens(text):
208
+ return len(text) / 4 # Approximation
209
 
210
  prompt_tokens = estimate_tokens(prompt)
211
  max_tokens = 150
212
  total_tokens = prompt_tokens + max_tokens
213
 
214
+ # Prepare the prompt with token estimation
 
 
 
 
215
  response = openai.ChatCompletion.create(
216
  model='llama-3.1-70b-versatile',
217
  messages=[
 
262
  bookmark['category'] = 'Reference and Knowledge Bases'
263
 
264
  logger.info("Successfully generated summary and assigned category")
 
 
 
265
  except openai.error.RateLimitError as e:
266
+ logger.warning(f"LLM Rate limit reached while processing {bookmark.get('url')}. Retrying later...")
267
+ # Re-enqueue the bookmark for retry
268
+ llm_queue.put(bookmark)
269
+ time.sleep(60) # Wait before retrying
270
  except Exception as e:
271
+ logger.error(f"Error generating summary and assigning category for {bookmark.get('url')}: {e}", exc_info=True)
272
  # For slow links, provide a summary from metadata or title
273
  if bookmark.get('slow_link', False):
274
  bookmark['summary'] = metadata.get('description') or metadata.get('title') or 'No summary available.'
 
278
  else:
279
  bookmark['summary'] = 'No summary available.'
280
  bookmark['category'] = 'Uncategorized'
281
+ finally:
282
+ llm_queue.task_done()
283
+
284
+ # Start the LLM worker thread
285
+ llm_thread = threading.Thread(target=llm_worker, daemon=True)
286
+ llm_thread.start()
287
+
288
def extract_main_content(soup):
    """
    Pull the readable body text out of a parsed page, dropping boilerplate.

    Returns an empty string when no parsed document is supplied. The result
    is whitespace-normalized and capped at 1500 words.
    """
    if not soup:
        return ""

    # Strip elements that never carry article text.
    boilerplate = ['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']
    for node in soup(boilerplate):
        node.decompose()

    # Prefer paragraph text; fall back to the whole document body.
    paragraphs = soup.find_all('p')
    if paragraphs:
        pieces = [p.get_text(strip=True, separator=' ') for p in paragraphs]
        text = ' '.join(pieces)
    else:
        text = soup.get_text(separator=' ', strip=True)

    # Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s+', ' ', text)

    # Truncate to 1500 words to keep downstream prompt sizes bounded.
    tokens = text.split()
    if len(tokens) > 1500:
        text = ' '.join(tokens[:1500])

    return text
316
+
317
def get_page_metadata(soup):
    """
    Collect title, description and keywords metadata from a parsed page.

    Always returns a dict with 'title', 'description' and 'keywords' keys;
    values default to empty strings when the page (or a given tag) is missing.
    """
    metadata = {'title': '', 'description': '', 'keywords': ''}
    if not soup:
        return metadata

    # <title> text, when present and non-empty.
    title_tag = soup.find('title')
    if title_tag and title_tag.string:
        metadata['title'] = title_tag.string.strip()

    # First available description: standard meta, then OpenGraph, then Twitter.
    for attrs in ({'name': 'description'},
                  {'property': 'og:description'},
                  {'name': 'twitter:description'}):
        desc_tag = soup.find('meta', attrs=attrs)
        if desc_tag:
            metadata['description'] = desc_tag.get('content', '').strip()
            break

    # Meta keywords, when declared.
    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
    if keywords_tag:
        metadata['keywords'] = keywords_tag.get('content', '').strip()

    # Fall back to the OpenGraph title when <title> was empty.
    if not metadata['title']:
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title:
            metadata['title'] = og_title.get('content', '').strip()

    return metadata
356
+
357
def generate_summary_and_assign_category(bookmark):
    """
    Generate a concise summary and assign a category for one bookmark.

    Prefers page metadata (description, then title) as a cheap summary and
    categorizes via keyword matching; only bookmarks with no usable metadata
    are enqueued for LLM processing.
    """
    description = bookmark.get('description', '').strip()
    title = bookmark.get('title', '').strip()

    if description:
        # The meta description doubles as the summary.
        bookmark['summary'] = description
        assign_category_based_on_summary(bookmark)
        logger.info(f"Summary derived from metadata for {bookmark.get('url')}")
        return

    if title:
        # Fall back to the page title as the summary.
        bookmark['summary'] = title
        assign_category_based_on_summary(bookmark)
        logger.info(f"Summary derived from title for {bookmark.get('url')}")
        return

    # No usable metadata: hand the bookmark to the LLM worker queue.
    logger.info(f"No sufficient metadata for {bookmark.get('url')}. Enqueuing for LLM summary generation.")
    llm_queue.put(bookmark)
382
+
383
def assign_category_based_on_summary(bookmark):
    """
    Assign category based on simple keyword matching in the summary.

    Mutates *bookmark* in place, setting 'category' from keywords found in
    the summary text or the URL; defaults to 'Uncategorized'.
    """
    summary = bookmark.get('summary', '').lower()
    url = bookmark['url'].lower()

    is_social = ('social media' in summary
                 or 'twitter' in summary
                 or 'x.com' in url)
    if is_social:
        category = 'Social Media'
    elif 'wikipedia' in url:
        category = 'Reference and Knowledge Bases'
    else:
        category = 'Uncategorized'

    bookmark['category'] = category
395
 
396
  def parse_bookmarks(file_content):
397
  """
 
597
  with ThreadPoolExecutor(max_workers=10) as executor:
598
  executor.map(fetch_url_info, bookmarks)
599
 
600
+ # Process bookmarks for summary and category
601
+ logger.info("Processing bookmarks for summaries and categories")
602
+ for bookmark in bookmarks:
603
+ generate_summary_and_assign_category(bookmark)
604
+
605
+ # Wait until all LLM tasks are completed
606
+ llm_queue.join()
607
+ logger.info("All LLM tasks have been processed")
608
 
609
  try:
610
  faiss_index = vectorize_and_index(bookmarks)
 
733
 
734
  # Rate Limiting
735
  rpm_bucket.wait_for_token()
736
+ tpm_bucket.wait_for_token(tokens=300) # Assuming max_tokens=300 per request
737
 
738
  query_vector = embedding_model.encode([user_query]).astype('float32')
739
  k = 5
 
763
  """
764
 
765
  def estimate_tokens(text):
766
+ return len(text) / 4 # Approximation
767
 
768
  prompt_tokens = estimate_tokens(prompt)
769
  max_tokens = 300
770
  total_tokens = prompt_tokens + max_tokens
771
 
 
 
 
 
 
772
  response = openai.ChatCompletion.create(
773
  model='llama-3.1-70b-versatile',
774
  messages=[
 
780
 
781
  answer = response['choices'][0]['message']['content'].strip()
782
  logger.info("Chatbot response generated")
 
783
 
784
  chat_history.append({"role": "assistant", "content": answer})
785
  return chat_history
786
 
787
  except openai.error.RateLimitError as e:
788
+ wait_time = int(e.headers.get("Retry-After", 60))
789
+ logger.warning(f"Chatbot Rate limit reached. Waiting for {wait_time} seconds before retrying...")
790
  time.sleep(wait_time)
791
  return chatbot_response(user_query, chat_history)
792
  except Exception as e: