siddhartharya committed on
Commit
3b1a6a1
·
verified ·
1 Parent(s): b8183dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -103
app.py CHANGED
@@ -153,26 +153,18 @@ def generate_summary(bookmark):
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
155
  try:
156
- # Get the HTML soup object from the bookmark
157
- soup = BeautifulSoup(bookmark.get('html_content', ''), 'html.parser')
158
-
159
- # Extract metadata and main content
160
- metadata = get_page_metadata(soup)
161
- main_content = extract_main_content(soup)
162
-
163
- # Prepare content for the prompt
164
- available_content = []
165
- if metadata['title']:
166
- available_content.append(f"Title: {metadata['title']}")
167
- if metadata['description']:
168
- available_content.append(f"Description: {metadata['description']}")
169
- if metadata['keywords']:
170
- available_content.append(f"Keywords: {metadata['keywords']}")
171
- if main_content:
172
- available_content.append(f"Main Content: {main_content}")
173
-
174
- # If content is insufficient, instruct the LLM to use prior knowledge
175
- if not available_content or len(' '.join(available_content).split()) < 50:
176
  prompt = f"""
177
  You are a knowledgeable assistant.
178
 
@@ -188,19 +180,23 @@ Focus on:
188
  Be factual and objective.
189
  """
190
  else:
191
- # Estimate token count and trim content if necessary
192
- max_total_tokens = 8000 # Adjust based on model's maximum context length
193
- prompt_tokens_estimate = len(' '.join(available_content).split()) + 200 # 200 tokens reserved for response
194
- if prompt_tokens_estimate > max_total_tokens:
195
- # Trim main content
196
- allowable_content_tokens = max_total_tokens - 200 # Reserve 200 tokens for response
197
- main_content_tokens = len(main_content.split())
198
- if main_content_tokens > allowable_content_tokens:
199
- main_content = ' '.join(main_content.split()[:allowable_content_tokens])
200
- logger.info("Trimmed main content to fit within token limits.")
201
-
202
- # Update available content
203
- available_content[-1] = f"Main Content: {main_content}"
 
 
 
 
204
 
205
  # Construct the prompt
206
  prompt = f"""
@@ -218,12 +214,12 @@ Be factual and objective.
218
 
219
  # Call the LLM via Groq Cloud API
220
  response = openai.ChatCompletion.create(
221
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
222
  messages=[
223
  {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
224
  {"role": "user", "content": prompt}
225
  ],
226
- max_tokens=200, # Adjust as necessary to accommodate longer summaries
227
  temperature=0.5,
228
  )
229
 
@@ -234,48 +230,7 @@ Be factual and objective.
234
 
235
  except Exception as e:
236
  logger.error(f"Error generating summary: {e}", exc_info=True)
237
- # Fallback mechanisms
238
- if metadata['description']:
239
- logger.info("Falling back to meta description")
240
- bookmark['summary'] = metadata['description']
241
- elif main_content:
242
- logger.info("Falling back to main content")
243
- bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
244
- elif metadata['title']:
245
- logger.info("Falling back to title")
246
- bookmark['summary'] = metadata['title']
247
- else:
248
- # If all else fails, prompt the LLM to use prior knowledge
249
- prompt = f"""
250
- You are a knowledgeable assistant.
251
-
252
- The user provided a URL: {bookmark.get('url')}
253
-
254
- Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
255
-
256
- Focus on:
257
- - The main purpose or topic of the website.
258
- - Key information or features.
259
- - Target audience or use case (if apparent).
260
-
261
- Be factual and objective.
262
- """
263
- try:
264
- response = openai.ChatCompletion.create(
265
- model='llama3-8b-8192',
266
- messages=[
267
- {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
268
- {"role": "user", "content": prompt}
269
- ],
270
- max_tokens=200,
271
- temperature=0.5,
272
- )
273
- summary = response['choices'][0]['message']['content'].strip()
274
- logger.info("Successfully generated LLM summary using prior knowledge")
275
- bookmark['summary'] = summary
276
- except Exception as e:
277
- logger.error(f"Error generating summary using prior knowledge: {e}", exc_info=True)
278
- bookmark['summary'] = 'No summary available.'
279
  return bookmark
280
 
281
  def parse_bookmarks(file_content):
@@ -318,39 +273,20 @@ async def fetch_url_info(session, bookmark):
318
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
319
  bookmark['status_code'] = response.status
320
 
 
 
321
  if response.status >= 500:
322
  # Server error, consider as dead link
323
  bookmark['dead_link'] = True
324
  bookmark['description'] = ''
325
  bookmark['html_content'] = ''
326
  logger.warning(f"Dead link detected: {url} with status {response.status}")
327
- elif response.status == 403:
328
- # Forbidden, but may be accessible with proper headers
329
- logger.info(f"Received 403 for {url}, retrying with different headers")
330
- # Try with different headers or methods if necessary
331
- # For now, we'll proceed to read the content
332
- content = await response.text()
333
- bookmark['dead_link'] = False
334
- bookmark['html_content'] = content
335
- bookmark['description'] = ''
336
- elif response.status == 400:
337
- # Bad request, may be due to missing parameters
338
- bookmark['dead_link'] = False
339
- content = await response.text()
340
- bookmark['html_content'] = content
341
- bookmark['description'] = ''
342
- elif response.status >= 400:
343
- # Other client errors
344
- bookmark['dead_link'] = True
345
- bookmark['description'] = ''
346
- bookmark['html_content'] = ''
347
- logger.warning(f"Dead link detected: {url} with status {response.status}")
348
  else:
349
  bookmark['dead_link'] = False
350
- content = await response.text()
351
- bookmark['html_content'] = content # Store full HTML for summary generation
352
  bookmark['description'] = ''
353
  logger.info(f"Fetched information for {url}")
 
354
  except Exception as e:
355
  bookmark['dead_link'] = True
356
  bookmark['etag'] = 'N/A'
@@ -364,7 +300,7 @@ async def fetch_url_info(session, bookmark):
364
  'status_code': bookmark.get('status_code'),
365
  'dead_link': bookmark.get('dead_link'),
366
  'description': bookmark.get('description'),
367
- 'html_content': bookmark.get('html_content', '')
368
  }
369
  return bookmark
370
 
@@ -417,7 +353,7 @@ Respond with only the category name.
417
 
418
  try:
419
  response = openai.ChatCompletion.create(
420
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
421
  messages=[
422
  {"role": "system", "content": "You categorize webpages based on their content."},
423
  {"role": "user", "content": prompt}
@@ -695,7 +631,7 @@ Provide a concise and helpful response.
695
  """
696
 
697
  response = openai.ChatCompletion.create(
698
- model='llama3-8b-8192', # Use the model as per your Groq Cloud API configuration
699
  messages=[
700
  {"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
701
  {"role": "user", "content": prompt}
 
153
  logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
154
 
155
  try:
156
+ html_content = bookmark.get('html_content', '')
157
+
158
+ # Check for insufficient or error content
159
+ error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
160
+ if not html_content or len(html_content) < 500 or any(keyword.lower() in html_content.lower() for keyword in error_keywords):
161
+ logger.info(f"Content for {bookmark.get('url')} is insufficient or contains errors. Using prior knowledge.")
162
+ use_prior_knowledge = True
163
+ else:
164
+ use_prior_knowledge = False
165
+
166
+ if use_prior_knowledge:
167
+ # Construct prompt to use prior knowledge
 
 
 
 
 
 
 
 
168
  prompt = f"""
169
  You are a knowledgeable assistant.
170
 
 
180
  Be factual and objective.
181
  """
182
  else:
183
+ # Get the HTML soup object from the bookmark
184
+ soup = BeautifulSoup(html_content, 'html.parser')
185
+
186
+ # Extract metadata and main content
187
+ metadata = get_page_metadata(soup)
188
+ main_content = extract_main_content(soup)
189
+
190
+ # Prepare content for the prompt
191
+ available_content = []
192
+ if metadata['title']:
193
+ available_content.append(f"Title: {metadata['title']}")
194
+ if metadata['description']:
195
+ available_content.append(f"Description: {metadata['description']}")
196
+ if metadata['keywords']:
197
+ available_content.append(f"Keywords: {metadata['keywords']}")
198
+ if main_content:
199
+ available_content.append(f"Main Content: {main_content}")
200
 
201
  # Construct the prompt
202
  prompt = f"""
 
214
 
215
  # Call the LLM via Groq Cloud API
216
  response = openai.ChatCompletion.create(
217
+ model='llama3-8b-8192',
218
  messages=[
219
  {"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
220
  {"role": "user", "content": prompt}
221
  ],
222
+ max_tokens=200,
223
  temperature=0.5,
224
  )
225
 
 
230
 
231
  except Exception as e:
232
  logger.error(f"Error generating summary: {e}", exc_info=True)
233
+ bookmark['summary'] = 'No summary available.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  return bookmark
235
 
236
  def parse_bookmarks(file_content):
 
273
  bookmark['etag'] = response.headers.get('ETag', 'N/A')
274
  bookmark['status_code'] = response.status
275
 
276
+ content = await response.text()
277
+
278
  if response.status >= 500:
279
  # Server error, consider as dead link
280
  bookmark['dead_link'] = True
281
  bookmark['description'] = ''
282
  bookmark['html_content'] = ''
283
  logger.warning(f"Dead link detected: {url} with status {response.status}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  else:
285
  bookmark['dead_link'] = False
286
+ bookmark['html_content'] = content
 
287
  bookmark['description'] = ''
288
  logger.info(f"Fetched information for {url}")
289
+
290
  except Exception as e:
291
  bookmark['dead_link'] = True
292
  bookmark['etag'] = 'N/A'
 
300
  'status_code': bookmark.get('status_code'),
301
  'dead_link': bookmark.get('dead_link'),
302
  'description': bookmark.get('description'),
303
+ 'html_content': bookmark.get('html_content', ''),
304
  }
305
  return bookmark
306
 
 
353
 
354
  try:
355
  response = openai.ChatCompletion.create(
356
+ model='llama3-8b-8192',
357
  messages=[
358
  {"role": "system", "content": "You categorize webpages based on their content."},
359
  {"role": "user", "content": prompt}
 
631
  """
632
 
633
  response = openai.ChatCompletion.create(
634
+ model='llama3-8b-8192',
635
  messages=[
636
  {"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
637
  {"role": "user", "content": prompt}