Update app.py
Browse files
app.py
CHANGED
|
@@ -153,26 +153,18 @@ def generate_summary(bookmark):
|
|
| 153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
| 154 |
|
| 155 |
try:
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
available_content.append(f"Description: {metadata['description']}")
|
| 169 |
-
if metadata['keywords']:
|
| 170 |
-
available_content.append(f"Keywords: {metadata['keywords']}")
|
| 171 |
-
if main_content:
|
| 172 |
-
available_content.append(f"Main Content: {main_content}")
|
| 173 |
-
|
| 174 |
-
# If content is insufficient, instruct the LLM to use prior knowledge
|
| 175 |
-
if not available_content or len(' '.join(available_content).split()) < 50:
|
| 176 |
prompt = f"""
|
| 177 |
You are a knowledgeable assistant.
|
| 178 |
|
|
@@ -188,19 +180,23 @@ Focus on:
|
|
| 188 |
Be factual and objective.
|
| 189 |
"""
|
| 190 |
else:
|
| 191 |
-
#
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
available_content
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
# Construct the prompt
|
| 206 |
prompt = f"""
|
|
@@ -218,12 +214,12 @@ Be factual and objective.
|
|
| 218 |
|
| 219 |
# Call the LLM via Groq Cloud API
|
| 220 |
response = openai.ChatCompletion.create(
|
| 221 |
-
model='llama3-8b-8192',
|
| 222 |
messages=[
|
| 223 |
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
| 224 |
{"role": "user", "content": prompt}
|
| 225 |
],
|
| 226 |
-
max_tokens=200,
|
| 227 |
temperature=0.5,
|
| 228 |
)
|
| 229 |
|
|
@@ -234,48 +230,7 @@ Be factual and objective.
|
|
| 234 |
|
| 235 |
except Exception as e:
|
| 236 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
| 237 |
-
|
| 238 |
-
if metadata['description']:
|
| 239 |
-
logger.info("Falling back to meta description")
|
| 240 |
-
bookmark['summary'] = metadata['description']
|
| 241 |
-
elif main_content:
|
| 242 |
-
logger.info("Falling back to main content")
|
| 243 |
-
bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
|
| 244 |
-
elif metadata['title']:
|
| 245 |
-
logger.info("Falling back to title")
|
| 246 |
-
bookmark['summary'] = metadata['title']
|
| 247 |
-
else:
|
| 248 |
-
# If all else fails, prompt the LLM to use prior knowledge
|
| 249 |
-
prompt = f"""
|
| 250 |
-
You are a knowledgeable assistant.
|
| 251 |
-
|
| 252 |
-
The user provided a URL: {bookmark.get('url')}
|
| 253 |
-
|
| 254 |
-
Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
|
| 255 |
-
|
| 256 |
-
Focus on:
|
| 257 |
-
- The main purpose or topic of the website.
|
| 258 |
-
- Key information or features.
|
| 259 |
-
- Target audience or use case (if apparent).
|
| 260 |
-
|
| 261 |
-
Be factual and objective.
|
| 262 |
-
"""
|
| 263 |
-
try:
|
| 264 |
-
response = openai.ChatCompletion.create(
|
| 265 |
-
model='llama3-8b-8192',
|
| 266 |
-
messages=[
|
| 267 |
-
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
| 268 |
-
{"role": "user", "content": prompt}
|
| 269 |
-
],
|
| 270 |
-
max_tokens=200,
|
| 271 |
-
temperature=0.5,
|
| 272 |
-
)
|
| 273 |
-
summary = response['choices'][0]['message']['content'].strip()
|
| 274 |
-
logger.info("Successfully generated LLM summary using prior knowledge")
|
| 275 |
-
bookmark['summary'] = summary
|
| 276 |
-
except Exception as e:
|
| 277 |
-
logger.error(f"Error generating summary using prior knowledge: {e}", exc_info=True)
|
| 278 |
-
bookmark['summary'] = 'No summary available.'
|
| 279 |
return bookmark
|
| 280 |
|
| 281 |
def parse_bookmarks(file_content):
|
|
@@ -318,39 +273,20 @@ async def fetch_url_info(session, bookmark):
|
|
| 318 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
| 319 |
bookmark['status_code'] = response.status
|
| 320 |
|
|
|
|
|
|
|
| 321 |
if response.status >= 500:
|
| 322 |
# Server error, consider as dead link
|
| 323 |
bookmark['dead_link'] = True
|
| 324 |
bookmark['description'] = ''
|
| 325 |
bookmark['html_content'] = ''
|
| 326 |
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
| 327 |
-
elif response.status == 403:
|
| 328 |
-
# Forbidden, but may be accessible with proper headers
|
| 329 |
-
logger.info(f"Received 403 for {url}, retrying with different headers")
|
| 330 |
-
# Try with different headers or methods if necessary
|
| 331 |
-
# For now, we'll proceed to read the content
|
| 332 |
-
content = await response.text()
|
| 333 |
-
bookmark['dead_link'] = False
|
| 334 |
-
bookmark['html_content'] = content
|
| 335 |
-
bookmark['description'] = ''
|
| 336 |
-
elif response.status == 400:
|
| 337 |
-
# Bad request, may be due to missing parameters
|
| 338 |
-
bookmark['dead_link'] = False
|
| 339 |
-
content = await response.text()
|
| 340 |
-
bookmark['html_content'] = content
|
| 341 |
-
bookmark['description'] = ''
|
| 342 |
-
elif response.status >= 400:
|
| 343 |
-
# Other client errors
|
| 344 |
-
bookmark['dead_link'] = True
|
| 345 |
-
bookmark['description'] = ''
|
| 346 |
-
bookmark['html_content'] = ''
|
| 347 |
-
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
| 348 |
else:
|
| 349 |
bookmark['dead_link'] = False
|
| 350 |
-
|
| 351 |
-
bookmark['html_content'] = content # Store full HTML for summary generation
|
| 352 |
bookmark['description'] = ''
|
| 353 |
logger.info(f"Fetched information for {url}")
|
|
|
|
| 354 |
except Exception as e:
|
| 355 |
bookmark['dead_link'] = True
|
| 356 |
bookmark['etag'] = 'N/A'
|
|
@@ -364,7 +300,7 @@ async def fetch_url_info(session, bookmark):
|
|
| 364 |
'status_code': bookmark.get('status_code'),
|
| 365 |
'dead_link': bookmark.get('dead_link'),
|
| 366 |
'description': bookmark.get('description'),
|
| 367 |
-
'html_content': bookmark.get('html_content', '')
|
| 368 |
}
|
| 369 |
return bookmark
|
| 370 |
|
|
@@ -417,7 +353,7 @@ Respond with only the category name.
|
|
| 417 |
|
| 418 |
try:
|
| 419 |
response = openai.ChatCompletion.create(
|
| 420 |
-
model='llama3-8b-8192',
|
| 421 |
messages=[
|
| 422 |
{"role": "system", "content": "You categorize webpages based on their content."},
|
| 423 |
{"role": "user", "content": prompt}
|
|
@@ -695,7 +631,7 @@ Provide a concise and helpful response.
|
|
| 695 |
"""
|
| 696 |
|
| 697 |
response = openai.ChatCompletion.create(
|
| 698 |
-
model='llama3-8b-8192',
|
| 699 |
messages=[
|
| 700 |
{"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
|
| 701 |
{"role": "user", "content": prompt}
|
|
|
|
| 153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
| 154 |
|
| 155 |
try:
|
| 156 |
+
html_content = bookmark.get('html_content', '')
|
| 157 |
+
|
| 158 |
+
# Check for insufficient or error content
|
| 159 |
+
error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
|
| 160 |
+
if not html_content or len(html_content) < 500 or any(keyword.lower() in html_content.lower() for keyword in error_keywords):
|
| 161 |
+
logger.info(f"Content for {bookmark.get('url')} is insufficient or contains errors. Using prior knowledge.")
|
| 162 |
+
use_prior_knowledge = True
|
| 163 |
+
else:
|
| 164 |
+
use_prior_knowledge = False
|
| 165 |
+
|
| 166 |
+
if use_prior_knowledge:
|
| 167 |
+
# Construct prompt to use prior knowledge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
prompt = f"""
|
| 169 |
You are a knowledgeable assistant.
|
| 170 |
|
|
|
|
| 180 |
Be factual and objective.
|
| 181 |
"""
|
| 182 |
else:
|
| 183 |
+
# Parse the bookmark's HTML content into a BeautifulSoup object
|
| 184 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
| 185 |
+
|
| 186 |
+
# Extract metadata and main content
|
| 187 |
+
metadata = get_page_metadata(soup)
|
| 188 |
+
main_content = extract_main_content(soup)
|
| 189 |
+
|
| 190 |
+
# Prepare content for the prompt
|
| 191 |
+
available_content = []
|
| 192 |
+
if metadata['title']:
|
| 193 |
+
available_content.append(f"Title: {metadata['title']}")
|
| 194 |
+
if metadata['description']:
|
| 195 |
+
available_content.append(f"Description: {metadata['description']}")
|
| 196 |
+
if metadata['keywords']:
|
| 197 |
+
available_content.append(f"Keywords: {metadata['keywords']}")
|
| 198 |
+
if main_content:
|
| 199 |
+
available_content.append(f"Main Content: {main_content}")
|
| 200 |
|
| 201 |
# Construct the prompt
|
| 202 |
prompt = f"""
|
|
|
|
| 214 |
|
| 215 |
# Call the LLM via Groq Cloud API
|
| 216 |
response = openai.ChatCompletion.create(
|
| 217 |
+
model='llama3-8b-8192',
|
| 218 |
messages=[
|
| 219 |
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
| 220 |
{"role": "user", "content": prompt}
|
| 221 |
],
|
| 222 |
+
max_tokens=200,
|
| 223 |
temperature=0.5,
|
| 224 |
)
|
| 225 |
|
|
|
|
| 230 |
|
| 231 |
except Exception as e:
|
| 232 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
| 233 |
+
bookmark['summary'] = 'No summary available.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
return bookmark
|
| 235 |
|
| 236 |
def parse_bookmarks(file_content):
|
|
|
|
| 273 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
| 274 |
bookmark['status_code'] = response.status
|
| 275 |
|
| 276 |
+
content = await response.text()
|
| 277 |
+
|
| 278 |
if response.status >= 500:
|
| 279 |
# Server error, consider as dead link
|
| 280 |
bookmark['dead_link'] = True
|
| 281 |
bookmark['description'] = ''
|
| 282 |
bookmark['html_content'] = ''
|
| 283 |
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
else:
|
| 285 |
bookmark['dead_link'] = False
|
| 286 |
+
bookmark['html_content'] = content
|
|
|
|
| 287 |
bookmark['description'] = ''
|
| 288 |
logger.info(f"Fetched information for {url}")
|
| 289 |
+
|
| 290 |
except Exception as e:
|
| 291 |
bookmark['dead_link'] = True
|
| 292 |
bookmark['etag'] = 'N/A'
|
|
|
|
| 300 |
'status_code': bookmark.get('status_code'),
|
| 301 |
'dead_link': bookmark.get('dead_link'),
|
| 302 |
'description': bookmark.get('description'),
|
| 303 |
+
'html_content': bookmark.get('html_content', ''),
|
| 304 |
}
|
| 305 |
return bookmark
|
| 306 |
|
|
|
|
| 353 |
|
| 354 |
try:
|
| 355 |
response = openai.ChatCompletion.create(
|
| 356 |
+
model='llama3-8b-8192',
|
| 357 |
messages=[
|
| 358 |
{"role": "system", "content": "You categorize webpages based on their content."},
|
| 359 |
{"role": "user", "content": prompt}
|
|
|
|
| 631 |
"""
|
| 632 |
|
| 633 |
response = openai.ChatCompletion.create(
|
| 634 |
+
model='llama3-8b-8192',
|
| 635 |
messages=[
|
| 636 |
{"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
|
| 637 |
{"role": "user", "content": prompt}
|