Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -153,26 +153,18 @@ def generate_summary(bookmark):
|
|
153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
154 |
|
155 |
try:
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
available_content.append(f"Description: {metadata['description']}")
|
169 |
-
if metadata['keywords']:
|
170 |
-
available_content.append(f"Keywords: {metadata['keywords']}")
|
171 |
-
if main_content:
|
172 |
-
available_content.append(f"Main Content: {main_content}")
|
173 |
-
|
174 |
-
# If content is insufficient, instruct the LLM to use prior knowledge
|
175 |
-
if not available_content or len(' '.join(available_content).split()) < 50:
|
176 |
prompt = f"""
|
177 |
You are a knowledgeable assistant.
|
178 |
|
@@ -188,19 +180,23 @@ Focus on:
|
|
188 |
Be factual and objective.
|
189 |
"""
|
190 |
else:
|
191 |
-
#
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
available_content
|
|
|
|
|
|
|
|
|
204 |
|
205 |
# Construct the prompt
|
206 |
prompt = f"""
|
@@ -218,12 +214,12 @@ Be factual and objective.
|
|
218 |
|
219 |
# Call the LLM via Groq Cloud API
|
220 |
response = openai.ChatCompletion.create(
|
221 |
-
model='llama3-8b-8192',
|
222 |
messages=[
|
223 |
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
224 |
{"role": "user", "content": prompt}
|
225 |
],
|
226 |
-
max_tokens=200,
|
227 |
temperature=0.5,
|
228 |
)
|
229 |
|
@@ -234,48 +230,7 @@ Be factual and objective.
|
|
234 |
|
235 |
except Exception as e:
|
236 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
237 |
-
|
238 |
-
if metadata['description']:
|
239 |
-
logger.info("Falling back to meta description")
|
240 |
-
bookmark['summary'] = metadata['description']
|
241 |
-
elif main_content:
|
242 |
-
logger.info("Falling back to main content")
|
243 |
-
bookmark['summary'] = ' '.join(main_content.split()[:50]) + '...'
|
244 |
-
elif metadata['title']:
|
245 |
-
logger.info("Falling back to title")
|
246 |
-
bookmark['summary'] = metadata['title']
|
247 |
-
else:
|
248 |
-
# If all else fails, prompt the LLM to use prior knowledge
|
249 |
-
prompt = f"""
|
250 |
-
You are a knowledgeable assistant.
|
251 |
-
|
252 |
-
The user provided a URL: {bookmark.get('url')}
|
253 |
-
|
254 |
-
Please provide a concise summary (2-3 sentences) about this website based on your knowledge.
|
255 |
-
|
256 |
-
Focus on:
|
257 |
-
- The main purpose or topic of the website.
|
258 |
-
- Key information or features.
|
259 |
-
- Target audience or use case (if apparent).
|
260 |
-
|
261 |
-
Be factual and objective.
|
262 |
-
"""
|
263 |
-
try:
|
264 |
-
response = openai.ChatCompletion.create(
|
265 |
-
model='llama3-8b-8192',
|
266 |
-
messages=[
|
267 |
-
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
268 |
-
{"role": "user", "content": prompt}
|
269 |
-
],
|
270 |
-
max_tokens=200,
|
271 |
-
temperature=0.5,
|
272 |
-
)
|
273 |
-
summary = response['choices'][0]['message']['content'].strip()
|
274 |
-
logger.info("Successfully generated LLM summary using prior knowledge")
|
275 |
-
bookmark['summary'] = summary
|
276 |
-
except Exception as e:
|
277 |
-
logger.error(f"Error generating summary using prior knowledge: {e}", exc_info=True)
|
278 |
-
bookmark['summary'] = 'No summary available.'
|
279 |
return bookmark
|
280 |
|
281 |
def parse_bookmarks(file_content):
|
@@ -318,39 +273,20 @@ async def fetch_url_info(session, bookmark):
|
|
318 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
319 |
bookmark['status_code'] = response.status
|
320 |
|
|
|
|
|
321 |
if response.status >= 500:
|
322 |
# Server error, consider as dead link
|
323 |
bookmark['dead_link'] = True
|
324 |
bookmark['description'] = ''
|
325 |
bookmark['html_content'] = ''
|
326 |
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
327 |
-
elif response.status == 403:
|
328 |
-
# Forbidden, but may be accessible with proper headers
|
329 |
-
logger.info(f"Received 403 for {url}, retrying with different headers")
|
330 |
-
# Try with different headers or methods if necessary
|
331 |
-
# For now, we'll proceed to read the content
|
332 |
-
content = await response.text()
|
333 |
-
bookmark['dead_link'] = False
|
334 |
-
bookmark['html_content'] = content
|
335 |
-
bookmark['description'] = ''
|
336 |
-
elif response.status == 400:
|
337 |
-
# Bad request, may be due to missing parameters
|
338 |
-
bookmark['dead_link'] = False
|
339 |
-
content = await response.text()
|
340 |
-
bookmark['html_content'] = content
|
341 |
-
bookmark['description'] = ''
|
342 |
-
elif response.status >= 400:
|
343 |
-
# Other client errors
|
344 |
-
bookmark['dead_link'] = True
|
345 |
-
bookmark['description'] = ''
|
346 |
-
bookmark['html_content'] = ''
|
347 |
-
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
348 |
else:
|
349 |
bookmark['dead_link'] = False
|
350 |
-
|
351 |
-
bookmark['html_content'] = content # Store full HTML for summary generation
|
352 |
bookmark['description'] = ''
|
353 |
logger.info(f"Fetched information for {url}")
|
|
|
354 |
except Exception as e:
|
355 |
bookmark['dead_link'] = True
|
356 |
bookmark['etag'] = 'N/A'
|
@@ -364,7 +300,7 @@ async def fetch_url_info(session, bookmark):
|
|
364 |
'status_code': bookmark.get('status_code'),
|
365 |
'dead_link': bookmark.get('dead_link'),
|
366 |
'description': bookmark.get('description'),
|
367 |
-
'html_content': bookmark.get('html_content', '')
|
368 |
}
|
369 |
return bookmark
|
370 |
|
@@ -417,7 +353,7 @@ Respond with only the category name.
|
|
417 |
|
418 |
try:
|
419 |
response = openai.ChatCompletion.create(
|
420 |
-
model='llama3-8b-8192',
|
421 |
messages=[
|
422 |
{"role": "system", "content": "You categorize webpages based on their content."},
|
423 |
{"role": "user", "content": prompt}
|
@@ -695,7 +631,7 @@ Provide a concise and helpful response.
|
|
695 |
"""
|
696 |
|
697 |
response = openai.ChatCompletion.create(
|
698 |
-
model='llama3-8b-8192',
|
699 |
messages=[
|
700 |
{"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
|
701 |
{"role": "user", "content": prompt}
|
|
|
153 |
logger.info(f"Generating summary for bookmark: {bookmark.get('url')}")
|
154 |
|
155 |
try:
|
156 |
+
html_content = bookmark.get('html_content', '')
|
157 |
+
|
158 |
+
# Check for insufficient or error content
|
159 |
+
error_keywords = ['Access Denied', 'Error', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
|
160 |
+
if not html_content or len(html_content) < 500 or any(keyword.lower() in html_content.lower() for keyword in error_keywords):
|
161 |
+
logger.info(f"Content for {bookmark.get('url')} is insufficient or contains errors. Using prior knowledge.")
|
162 |
+
use_prior_knowledge = True
|
163 |
+
else:
|
164 |
+
use_prior_knowledge = False
|
165 |
+
|
166 |
+
if use_prior_knowledge:
|
167 |
+
# Construct prompt to use prior knowledge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
prompt = f"""
|
169 |
You are a knowledgeable assistant.
|
170 |
|
|
|
180 |
Be factual and objective.
|
181 |
"""
|
182 |
else:
|
183 |
+
# Get the HTML soup object from the bookmark
|
184 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
185 |
+
|
186 |
+
# Extract metadata and main content
|
187 |
+
metadata = get_page_metadata(soup)
|
188 |
+
main_content = extract_main_content(soup)
|
189 |
+
|
190 |
+
# Prepare content for the prompt
|
191 |
+
available_content = []
|
192 |
+
if metadata['title']:
|
193 |
+
available_content.append(f"Title: {metadata['title']}")
|
194 |
+
if metadata['description']:
|
195 |
+
available_content.append(f"Description: {metadata['description']}")
|
196 |
+
if metadata['keywords']:
|
197 |
+
available_content.append(f"Keywords: {metadata['keywords']}")
|
198 |
+
if main_content:
|
199 |
+
available_content.append(f"Main Content: {main_content}")
|
200 |
|
201 |
# Construct the prompt
|
202 |
prompt = f"""
|
|
|
214 |
|
215 |
# Call the LLM via Groq Cloud API
|
216 |
response = openai.ChatCompletion.create(
|
217 |
+
model='llama3-8b-8192',
|
218 |
messages=[
|
219 |
{"role": "system", "content": "You are a helpful assistant that creates concise webpage summaries."},
|
220 |
{"role": "user", "content": prompt}
|
221 |
],
|
222 |
+
max_tokens=200,
|
223 |
temperature=0.5,
|
224 |
)
|
225 |
|
|
|
230 |
|
231 |
except Exception as e:
|
232 |
logger.error(f"Error generating summary: {e}", exc_info=True)
|
233 |
+
bookmark['summary'] = 'No summary available.'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
return bookmark
|
235 |
|
236 |
def parse_bookmarks(file_content):
|
|
|
273 |
bookmark['etag'] = response.headers.get('ETag', 'N/A')
|
274 |
bookmark['status_code'] = response.status
|
275 |
|
276 |
+
content = await response.text()
|
277 |
+
|
278 |
if response.status >= 500:
|
279 |
# Server error, consider as dead link
|
280 |
bookmark['dead_link'] = True
|
281 |
bookmark['description'] = ''
|
282 |
bookmark['html_content'] = ''
|
283 |
logger.warning(f"Dead link detected: {url} with status {response.status}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
else:
|
285 |
bookmark['dead_link'] = False
|
286 |
+
bookmark['html_content'] = content
|
|
|
287 |
bookmark['description'] = ''
|
288 |
logger.info(f"Fetched information for {url}")
|
289 |
+
|
290 |
except Exception as e:
|
291 |
bookmark['dead_link'] = True
|
292 |
bookmark['etag'] = 'N/A'
|
|
|
300 |
'status_code': bookmark.get('status_code'),
|
301 |
'dead_link': bookmark.get('dead_link'),
|
302 |
'description': bookmark.get('description'),
|
303 |
+
'html_content': bookmark.get('html_content', ''),
|
304 |
}
|
305 |
return bookmark
|
306 |
|
|
|
353 |
|
354 |
try:
|
355 |
response = openai.ChatCompletion.create(
|
356 |
+
model='llama3-8b-8192',
|
357 |
messages=[
|
358 |
{"role": "system", "content": "You categorize webpages based on their content."},
|
359 |
{"role": "user", "content": prompt}
|
|
|
631 |
"""
|
632 |
|
633 |
response = openai.ChatCompletion.create(
|
634 |
+
model='llama3-8b-8192',
|
635 |
messages=[
|
636 |
{"role": "system", "content": "You assist users by finding relevant information from their bookmarks."},
|
637 |
{"role": "user", "content": prompt}
|