|
|
|
|
|
|
|
|
import asyncio
import base64
import html
import logging
import os
import re
import sys

import aiohttp
import faiss
import gradio as gr
import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
# Module logger: INFO and above are written to stdout with a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
console_handler.setFormatter(formatter)

logger.addHandler(console_handler)
|
|
|
|
|
|
|
|
logger.info("Initializing models and variables")

# Sentence-embedding model used to vectorize bookmark summaries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Module-level state shared by the handlers in this file.
faiss_index = None  # FAISS index over bookmark summaries; None until one is built
bookmarks = []      # bookmark dicts, in display order
fetch_cache = {}    # url -> {'etag', 'status_code', 'dead_link', 'description'}
|
|
|
|
|
|
|
|
# Categories a bookmark can be assigned to; also offered in the
# "New Category" dropdown of the management tab.
CATEGORIES = [
    "Social Media",
    "News and Media",
    "Education and Learning",
    "Entertainment",
    "Shopping and E-commerce",
    "Finance and Banking",
    "Technology",
    "Health and Fitness",
    "Travel and Tourism",
    "Food and Recipes",
    "Sports",
    "Arts and Culture",
    "Government and Politics",
    "Business and Economy",
    "Science and Research",
    "Personal Blogs and Journals",
    "Job Search and Careers",
    "Music and Audio",
    "Videos and Movies",
    "Reference and Knowledge Bases",
    "Dead Link",
    "Uncategorized",
]
|
|
|
|
|
|
|
|
# API key for the chat endpoint, read from the environment.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    # Not fatal here: chatbot_response re-checks the key before calling the API.
    logger.error("GROQ_API_KEY environment variable not set.")

# Route the openai client to Groq's OpenAI-compatible endpoint.
openai.api_key = GROQ_API_KEY
openai.api_base = "https://api.groq.com/openai/v1"
|
|
|
|
|
|
|
|
def parse_bookmarks(file_content):
    """Extract bookmarks from the text of a bookmarks HTML export.

    Args:
        file_content: HTML text to parse.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts, one per ``<a>`` tag
        that has both a non-empty ``href`` and non-empty (stripped) text.

    Raises:
        Exception: any parsing error is logged and re-raised unchanged.
    """
    logger.info("Parsing bookmarks")
    try:
        soup = BeautifulSoup(file_content, 'html.parser')
        extracted_bookmarks = []
        for link in soup.find_all('a'):
            url = link.get('href')
            title = link.text.strip()
            # Anchors without a target or without visible text are skipped.
            if url and title:
                extracted_bookmarks.append({'url': url, 'title': title})
    except Exception as e:
        logger.error("Error parsing bookmarks: %s", e)
        raise
    logger.info("Extracted %d bookmarks", len(extracted_bookmarks))
    return extracted_bookmarks
|
|
|
|
|
|
|
|
def _extract_description(page_html):
    """Return the page's ``og:description``, else its meta description, else ''."""
    soup = BeautifulSoup(page_html, 'html.parser')
    og_description = soup.find('meta', attrs={'property': 'og:description'})
    if og_description and og_description.get('content'):
        return og_description.get('content')
    meta_description = soup.find('meta', attrs={'name': 'description'})
    if meta_description and meta_description.get('content'):
        return meta_description.get('content')
    return ''


async def fetch_url_info(session, bookmark):
    """Fetch a bookmark's URL and record its status and description in place.

    Sets ``etag``, ``status_code``, ``dead_link`` and ``description`` on
    *bookmark*. Results are memoised per URL in the module-level
    ``fetch_cache``, so a URL is requested at most once per process.

    Args:
        session: an ``aiohttp.ClientSession`` used to issue the GET request.
        bookmark: dict with at least a ``'url'`` key; mutated in place.

    Returns:
        The same *bookmark* dict.
    """
    url = bookmark['url']
    cached = fetch_cache.get(url)
    if cached is not None:
        bookmark.update(cached)
        return bookmark

    try:
        logger.info("Fetching URL info for: %s", url)
        # Use an explicit ClientTimeout rather than a bare number for the
        # per-request timeout.
        timeout = aiohttp.ClientTimeout(total=5)
        async with session.get(url, timeout=timeout) as response:
            bookmark['etag'] = response.headers.get('ETag', 'N/A')
            bookmark['status_code'] = response.status

            if response.status >= 400:
                bookmark['dead_link'] = True
                bookmark['description'] = ''
                logger.warning(
                    "Dead link detected: %s with status %s", url, response.status
                )
            else:
                bookmark['dead_link'] = False
                # Decode leniently: a page with a few undecodable bytes is
                # still alive and must not fall into the dead-link handler.
                content = await response.text(errors='replace')
                bookmark['description'] = _extract_description(content)
                logger.info("Fetched description for %s", url)
    except Exception as e:
        # Best effort: any failure (DNS, timeout, TLS, ...) is reported as a
        # dead link instead of aborting the whole batch.
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'N/A'
        bookmark['description'] = ''
        logger.error("Error fetching URL info for %s: %s", url, e)
    finally:
        fetch_cache[url] = {
            'etag': bookmark.get('etag'),
            'status_code': bookmark.get('status_code'),
            'dead_link': bookmark.get('dead_link'),
            'description': bookmark.get('description'),
        }
    return bookmark
|
|
|
|
|
|
|
|
async def process_bookmarks_async(bookmarks):
    """Fetch metadata for every bookmark concurrently over one HTTP session.

    Each bookmark dict is updated in place by ``fetch_url_info``.

    Args:
        bookmarks: iterable of bookmark dicts, each with a ``'url'`` key.

    Raises:
        Exception: any error from session setup or the gather is logged and
            re-raised.
    """
    logger.info("Processing bookmarks asynchronously")
    try:
        async with aiohttp.ClientSession() as session:
            # gather() schedules each coroutine as a task itself.
            await asyncio.gather(
                *(fetch_url_info(session, bookmark) for bookmark in bookmarks)
            )
        logger.info("Completed processing bookmarks asynchronously")
    except Exception as e:
        logger.error("Error in asynchronous processing of bookmarks: %s", e)
        raise
|
|
|
|
|
|
|
|
def generate_summary(bookmark):
    """Set ``bookmark['summary']`` from the best text available.

    Preference order: the fetched description, then the bookmark title, then
    the placeholder ``'No summary available.'``.

    Args:
        bookmark: bookmark dict; mutated in place.

    Returns:
        The same *bookmark* dict.
    """
    bookmark['summary'] = (
        bookmark.get('description', '')
        or bookmark.get('title', '')
        or 'No summary available.'
    )
    logger.info("Generated summary for bookmark: %s", bookmark.get('url'))
    return bookmark
|
|
|
|
|
|
|
|
# Keywords that vote for each category. Order matters: the first category
# with a matching keyword wins.
_CATEGORY_KEYWORDS = {
    "Social Media": ["social media", "networking", "friends", "connect", "posts", "profile"],
    "News and Media": ["news", "journalism", "media", "headlines", "breaking news"],
    "Education and Learning": ["education", "learning", "courses", "tutorial", "university", "academy", "study"],
    "Entertainment": ["entertainment", "movies", "tv shows", "games", "comics", "fun"],
    "Shopping and E-commerce": ["shopping", "e-commerce", "buy", "sell", "marketplace", "deals", "store"],
    "Finance and Banking": ["finance", "banking", "investment", "money", "economy", "stock", "trading"],
    "Technology": ["technology", "tech", "gadgets", "software", "computers", "innovation"],
    "Health and Fitness": ["health", "fitness", "medical", "wellness", "exercise", "diet"],
    "Travel and Tourism": ["travel", "tourism", "destinations", "hotels", "flights", "vacation"],
    "Food and Recipes": ["food", "recipes", "cooking", "cuisine", "restaurant", "dining"],
    "Sports": ["sports", "scores", "teams", "athletics", "matches", "leagues"],
    "Arts and Culture": ["arts", "culture", "museum", "gallery", "exhibition", "artistic"],
    "Government and Politics": ["government", "politics", "policy", "election", "public service"],
    "Business and Economy": ["business", "corporate", "industry", "economy", "markets"],
    "Science and Research": ["science", "research", "experiment", "laboratory", "study", "scientific"],
    "Personal Blogs and Journals": ["blog", "journal", "personal", "diary", "thoughts", "opinions"],
    "Job Search and Careers": ["jobs", "careers", "recruitment", "resume", "employment", "hiring"],
    "Music and Audio": ["music", "audio", "songs", "albums", "artists", "bands"],
    "Videos and Movies": ["video", "movies", "film", "clips", "trailers", "cinema"],
    "Reference and Knowledge Bases": ["reference", "encyclopedia", "dictionary", "wiki", "knowledge", "information"],
}

# One precompiled whole-word alternation per category, built once at import
# instead of once per keyword per bookmark.
_CATEGORY_PATTERNS = [
    (category, re.compile(r'\b(?:' + '|'.join(re.escape(k) for k in keywords) + r')\b'))
    for category, keywords in _CATEGORY_KEYWORDS.items()
]


def assign_category(bookmark):
    """Set ``bookmark['category']`` from keywords found in its summary.

    Dead links are always categorised as ``'Dead Link'``. Otherwise the
    lower-cased summary is matched, whole-word, against each category's
    keywords in table order; the first category that matches wins and
    ``'Uncategorized'`` is used when none does.

    Args:
        bookmark: bookmark dict; mutated in place.

    Returns:
        The same *bookmark* dict.
    """
    if bookmark.get('dead_link'):
        bookmark['category'] = 'Dead Link'
        logger.info("Assigned category 'Dead Link' to bookmark: %s", bookmark.get('url'))
        return bookmark

    # Tolerate a missing or None summary instead of failing on .lower().
    summary = (bookmark.get('summary') or '').lower()

    assigned_category = 'Uncategorized'
    for category, pattern in _CATEGORY_PATTERNS:
        if pattern.search(summary):
            assigned_category = category
            logger.info(
                "Assigned category '%s' to bookmark: %s",
                assigned_category, bookmark.get('url'),
            )
            break

    bookmark['category'] = assigned_category
    if assigned_category == 'Uncategorized':
        logger.info("No matching category found for bookmark: %s", bookmark.get('url'))
    return bookmark
|
|
|
|
|
|
|
|
def vectorize_and_index(bookmarks):
    """Embed every bookmark summary and build a flat L2 FAISS index over them.

    Row ``i`` of the index corresponds to ``bookmarks[i]``.

    Args:
        bookmarks: sequence of bookmark dicts, each with a ``'summary'`` key.
            Callers in this file only pass a non-empty list.

    Returns:
        A ``(faiss_index, embeddings)`` tuple.

    Raises:
        Exception: any embedding or indexing error is logged and re-raised.
    """
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        summaries = [bookmark['summary'] for bookmark in bookmarks]
        embeddings = embedding_model.encode(summaries)
        dimension = embeddings.shape[1]
        faiss_idx = faiss.IndexFlatL2(dimension)
        faiss_idx.add(np.array(embeddings))
    except Exception as e:
        logger.error("Error in vectorizing and indexing: %s", e)
        raise
    logger.info("FAISS index built successfully")
    return faiss_idx, embeddings
|
|
|
|
|
|
|
|
def display_bookmarks():
    """Render the module-level ``bookmarks`` list as HTML cards.

    Every value that originates from the uploaded file or from a fetched page
    (title, URL, ETag, summary, category) is HTML-escaped before being
    interpolated, so hostile or malformed text cannot inject markup.

    Returns:
        One HTML string containing a ``<div class="card">`` per bookmark, or
        ``''`` when there are no bookmarks. Dead links get a red border and
        red text.
    """
    logger.info("Generating HTML display for bookmarks")
    cards = []
    for index, bookmark in enumerate(bookmarks, start=1):
        is_dead = bool(bookmark.get('dead_link'))
        status = "Dead Link" if is_dead else "Active"
        card_style = "border: 2px solid #D32F2F;" if is_dead else ""
        text_style = "color: #D32F2F;" if is_dead else ""

        # html.escape also escapes quotes, so the URL is safe inside href="...".
        title = html.escape(str(bookmark['title']))
        url = html.escape(str(bookmark['url']))
        etag = html.escape(str(bookmark.get('etag', 'N/A')))
        summary = html.escape(str(bookmark.get('summary', '')))
        category = html.escape(str(bookmark.get('category', 'Uncategorized')))

        cards.append(f'''
<div class="card" style="{card_style}">
  <div class="card-content">
    <h3 style="{text_style}">{index}. {title}</h3>
    <p style="{text_style}"><strong>Category:</strong> {category}</p>
    <p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
    <p style="{text_style}"><strong>Status:</strong> {status}</p>
    <p style="{text_style}"><strong>ETag:</strong> {etag}</p>
    <p style="{text_style}"><strong>Summary:</strong> {summary}</p>
  </div>
</div>
''')
    logger.info("HTML display generated")
    return ''.join(cards)
|
|
|
|
|
|
|
|
def process_uploaded_file(file):
    """Parse, enrich, categorise and index an uploaded bookmarks file.

    Replaces the module-level ``bookmarks`` list and ``faiss_index``.

    Args:
        file: raw bytes of the uploaded HTML file, or ``None`` when nothing
            was uploaded.

    Returns:
        A 4-tuple ``(message, bookmarks_html, selector_update, manage_html)``
        matching the outputs wired to the "Process Bookmarks" button. On any
        failure the message describes the problem, both HTML values are ``''``
        and the selector update is a no-op ``gr.update()``.
    """
    global bookmarks, faiss_index
    logger.info("Processing uploaded file")

    if file is None:
        logger.warning("No file uploaded")
        return "Please upload a bookmarks HTML file.", '', gr.update(), ''

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError as e:
        logger.error("Error decoding the file: %s", e)
        return "Error decoding the file. Please ensure it's a valid HTML file.", '', gr.update(), ''

    try:
        bookmarks = parse_bookmarks(file_content)
    except Exception as e:
        logger.error("Error parsing bookmarks: %s", e)
        return "Error parsing the bookmarks HTML file.", '', gr.update(), ''

    if not bookmarks:
        logger.warning("No bookmarks found in the uploaded file")
        return "No bookmarks found in the uploaded file.", '', gr.update(), ''

    # Fetch status/description for every URL concurrently.
    try:
        asyncio.run(process_bookmarks_async(bookmarks))
    except Exception as e:
        logger.error("Error processing bookmarks asynchronously: %s", e)
        return "Error processing bookmarks.", '', gr.update(), ''

    for bookmark in bookmarks:
        generate_summary(bookmark)
        assign_category(bookmark)

    try:
        faiss_index, _ = vectorize_and_index(bookmarks)
    except Exception as e:
        logger.error("Error building FAISS index: %s", e)
        return "Error building search index.", '', gr.update(), ''

    message = f"Successfully processed {len(bookmarks)} bookmarks."
    logger.info(message)

    # Render once; the same HTML feeds both the upload tab and the manage tab.
    bookmark_html = display_bookmarks()

    choices = [
        f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
        for i, bookmark in enumerate(bookmarks)
    ]
    bookmark_selector_update = gr.update(choices=choices, value=[])

    return message, bookmark_html, bookmark_selector_update, bookmark_html
|
|
|
|
|
|
|
|
def delete_selected_bookmarks(selected_indices):
    """Delete the selected bookmarks and rebuild the search index.

    Args:
        selected_indices: selector labels of the form
            ``"<n>. <title> (Category: ...)"``; the 1-based ``<n>`` before the
            first ``.`` identifies the bookmark.

    Returns:
        A 3-tuple ``(message, selector_update, bookmarks_html)`` matching the
        outputs wired to the delete button. When nothing is selected both
        components are left unchanged.
    """
    global bookmarks, faiss_index

    if not selected_indices:
        # gr.update() leaves the listing as is; returning '' would blank it.
        return "No bookmarks selected.", gr.update(), gr.update()

    # De-duplicate, then delete from the end so earlier positions stay valid.
    indices = sorted(
        {int(s.split('.', 1)[0]) - 1 for s in selected_indices},
        reverse=True,
    )
    for idx in indices:
        if 0 <= idx < len(bookmarks):
            logger.info("Deleting bookmark at index %d", idx + 1)
            bookmarks.pop(idx)

    if bookmarks:
        try:
            faiss_index, _ = vectorize_and_index(bookmarks)
        except Exception as e:
            # Never keep an index whose rows no longer line up with bookmarks.
            faiss_index = None
            logger.error("Error rebuilding FAISS index after deletion: %s", e)
    else:
        faiss_index = None

    message = "Selected bookmarks deleted successfully."
    logger.info(message)

    choices = [
        f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
        for i, bookmark in enumerate(bookmarks)
    ]
    bookmark_selector_update = gr.update(choices=choices, value=[])

    bookmarks_html = display_bookmarks()
    return message, bookmark_selector_update, bookmarks_html
|
|
|
|
|
|
|
|
def edit_selected_bookmarks_category(selected_indices, new_category):
    """Set the category of every selected bookmark to *new_category*.

    Args:
        selected_indices: selector labels of the form
            ``"<n>. <title> (Category: ...)"``; the 1-based ``<n>`` before the
            first ``.`` identifies the bookmark.
        new_category: category name to assign.

    Returns:
        A 3-tuple ``(message, selector_update, bookmarks_html)`` matching the
        outputs wired to the edit button. When the selection or the category
        is missing, both components are left unchanged.
    """
    # The early returns use the same (message, selector, html) order as the
    # success path; gr.update() is a no-op for either component.
    if not selected_indices:
        return "No bookmarks selected.", gr.update(), gr.update()
    if not new_category:
        return "No new category selected.", gr.update(), gr.update()

    for label in selected_indices:
        idx = int(label.split('.', 1)[0]) - 1
        if 0 <= idx < len(bookmarks):
            bookmarks[idx]['category'] = new_category

    message = "Category updated for selected bookmarks."
    logger.info(message)

    # Labels embed the category, so the selector choices must be regenerated.
    choices = [
        f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
        for i, bookmark in enumerate(bookmarks)
    ]
    bookmark_selector_update = gr.update(choices=choices, value=[])

    bookmarks_html = display_bookmarks()
    return message, bookmark_selector_update, bookmarks_html
|
|
|
|
|
|
|
|
def export_bookmarks():
    """Build a download link for the current bookmarks as an HTML file.

    The document is a Netscape-bookmark-style list of ``<DT><A href=...>``
    entries carrying only each bookmark's URL and title; it is embedded in
    the link as a base64 ``data:`` URI.

    Returns:
        An ``<a ... download="bookmarks.html">`` HTML snippet on success, or a
        plain message string when there is nothing to export or the export
        fails.
    """
    if not bookmarks:
        logger.warning("No bookmarks to export")
        return "No bookmarks to export."

    try:
        logger.info("Exporting bookmarks to HTML")

        soup = BeautifulSoup(
            "<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>",
            'html.parser',
        )
        dl = soup.new_tag('DL')
        for bookmark in bookmarks:
            dt = soup.new_tag('DT')
            a = soup.new_tag('A', href=bookmark['url'])
            a.string = bookmark['title']
            dt.append(a)
            dl.append(dt)
        soup.append(dl)
        html_content = str(soup)

        # Inline the file in the link so no server-side file is needed.
        b64 = base64.b64encode(html_content.encode()).decode()
        href = f'data:text/html;base64,{b64}'
        logger.info("Bookmarks exported successfully")
        return f'<a href="{href}" download="bookmarks.html">Download Exported Bookmarks</a>'
    except Exception as e:
        logger.error("Error exporting bookmarks: %s", e)
        return "Error exporting bookmarks."
|
|
|
|
|
|
|
|
def _select_relevant_bookmarks(user_query, limit):
    """Return up to *limit* ``(position, bookmark)`` pairs to show the model.

    When everything fits, all bookmarks are returned in order. Otherwise the
    FAISS index is queried for the summaries nearest to the query; if the
    index is missing, out of step with ``bookmarks``, or the lookup fails,
    the first *limit* bookmarks are used.
    """
    if len(bookmarks) <= limit:
        return list(enumerate(bookmarks))

    # Only trust the index when its rows line up one-to-one with bookmarks.
    if faiss_index is not None and faiss_index.ntotal == len(bookmarks):
        try:
            query_vector = np.asarray(
                embedding_model.encode([user_query]), dtype='float32'
            )
            _, neighbours = faiss_index.search(query_vector, limit)
            # FAISS pads missing results with -1; drop anything out of range.
            positions = [int(i) for i in neighbours[0] if 0 <= i < len(bookmarks)]
            if positions:
                return [(i, bookmarks[i]) for i in positions]
        except Exception as e:
            logger.warning(
                "Semantic retrieval failed, using the first %d bookmarks: %s",
                limit, e,
            )

    return list(enumerate(bookmarks[:limit]))


def chatbot_response(user_query):
    """Answer a question about the user's bookmarks via the Groq chat API.

    At most 50 bookmarks are placed in the prompt. When the collection is
    larger, the ones whose summaries are closest to the query (per the FAISS
    index) are chosen; each is numbered by its position in the full list.

    Args:
        user_query: the user's question.

    Returns:
        The model's answer, or a human-readable message when the API key is
        missing, no bookmarks are loaded, the query is empty, or the request
        fails.
    """
    if not GROQ_API_KEY:
        logger.warning("GROQ_API_KEY not set.")
        return "API key not set. Please set the GROQ_API_KEY environment variable in the Hugging Face Space settings."

    if not bookmarks:
        logger.warning("No bookmarks available for chatbot")
        return "No bookmarks available. Please upload and process your bookmarks first."

    if not user_query or not user_query.strip():
        logger.warning("Empty chatbot query")
        return "Please enter a query."

    logger.info("Chatbot received query: %s", user_query)

    try:
        max_bookmarks = 50
        selected = _select_relevant_bookmarks(user_query, max_bookmarks)
        bookmark_data = ''.join(
            f"{position+1}. Title: {bookmark['title']}\n"
            f"URL: {bookmark['url']}\n"
            f"Summary: {bookmark.get('summary', '')}\n\n"
            for position, bookmark in selected
        )

        prompt = f"""
You are an assistant that helps users find relevant bookmarks from their collection based on their queries.

User Query:
{user_query}

Bookmarks:
{bookmark_data}

Please identify the most relevant bookmarks that match the user's query. Provide a concise list including the index, title, URL, and a brief summary.
"""

        # Module-style openai call, matching the openai.api_key /
        # openai.api_base configuration set at module level.
        response = openai.ChatCompletion.create(
            model='llama3-8b-8192',
            messages=[
                {"role": "system", "content": "You help users find relevant bookmarks based on their queries."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=500,
            temperature=0.7,
        )

        answer = response['choices'][0]['message']['content'].strip()
        logger.info("Chatbot response generated using Groq Cloud API")
        return answer

    except Exception as e:
        # The module logger already writes to stdout, so no extra print().
        error_message = f"Error processing your query: {str(e)}"
        logger.error(error_message)
        return error_message
|
|
|
|
|
|
|
|
def build_app():
    """Build the three-tab Gradio UI and launch it.

    Tabs: upload/process, chat, and manage (delete, re-categorise, export).
    Any error raised while building or launching is logged, not propagated.
    """
    try:
        logger.info("Building Gradio app")
        with gr.Blocks(css="app.css") as demo:
            gr.Markdown("<h1>Bookmark Manager App</h1>")

            with gr.Tab("Upload and Process Bookmarks"):
                upload = gr.File(label="Upload Bookmarks HTML File", type='binary')
                process_button = gr.Button("Process Bookmarks")
                output_text = gr.Textbox(label="Output")
                bookmark_display = gr.HTML(label="Bookmarks")

            with gr.Tab("Chat with Bookmarks"):
                user_input = gr.Textbox(label="Ask about your bookmarks")
                chat_output = gr.Textbox(label="Chatbot Response")
                chat_button = gr.Button("Send")

                chat_button.click(
                    chatbot_response,
                    inputs=user_input,
                    outputs=chat_output,
                )

            with gr.Tab("Manage Bookmarks"):
                manage_output = gr.Textbox(label="Manage Output", interactive=False)
                # Initial content is passed to the constructors instead of
                # being assigned to attributes after construction.
                bookmark_display_manage = gr.HTML(
                    label="Bookmarks",
                    value=display_bookmarks(),
                )
                bookmark_selector = gr.CheckboxGroup(
                    label="Select Bookmarks",
                    choices=[
                        f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                        for i, bookmark in enumerate(bookmarks)
                    ],
                )

                new_category_input = gr.Dropdown(label="New Category", choices=CATEGORIES)
                with gr.Row():
                    delete_button = gr.Button("Delete Selected Bookmarks")
                    edit_category_button = gr.Button("Edit Category of Selected Bookmarks")
                    export_button = gr.Button("Export Bookmarks")
                download_link = gr.HTML(label="Download Exported Bookmarks")

                delete_button.click(
                    delete_selected_bookmarks,
                    inputs=bookmark_selector,
                    outputs=[manage_output, bookmark_selector, bookmark_display_manage],
                )

                edit_category_button.click(
                    edit_selected_bookmarks_category,
                    inputs=[bookmark_selector, new_category_input],
                    outputs=[manage_output, bookmark_selector, bookmark_display_manage],
                )

                export_button.click(
                    export_bookmarks,
                    inputs=None,
                    outputs=download_link,
                )

            # Registered exactly once, after the manage-tab components exist,
            # so a single click processes the file a single time and
            # refreshes both tabs.
            process_button.click(
                process_uploaded_file,
                inputs=upload,
                outputs=[output_text, bookmark_display, bookmark_selector, bookmark_display_manage],
            )

        logger.info("Launching Gradio app")
        demo.launch(debug=True)
    except Exception as e:
        # logger.exception records the traceback; the handler writes to stdout.
        logger.exception("Error building the app: %s", e)
|
|
|
|
|
# Build and launch the UI only when executed as a script, not on import.
if __name__ == "__main__":
    build_app()
|
|
|