# app.py
"""Gradio app for importing, summarizing, searching, and managing bookmarks.

A browser bookmarks HTML export is parsed into bookmark dicts, each URL is
fetched and summarized, the summaries are embedded and stored in a FAISS
index, and a small Gradio UI exposes upload, search, and edit/delete actions.
"""
import gradio as gr
from bs4 import BeautifulSoup
import requests
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Tunables.
TOP_K = 5                       # maximum number of search hits returned
REQUEST_TIMEOUT_SECONDS = 5     # per-URL fetch timeout
MAX_SUMMARY_INPUT_CHARS = 2000  # page text is cut to this length before summarizing

# Initialize models and variables
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
faiss_index = None  # Renamed from 'index' to 'faiss_index'
bookmarks = []
fetch_cache = {}


# Helper functions
def parse_bookmarks(file_content):
    """Extract bookmarks from the text of a bookmarks HTML export.

    Args:
        file_content: HTML markup as a string.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts, one per ``<a>`` tag
        that has both a non-empty ``href`` and non-empty text.
    """
    soup = BeautifulSoup(file_content, 'html.parser')
    extracted_bookmarks = []
    for link in soup.find_all('a'):
        url = link.get('href')
        title = link.text
        if url and title:
            extracted_bookmarks.append({'url': url, 'title': title})
    return extracted_bookmarks


def fetch_url_info(bookmark):
    """Fetch ``bookmark['url']`` and record what was found on the bookmark.

    Sets ``etag``, ``status_code``, ``dead_link``, ``meta_tags`` and
    ``content`` on ``bookmark`` (mutated in place). Results are memoized in
    ``fetch_cache`` by URL, so each URL is requested at most once per process.
    Any exception during the fetch or parse marks the bookmark as a dead link
    instead of propagating.

    Args:
        bookmark: dict with at least a ``'url'`` key.

    Returns:
        The same ``bookmark`` dict.
    """
    url = bookmark['url']
    if url in fetch_cache:
        bookmark.update(fetch_cache[url])
        return bookmark

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        bookmark['etag'] = response.headers.get('ETag', 'N/A')
        bookmark['status_code'] = response.status_code
        if response.status_code >= 400:
            bookmark['dead_link'] = True
            # Reset meta_tags too, so a bookmark whose URL was edited does not
            # keep the previous page's metadata.
            bookmark['meta_tags'] = {}
            bookmark['content'] = ''
        else:
            bookmark['dead_link'] = False
            soup = BeautifulSoup(response.content, 'html.parser')
            meta_tags = {
                meta.get('name', ''): meta.get('content', '')
                for meta in soup.find_all('meta')
            }
            bookmark['meta_tags'] = meta_tags
            bookmark['content'] = soup.get_text(separator=' ', strip=True)
    except Exception:
        # Deliberate best-effort: any failure means "treat as dead link".
        bookmark['dead_link'] = True
        bookmark['etag'] = 'N/A'
        bookmark['status_code'] = 'N/A'
        bookmark['meta_tags'] = {}
        bookmark['content'] = ''
    finally:
        fetch_cache[url] = {
            'etag': bookmark.get('etag'),
            'status_code': bookmark.get('status_code'),
            'dead_link': bookmark.get('dead_link'),
            'meta_tags': bookmark.get('meta_tags'),
            'content': bookmark.get('content'),
        }
    return bookmark


def generate_summary(bookmark):
    """Set ``bookmark['summary']`` from ``bookmark['content']``.

    Empty or missing content yields a fixed placeholder summary.

    Args:
        bookmark: dict, mutated in place.

    Returns:
        The same ``bookmark`` dict.
    """
    content = bookmark.get('content', '')
    if content:
        # Limit content to the first characters to save resources.
        content = content[:MAX_SUMMARY_INPUT_CHARS]
        # truncation=True keeps token-dense text within the model's input
        # limit; the character cap alone does not bound the token count.
        summary = summarizer(
            content,
            max_length=50,
            min_length=25,
            do_sample=False,
            truncation=True,
        )
        bookmark['summary'] = summary[0]['summary_text']
    else:
        bookmark['summary'] = 'No content available to summarize.'
    return bookmark


def vectorize_and_index(bookmarks):
    """Embed every bookmark summary and build a fresh L2 FAISS index.

    Args:
        bookmarks: non-empty list of dicts that each have a ``'summary'`` key.

    Returns:
        ``(index, embeddings)``: the new ``faiss.IndexFlatL2`` and the
        embedding matrix that was added to it, in bookmark order.
    """
    summaries = [bookmark['summary'] for bookmark in bookmarks]
    # FAISS requires float32 input; make that explicit.
    embeddings = np.asarray(embedding_model.encode(summaries), dtype=np.float32)
    dimension = embeddings.shape[1]
    faiss_idx = faiss.IndexFlatL2(dimension)
    faiss_idx.add(embeddings)
    return faiss_idx, embeddings


def _read_uploaded_text(file):
    """Return the UTF-8 text of an upload given as path, bytes, or file object."""
    if isinstance(file, (bytes, bytearray)):
        return bytes(file).decode('utf-8')
    if isinstance(file, str):
        with open(file, encoding='utf-8') as handle:
            return handle.read()
    data = file.read()
    return data.decode('utf-8') if isinstance(data, bytes) else data


def process_uploaded_file(file):
    """Parse an uploaded bookmarks file, then fetch, summarize and index it.

    Replaces the module-level ``bookmarks`` list and ``faiss_index``.

    Args:
        file: the value delivered by the upload component, or ``None``.

    Returns:
        A status message for display.
    """
    global bookmarks, faiss_index
    if file is None:
        return "Please upload a bookmarks HTML file."

    file_content = _read_uploaded_text(file)
    bookmarks = parse_bookmarks(file_content)
    if not bookmarks:
        # Nothing to embed; building an index from zero summaries would fail.
        faiss_index = None
        return "No bookmarks found in the uploaded file."

    for bookmark in bookmarks:
        fetch_url_info(bookmark)
        generate_summary(bookmark)
    faiss_index, _ = vectorize_and_index(bookmarks)
    return f"Successfully processed {len(bookmarks)} bookmarks."


def chatbot_response(user_query):
    """Return the bookmarks whose summaries are closest to ``user_query``.

    Args:
        user_query: free-text query string.

    Returns:
        Up to ``TOP_K`` matches formatted as Title / URL / Summary stanzas
        separated by blank lines, or a hint to upload bookmarks first.
    """
    if faiss_index is None or not bookmarks:
        return "No bookmarks available. Please upload and process your bookmarks first."

    # Vectorize user query
    user_embedding = np.asarray(
        embedding_model.encode([user_query]), dtype=np.float32
    )
    # Never ask for more neighbours than are indexed: FAISS pads missing
    # results with label -1, which would index the last bookmark in Python.
    k = min(TOP_K, faiss_index.ntotal)
    _, neighbours = faiss_index.search(user_embedding, k=k)

    # Generate response
    stanzas = []
    for idx in neighbours[0]:
        idx = int(idx)
        if idx < 0 or idx >= len(bookmarks):
            continue
        bookmark = bookmarks[idx]
        stanzas.append(
            f"Title: {bookmark['title']}\n"
            f"URL: {bookmark['url']}\n"
            f"Summary: {bookmark['summary']}"
        )
    return "\n\n".join(stanzas).strip()


def display_bookmarks():
    """Return table rows ``[index, title, url, status]`` for every bookmark."""
    return [
        [
            i,
            bookmark['title'],
            bookmark['url'],
            "Dead Link" if bookmark.get('dead_link') else "Active",
        ]
        for i, bookmark in enumerate(bookmarks)
    ]


def edit_bookmark(bookmark_idx, new_title, new_url):
    """Change a bookmark's title and URL, then re-fetch, re-summarize, re-index.

    Args:
        bookmark_idx: position in ``bookmarks``; converted with ``int()``.
        new_title: replacement title.
        new_url: replacement URL.

    Returns:
        A success message, or ``"Error: ..."`` describing what went wrong.
    """
    global faiss_index  # Reference the global faiss_index variable
    try:
        bookmark_idx = int(bookmark_idx)
        target = bookmarks[bookmark_idx]
        target['title'] = new_title
        target['url'] = new_url
        fetch_url_info(target)
        generate_summary(target)
        # Rebuild the FAISS index
        faiss_index, _ = vectorize_and_index(bookmarks)
        return "Bookmark updated successfully."
    except Exception as e:
        return f"Error: {str(e)}"


def delete_bookmark(bookmark_idx):
    """Remove a bookmark by position and rebuild (or clear) the index.

    Args:
        bookmark_idx: position in ``bookmarks``; converted with ``int()``.

    Returns:
        A success message, or ``"Error: ..."`` describing what went wrong.
    """
    global faiss_index  # Reference the global faiss_index variable
    try:
        bookmark_idx = int(bookmark_idx)
        bookmarks.pop(bookmark_idx)
        # Rebuild the FAISS index
        if bookmarks:
            faiss_index, _ = vectorize_and_index(bookmarks)
        else:
            faiss_index = None  # No bookmarks left
        return "Bookmark deleted successfully."
    except Exception as e:
        return f"Error: {str(e)}"


def build_app():
    """Assemble the three-tab Gradio interface and launch it."""
    with gr.Blocks() as demo:
        gr.Markdown("# Bookmark Manager App")

        with gr.Tab("Upload and Process Bookmarks"):
            upload = gr.File(label="Upload Bookmarks HTML File")
            process_button = gr.Button("Process Bookmarks")
            output_text = gr.Textbox(label="Output")

            process_button.click(
                process_uploaded_file,
                inputs=upload,
                outputs=output_text
            )

        with gr.Tab("Chat with Bookmarks"):
            user_input = gr.Textbox(label="Ask about your bookmarks")
            chat_output = gr.Textbox(label="Chatbot Response")
            chat_button = gr.Button("Send")

            chat_button.click(
                chatbot_response,
                inputs=user_input,
                outputs=chat_output
            )

        with gr.Tab("Manage Bookmarks"):
            bookmark_table = gr.Dataframe(
                headers=["Index", "Title", "URL", "Status"],
                datatype=["number", "str", "str", "str"],
                interactive=False
            )
            refresh_button = gr.Button("Refresh Bookmark List")

            with gr.Row():
                index_input = gr.Number(label="Bookmark Index")
                new_title_input = gr.Textbox(label="New Title")
                new_url_input = gr.Textbox(label="New URL")

            edit_button = gr.Button("Edit Bookmark")
            delete_button = gr.Button("Delete Bookmark")
            manage_output = gr.Textbox(label="Manage Output")

            refresh_button.click(
                display_bookmarks,
                inputs=None,
                outputs=bookmark_table
            )

            edit_button.click(
                edit_bookmark,
                inputs=[index_input, new_title_input, new_url_input],
                outputs=manage_output
            )

            delete_button.click(
                delete_bookmark,
                inputs=index_input,
                outputs=manage_output
            )

    demo.launch()


if __name__ == "__main__":
    build_app()