# NOTE(review): the lines below were Hugging Face Spaces file-viewer residue
# (page status, git blob hashes, and the line-number gutter) captured along
# with the source; commented out so the module parses as Python.
# Spaces: Running / File size: 5,398 Bytes / blob hashes + gutter omitted.
import os
import time
import threading
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
import logging
# Suppress warnings from urllib3
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Environment variable keys for API access (one Groq key per model tier).
GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')
# LLM Models: small/fast model for short content, large model for long content.
MODEL_BASIC = 'llama-3.1-8b-instant'
MODEL_ADVANCED = 'llama-3.1-70b-versatile'
# Verify API keys: fail fast at import time. `exit()` would terminate with
# status 0 (success) despite the error; SystemExit(1) reports failure to the
# shell and to any process supervisor.
if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
    logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
    raise SystemExit(1)
# Embedding model and FAISS index initialization.
# Sentence embedder used by vectorize_and_index to turn summaries into vectors.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Built lazily by vectorize_and_index once bookmarks have summaries.
faiss_index = None
# Module-level store of parsed bookmark dicts (url/title/html_content/...).
bookmarks = []
# Define categories.
# Closed label set the LLM is asked to choose from; "Uncategorized" also
# serves as the fallback when summarization/parsing fails.
CATEGORIES = [
    "Social Media", "News and Media", "Education and Learning", "Entertainment",
    "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness",
    "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture",
    "Government and Politics", "Business and Economy", "Science and Research",
    "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio",
    "Videos and Movies", "Reference and Knowledge Bases", "Dead Link", "Uncategorized"
]
# Task routing logic
def select_model_for_task(content_length):
    """Route a task to an LLM tier by content size.

    Returns an ``(api_key, model_name)`` pair: the basic (small/fast) tier
    for content under 500 characters, the advanced tier otherwise.
    """
    if content_length < 500:
        # Short pages are simple enough for the small model.
        return GROQ_API_KEY_BASIC, MODEL_BASIC
    # Longer content gets the larger, more capable model.
    return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED
# Fetch URL info function
def fetch_url_info(bookmark):
    """Fetch a bookmark's page in place, recording HTML text and HTTP status.

    On any failure the bookmark gets empty content and a status of 'Error';
    the exception is logged rather than propagated so concurrent fetching of
    the remaining bookmarks continues.
    """
    url = bookmark['url']
    try:
        # verify=False tolerates sites with broken TLS certificates; the
        # matching urllib3 warning is suppressed at module level.
        resp = requests.get(url, timeout=10, verify=False)
        bookmark['html_content'] = resp.text
        bookmark['status_code'] = resp.status_code
    except Exception as exc:
        logger.error(f"Failed to fetch URL info for {url}: {exc}")
        bookmark['html_content'] = ''
        bookmark['status_code'] = 'Error'
# Generate summary and assign category
def generate_summary_and_assign_category(bookmark):
    """Summarize a bookmark's page with an LLM and assign it a category.

    Mutates ``bookmark`` in place, setting 'summary' and 'category'. On any
    failure (network, HTTP error, malformed response) falls back to
    'No summary available.' / 'Uncategorized' and logs the error.
    """
    html = bookmark.get('html_content', '')
    api_key, model_name = select_model_for_task(len(html))
    # Prepare the prompt. The page is truncated so very large pages do not
    # blow past the model's context window.
    prompt = f"""
You are an assistant. Summarize the following webpage content:
{html[:16000]}
Assign one category from this list: {', '.join(CATEGORIES)}.
Respond in the format:
Summary: [Your summary]
Category: [One category]
"""
    try:
        response = requests.post(
            # BUG FIX: the keys above are Groq keys and the models are
            # Groq-hosted llama models, so requests must go to Groq's
            # OpenAI-compatible endpoint, not api.openai.com.
            "https://api.groq.com/openai/v1/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": 150,
                "temperature": 0.7,
            },
            timeout=30,  # don't let a hung request block the worker thread forever
        )
        response.raise_for_status()  # surface 4xx/5xx instead of parsing an error body
        content = response.json()['choices'][0]['message']['content']
        summary, category = _parse_summary_and_category(content)
        bookmark['summary'] = summary
        bookmark['category'] = category
    except Exception as e:
        logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
        bookmark['summary'] = 'No summary available.'
        bookmark['category'] = 'Uncategorized'


def _parse_summary_and_category(content):
    """Split an LLM reply of the form 'Summary: ... Category: ...'.

    Raises ValueError when either marker is missing or out of order, so the
    caller's fallback path is taken instead of silently storing garbage
    (the old code sliced from find()'s -1 on a miss).
    """
    summary_start = content.find("Summary:")
    category_start = content.find("Category:")
    if summary_start == -1 or category_start == -1 or category_start < summary_start:
        raise ValueError("LLM response missing 'Summary:'/'Category:' markers")
    summary = content[summary_start + len("Summary:"):category_start].strip()
    category = content[category_start + len("Category:"):].strip()
    return summary, category
# Vectorize summaries and build FAISS index
def vectorize_and_index(bookmarks):
    """Embed bookmark summaries and rebuild the global FAISS index.

    Stores an ID-mapped L2 index in the module-level ``faiss_index``; each
    vector's ID is the bookmark's position in ``bookmarks``. No-op (keeps
    any existing index) when the list is empty, since encoding an empty
    batch would crash on shape access.
    """
    global faiss_index
    if not bookmarks:
        logger.warning("vectorize_and_index called with no bookmarks; skipping.")
        return
    summaries = [b.get('summary', '') for b in bookmarks]
    # FAISS requires float32 vectors; be explicit rather than relying on the
    # encoder's output dtype.
    embeddings = np.asarray(embedding_model.encode(summaries), dtype=np.float32)
    dimension = embeddings.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
    # FAISS requires int64 ids; np.arange defaults to int32 on some platforms.
    ids = np.arange(len(bookmarks), dtype=np.int64)
    index.add_with_ids(embeddings, ids)
    faiss_index = index
# Gradio interface setup
def process_bookmarks(file):
    """Parse an uploaded bookmarks HTML export and enrich each entry.

    Accepts either raw ``bytes`` (what ``gr.File(type="binary")`` actually
    delivers) or a file-like object with ``.read()``. Populates the
    module-level ``bookmarks`` list, fetches and summarizes each URL
    concurrently, rebuilds the FAISS index, and returns the bookmark list.
    """
    global bookmarks
    # BUG FIX: gradio's binary File component hands the handler bytes, not a
    # file object, so calling .read() unconditionally fails. Support both.
    raw = file if isinstance(file, (bytes, bytearray)) else file.read()
    # errors='replace' keeps a single bad byte from aborting the whole import.
    soup = BeautifulSoup(raw.decode('utf-8', errors='replace'), 'html.parser')
    # Parse <a href=...> anchors into bookmark dicts.
    bookmarks = [
        {'url': link.get('href'), 'title': link.text, 'html_content': ''}
        for link in soup.find_all('a') if link.get('href')
    ]
    # Fetch URLs concurrently (I/O-bound, so threads overlap the waits).
    with ThreadPoolExecutor() as executor:
        list(executor.map(fetch_url_info, bookmarks))
    # Process bookmarks with the LLM, also concurrently.
    with ThreadPoolExecutor() as executor:
        list(executor.map(generate_summary_and_assign_category, bookmarks))
    # Rebuild the semantic index over the fresh summaries.
    vectorize_and_index(bookmarks)
    return bookmarks
# Build Gradio app.
# Minimal UI: upload a bookmarks HTML export, click Process, and read one
# "title - category" line per bookmark in the output textbox.
with gr.Blocks() as demo:
    gr.Markdown("# Smart Bookmark Manager")
    # type="binary" — process_bookmarks receives the raw upload content.
    file_input = gr.File(label="Upload Bookmark File", type="binary")
    submit_button = gr.Button("Process")
    output = gr.Textbox(label="Output")
    def handle_submit(file):
        # Run the full pipeline, then render titles with assigned categories.
        processed = process_bookmarks(file)
        return "\n".join([f"{b['title']} - {b['category']}" for b in processed])
    submit_button.click(handle_submit, inputs=file_input, outputs=output)
demo.launch()