import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
import requests
import re
# Function to extract text from HTML (from shopping_assistant.py)
def extract_text_from_html(html):
    """
    Extract text from HTML without using BeautifulSoup
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', html)
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Decode common HTML entities; '&amp;' is decoded last so that
    # escaped sequences like '&amp;lt;' are not double-decoded
    text = text.replace('&nbsp;', ' ').replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
    return text.strip()
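# Illustrative example (not executed):
#   extract_text_from_html('<p>Save&nbsp;20% on <b>headphones</b> &amp; more</p>')
#   -> 'Save 20% on headphones & more'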
# Function to fetch deals from DealsFinders.com (from shopping_assistant.py)
def fetch_deals_data(url="https://www.dealsfinders.com/wp-json/wp/v2/posts", num_pages=2, per_page=100):
    """
    Fetch deals data exclusively from the DealsFinders API
    """
    all_deals = []
    # Fetch from the DealsFinders API, one page at a time
    for page in range(1, num_pages + 1):
        try:
            # Send a browser-like user agent to avoid being blocked
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
            }
            # A timeout keeps a stalled request from hanging the app
            response = requests.get(f"{url}?page={page}&per_page={per_page}", headers=headers, timeout=10)
            if response.status_code == 200:
                deals = response.json()
                all_deals.extend(deals)
                print(f"Fetched page {page} with {len(deals)} deals from DealsFinders API")
                # If we get fewer deals than requested, we've reached the end
                if len(deals) < per_page:
                    print(f"Reached the end of available deals at page {page}")
                    break
            else:
                print(f"Failed to fetch page {page} from DealsFinders API: {response.status_code}")
                break
        except Exception as e:
            print(f"Error fetching page {page} from DealsFinders API: {str(e)}")
            break
    return all_deals
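# Each post returned by the WordPress REST API is a JSON object shaped roughly
# like the following (fields beyond those used here are omitted):
#   {"id": 123, "date": "2024-01-01T00:00:00", "link": "https://...",
#    "title": {"rendered": "..."}, "content": {"rendered": "..."},
#    "excerpt": {"rendered": "..."}}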
# Function to process deals data (from shopping_assistant.py)
def process_deals_data(deals_data):
    """
    Process the deals data into a structured format
    """
    processed_deals = []
    for deal in deals_data:
        try:
            # Extract relevant information using our HTML text extractor
            content_html = deal.get('content', {}).get('rendered', '')
            excerpt_html = deal.get('excerpt', {}).get('rendered', '')
            clean_content = extract_text_from_html(content_html)
            clean_excerpt = extract_text_from_html(excerpt_html)
            processed_deal = {
                'id': deal.get('id'),
                'title': deal.get('title', {}).get('rendered', ''),
                'link': deal.get('link', ''),
                'date': deal.get('date', ''),
                'content': clean_content,
                'excerpt': clean_excerpt
            }
            processed_deals.append(processed_deal)
        except Exception as e:
            # Skip malformed entries and keep processing the rest
            print(f"Error processing deal: {str(e)}")
    return processed_deals
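# After processing, each deal is a flat dict of plain strings, e.g.
# (illustrative values):
#   {'id': 123, 'title': 'Wireless Earbuds $19.99', 'link': 'https://...',
#    'date': '2024-01-01T00:00:00', 'content': '...', 'excerpt': '...'}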
# Define product categories
category_descriptions = {
    "electronics": "Electronic devices like headphones, speakers, TVs, smartphones, and gadgets",
    "computers": "Laptops, desktops, computer parts, monitors, and computing accessories",
    "mobile": "Mobile phones, smartphones, phone cases, screen protectors, and chargers",
    "audio": "Headphones, earbuds, speakers, microphones, and audio equipment",
    "clothing": "Clothes, shirts, pants, dresses, and fashion items",
    "footwear": "Shoes, boots, sandals, slippers, and all types of footwear",
    "home": "Home decor, furniture, bedding, and household items",
    "kitchen": "Kitchen appliances, cookware, utensils, and kitchen gadgets",
    "toys": "Toys, games, and children's entertainment items",
    "sports": "Sports equipment, fitness gear, and outdoor recreation items",
    "beauty": "Beauty products, makeup, skincare, and personal care items",
    "books": "Books, e-books, audiobooks, and reading materials"
}
# List of categories
categories = list(category_descriptions.keys())
# Try to load the recommended models
try:
    # 1. Load BART model for zero-shot classification
    from transformers import pipeline
    # Initialize the zero-shot classification pipeline
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    print("Using facebook/bart-large-mnli for classification")
    # 2. Load MPNet model for semantic search
    from sentence_transformers import SentenceTransformer, util
    # Load the sentence transformer model
    sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    print("Using sentence-transformers/all-mpnet-base-v2 for semantic search")
    # Pre-compute embeddings for category descriptions
    category_texts = list(category_descriptions.values())
    category_embeddings = sentence_model.encode(category_texts, convert_to_tensor=True)
    # Using recommended models
    using_recommended_models = True
except Exception as e:
    # Fall back to local model if recommended models fail to load
    print(f"Error loading recommended models: {str(e)}")
    print("Falling back to local model")
    model_path = os.path.dirname(os.path.abspath(__file__))
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    # Load the local categories
    try:
        with open(os.path.join(model_path, "categories.json"), "r") as f:
            categories = json.load(f)
    except Exception as e:
        print(f"Error loading categories: {str(e)}")
        categories = ["electronics", "clothing", "home", "kitchen", "toys", "other"]
    # Not using recommended models
    using_recommended_models = False
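# Note: category_embeddings computed in the try-block above is not consumed by
# the code below; it could support direct embedding-based category matching,
# e.g. (illustrative sketch only):
#   q = sentence_model.encode("wireless earbuds", convert_to_tensor=True)
#   sims = util.cos_sim(q, category_embeddings)[0]  # one similarity per category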
# Global variable to store deals data
deals_cache = None
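# The cache lives for the lifetime of the process: deals are fetched on the
# first query and reused for later queries until the app is restarted.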
def classify_text(text, fetch_deals=True):
    """
    Classify the text using the model and fetch relevant deals
    """
    global deals_cache
    # Get the top categories based on the model type
    if using_recommended_models:
        # Using BART for zero-shot classification
        result = classifier(text, categories, multi_label=True)
        # Extract categories and scores (labels come back sorted by score)
        top_categories = []
        for i, (category, score) in enumerate(zip(result['labels'], result['scores'])):
            if score > 0.1:  # Lower threshold for zero-shot classification
                top_categories.append((category, score))
            # Limit to top 3 categories
            if i >= 2:
                break
    else:
        # Using the original classification model
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        # Get the model prediction
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.sigmoid(outputs.logits)
        # Get the top categories
        top_categories = []
        for i, score in enumerate(predictions[0]):
            if score > 0.5:  # Threshold for multi-label classification
                top_categories.append((categories[i], score.item()))
        # Sort by score
        top_categories.sort(key=lambda x: x[1], reverse=True)
    # Format the classification results
    if top_categories:
        result = f"Top categories for '{text}':\n\n"
        for category, score in top_categories:
            result += f"- {category}: {score:.4f}\n"
        result += f"\nBased on your query, I would recommend looking for deals in the **{top_categories[0][0]}** category.\n\n"
    else:
        result = f"No categories found for '{text}'. Please try a different query.\n\n"
    # Fetch and display deals if requested
    if fetch_deals:
        result += "## Relevant Deals from DealsFinders.com\n\n"
        try:
            # Fetch deals data if not already cached
            if deals_cache is None:
                deals_data = fetch_deals_data(num_pages=2)  # Limit to 2 pages for faster response
                deals_cache = process_deals_data(deals_data)
            # Use MPNet for semantic search if available
            if using_recommended_models:
                # Create deal texts for semantic search
                deal_texts = []
                for deal in deals_cache:
                    # Combine title and excerpt for better matching
                    deal_text = f"{deal['title']} {deal['excerpt']}"
                    deal_texts.append(deal_text)
                # Encode the query and deals
                query_embedding = sentence_model.encode(text, convert_to_tensor=True)
                deal_embeddings = sentence_model.encode(deal_texts, convert_to_tensor=True)
                # Calculate semantic similarity
                similarities = util.cos_sim(query_embedding, deal_embeddings)[0]
                # Get the top 5 most similar deals
                top_indices = torch.topk(similarities, k=min(5, len(deals_cache))).indices
                # Extract the relevant deals
                relevant_deals = [deals_cache[idx] for idx in top_indices]
            else:
                # Fall back to keyword-based search
                query_terms = text.lower().split()
                expanded_terms = list(query_terms)
                # Add related terms based on the query
                if any(term in text.lower() for term in ['headphone', 'headphones']):
                    expanded_terms.extend(['earbuds', 'earphones', 'earpods', 'airpods', 'audio', 'bluetooth', 'wireless'])
                elif any(term in text.lower() for term in ['laptop', 'computer']):
                    expanded_terms.extend(['notebook', 'macbook', 'chromebook', 'pc'])
                elif any(term in text.lower() for term in ['tv', 'television']):
                    expanded_terms.extend(['smart tv', 'roku', 'streaming'])
                elif any(term in text.lower() for term in ['kitchen', 'appliance']):
                    expanded_terms.extend(['mixer', 'blender', 'toaster', 'microwave', 'oven'])
                # Score deals based on relevance to the query
                scored_deals = []
                for deal in deals_cache:
                    title = deal['title'].lower()
                    content = deal['content'].lower()
                    excerpt = deal['excerpt'].lower()
                    score = 0
                    # Check original query terms (higher weight)
                    for term in query_terms:
                        if term in title:
                            score += 10
                        if term in content:
                            score += 3
                        if term in excerpt:
                            score += 3
                    # Check expanded terms (lower weight)
                    for term in expanded_terms:
                        if term not in query_terms:  # Skip original terms
                            if term in title:
                                score += 5
                            if term in content:
                                score += 1
                            if term in excerpt:
                                score += 1
                    # Add to scored deals if it has any relevance
                    if score > 0:
                        scored_deals.append((deal, score))
                # Sort by score (descending)
                scored_deals.sort(key=lambda x: x[1], reverse=True)
                # Extract the top 5 deals from the scored list
                relevant_deals = [deal for deal, _ in scored_deals[:5]]
            if relevant_deals:
                for i, deal in enumerate(relevant_deals, 1):
                    result += f"{i}. [{deal['title']}]({deal['link']})\n\n"
            else:
                result += "No specific deals found for your query. Try a different search term or browse the recommended category.\n\n"
        except Exception as e:
            result += f"Error fetching deals: {str(e)}\n\n"
    return result
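# Illustrative usage (not executed at import time):
#   print(classify_text("I'm looking for headphones", fetch_deals=False))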
# Create the Gradio interface
demo = gr.Interface(
    fn=classify_text,
    inputs=[
        gr.Textbox(
            lines=2,
            placeholder="Enter your shopping query here...",
            label="Shopping Query"
        ),
        gr.Checkbox(
            label="Fetch Deals",
            value=True,
            info="Check to fetch and display deals from DealsFinders.com"
        )
    ],
    outputs=gr.Markdown(label="Results"),
    title="Shopping Assistant",
    description="""
This demo shows how to use the Shopping Assistant model to classify shopping queries into categories and find relevant deals.
Enter a shopping query below to see which categories it belongs to and find deals from DealsFinders.com.

Examples:
- "I'm looking for headphones"
- "Do you have any kitchen appliance deals?"
- "Show me the best laptop deals"
- "I need a new smart TV"
""",
    examples=[
        ["I'm looking for headphones", True],
        ["Do you have any kitchen appliance deals?", True],
        ["Show me the best laptop deals", True],
        ["I need a new smart TV", True],
        ["headphone deals", True]
    ],
    theme=gr.themes.Soft()
)

# Launch the app
if __name__ == "__main__":
    demo.launch()
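# When running locally, demo.launch(share=True) additionally creates a
# temporary public link (a standard Gradio launch option).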