# Source: Hugging Face Spaces - selvaonline/shopping-assistant-demo (Space status when captured: Sleeping)
# File size: 26,081 Bytes

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
import requests
import re

# Function to extract text from HTML (from shopping_assistant.py)
def extract_text_from_html(html):
    """
    Extract plain text from an HTML fragment without using BeautifulSoup.

    Tags are replaced by spaces, character references (named ones such as
    ``&amp;`` and numeric ones such as ``&#8217;``) are decoded exactly once,
    and every run of whitespace, including non-breaking spaces, collapses to
    a single space.

    Args:
        html: HTML markup as a string. ``None`` or ``""`` yields ``""``.

    Returns:
        The text content with leading and trailing whitespace removed.
    """
    # Imported locally and by name because the ``html`` parameter shadows the
    # standard-library module of the same name inside this function.
    from html import unescape as _unescape

    if not html:
        return ''

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities in a single pass so '&amp;lt;' becomes '&lt;', not '<'
    text = _unescape(text)
    # Collapse whitespace last: a decoded &nbsp; (U+00A0) also matches \s
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Built-in fallback deals, shaped like the raw post objects the deals API
# returns ("title", "content" and "excerpt" each wrap a "rendered" HTML string).
SAMPLE_DEALS = [
    {
        "id": 1,
        "title": {"rendered": "Apple AirPods Pro (2nd Generation) - 20% Off"},
        "link": "https://www.example.com/deals/airpods-pro",
        "date": "2025-02-25T10:00:00",
        "content": {"rendered": "<p>Get the latest Apple AirPods Pro (2nd Generation) for 20% off the regular price. These wireless earbuds feature active noise cancellation, transparency mode, and spatial audio with dynamic head tracking.</p><p>Regular price: $249.99</p><p>Deal price: $199.99</p><p>You save: $50.00</p>"},
        "excerpt": {"rendered": "<p>Apple AirPods Pro (2nd Generation) with active noise cancellation and transparency mode. Now 20% off - only $199.99!</p>"},
    },
    {
        "id": 2,
        "title": {"rendered": "Samsung 65\" QLED 4K Smart TV - $300 Off"},
        "link": "https://www.example.com/deals/samsung-qled-tv",
        "date": "2025-02-26T09:30:00",
        "content": {"rendered": "<p>Upgrade your home entertainment with this Samsung 65\" QLED 4K Smart TV. Features Quantum HDR, Motion Xcelerator Turbo+, and Object Tracking Sound for an immersive viewing experience.</p><p>Regular price: $1,299.99</p><p>Deal price: $999.99</p><p>You save: $300.00</p>"},
        "excerpt": {"rendered": "<p>Samsung 65\" QLED 4K Smart TV with Quantum HDR and Object Tracking Sound. Save $300 - now only $999.99!</p>"},
    },
    {
        "id": 3,
        "title": {"rendered": "Sony WH-1000XM5 Wireless Headphones - 25% Off"},
        "link": "https://www.example.com/deals/sony-wh1000xm5",
        "date": "2025-02-26T14:15:00",
        "content": {"rendered": "<p>Experience industry-leading noise cancellation with the Sony WH-1000XM5 wireless headphones. Features 30-hour battery life, quick charging, and exceptional sound quality with the new Integrated Processor V1.</p><p>Regular price: $399.99</p><p>Deal price: $299.99</p><p>You save: $100.00</p>"},
        "excerpt": {"rendered": "<p>Sony WH-1000XM5 wireless headphones with industry-leading noise cancellation and 30-hour battery life. Now 25% off at $299.99!</p>"},
    },
    {
        "id": 4,
        "title": {"rendered": "Bose QuietComfort Ultra Headphones - 20% Off"},
        "link": "https://www.example.com/deals/bose-quietcomfort-ultra",
        "date": "2025-02-25T15:30:00",
        "content": {"rendered": "<p>Experience the ultimate in noise cancellation with Bose QuietComfort Ultra headphones. Features spatial audio, custom EQ, and up to 24 hours of battery life.</p><p>Regular price: $429.99</p><p>Deal price: $343.99</p><p>You save: $86.00</p>"},
        "excerpt": {"rendered": "<p>Bose QuietComfort Ultra headphones with advanced noise cancellation and spatial audio. Now 20% off at $343.99!</p>"},
    },
    {
        "id": 5,
        "title": {"rendered": "Beats Studio Pro Wireless Headphones - 40% Off"},
        "link": "https://www.example.com/deals/beats-studio-pro",
        "date": "2025-02-26T16:30:00",
        "content": {"rendered": "<p>The Beats Studio Pro wireless headphones deliver premium sound with active noise cancellation, transparency mode, and up to 40 hours of battery life.</p><p>Regular price: $349.99</p><p>Deal price: $209.99</p><p>You save: $140.00</p>"},
        "excerpt": {"rendered": "<p>Beats Studio Pro wireless headphones with active noise cancellation and 40-hour battery life. Now 40% off at $209.99!</p>"},
    },
    {
        "id": 6,
        "title": {"rendered": "Dyson V12 Detect Slim Cordless Vacuum - $150 Off"},
        "link": "https://www.example.com/deals/dyson-v12",
        "date": "2025-02-27T08:45:00",
        "content": {"rendered": "<p>The Dyson V12 Detect Slim cordless vacuum features a laser that reveals microscopic dust, an LCD screen that displays particle counts, and powerful suction for deep cleaning.</p><p>Regular price: $649.99</p><p>Deal price: $499.99</p><p>You save: $150.00</p>"},
        "excerpt": {"rendered": "<p>Dyson V12 Detect Slim cordless vacuum with laser dust detection and powerful suction. Save $150 - now only $499.99!</p>"},
    },
    {
        "id": 7,
        "title": {"rendered": "Nintendo Switch OLED Model - Bundle Deal"},
        "link": "https://www.example.com/deals/nintendo-switch-oled",
        "date": "2025-02-27T11:20:00",
        "content": {"rendered": "<p>Get the Nintendo Switch OLED Model with a vibrant 7-inch OLED screen, plus two games and a carrying case. The perfect gaming package for home or on-the-go play.</p><p>Regular price: $439.99</p><p>Deal price: $379.99</p><p>You save: $60.00</p>"},
        "excerpt": {"rendered": "<p>Nintendo Switch OLED Model bundle with two games and carrying case. Special bundle price of $379.99!</p>"},
    },
    {
        "id": 8,
        "title": {"rendered": "MacBook Air M3 - $200 Off"},
        "link": "https://www.example.com/deals/macbook-air-m3",
        "date": "2025-02-26T10:45:00",
        "content": {"rendered": "<p>The latest MacBook Air with M3 chip offers incredible performance and battery life in an ultra-thin design. Features a 13.6-inch Liquid Retina display, 8GB RAM, and 256GB SSD storage.</p><p>Regular price: $1,099.99</p><p>Deal price: $899.99</p><p>You save: $200.00</p>"},
        "excerpt": {"rendered": "<p>MacBook Air with M3 chip, 13.6-inch Liquid Retina display, and all-day battery life. Save $200 - now only $899.99!</p>"},
    },
    {
        "id": 9,
        "title": {"rendered": "Kindle Paperwhite Signature Edition - 30% Off"},
        "link": "https://www.example.com/deals/kindle-paperwhite",
        "date": "2025-02-27T09:15:00",
        "content": {"rendered": "<p>The Kindle Paperwhite Signature Edition features a 6.8-inch display, wireless charging, auto-adjusting front light, and 32GB storage. Perfect for reading anywhere, anytime.</p><p>Regular price: $189.99</p><p>Deal price: $132.99</p><p>You save: $57.00</p>"},
        "excerpt": {"rendered": "<p>Kindle Paperwhite Signature Edition with 6.8-inch display, wireless charging, and 32GB storage. Now 30% off at $132.99!</p>"},
    },
    {
        "id": 10,
        "title": {"rendered": "LG C3 65\" OLED 4K Smart TV - $500 Off"},
        "link": "https://www.example.com/deals/lg-c3-oled",
        "date": "2025-02-25T13:00:00",
        "content": {"rendered": "<p>Experience stunning picture quality with the LG C3 65\" OLED 4K Smart TV. Features self-lit OLED pixels, Dolby Vision, Dolby Atmos, and NVIDIA G-SYNC for gaming.</p><p>Regular price: $1,799.99</p><p>Deal price: $1,299.99</p><p>You save: $500.00</p>"},
        "excerpt": {"rendered": "<p>LG C3 65\" OLED 4K Smart TV with self-lit pixels and Dolby Vision. Save $500 - now only $1,299.99!</p>"},
    },
]

# Function to fetch deals from DealsFinders.com (from shopping_assistant.py)
def fetch_deals_data(url="https://www.dealsfinders.com/wp-json/wp/v2/posts", num_pages=2, per_page=100, use_sample_data=False, timeout=10):
    """
    Fetch deals data exclusively from the DealsFinders API or use sample data.

    Pages are requested one at a time until ``num_pages`` have been read or a
    page comes back with fewer than ``per_page`` items.

    Args:
        url: Posts endpoint to query.
        num_pages: Maximum number of pages to request.
        per_page: Number of posts requested per page.
        use_sample_data: When true, skip the network and return SAMPLE_DEALS.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        A list of raw deal objects. If a request fails after some pages were
        already fetched, those deals are returned; SAMPLE_DEALS is returned
        only when nothing at all could be fetched.
    """
    # If use_sample_data is True, return the sample deals
    if use_sample_data:
        print("Using sample deals data")
        return SAMPLE_DEALS

    # Add a user agent to avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    all_deals = []

    # Fetch from the DealsFinders API
    for page in range(1, num_pages + 1):
        failure = None
        deals = []
        try:
            # Let requests build the query string so a URL that already
            # carries parameters is still handled correctly.
            response = requests.get(
                url,
                params={"page": page, "per_page": per_page},
                headers=headers,
                timeout=timeout,
            )
            if response.status_code == 200:
                deals = response.json()
                if not isinstance(deals, list):
                    failure = f"Error fetching page {page} from DealsFinders API: unexpected response format"
            else:
                failure = f"Failed to fetch page {page} from DealsFinders API: {response.status_code}"
        except Exception as e:
            # Deliberately broad: this is a best-effort fetch with a fallback.
            failure = f"Error fetching page {page} from DealsFinders API: {str(e)}"

        if failure is not None:
            print(failure)
            if all_deals:
                # A later page failing (for example a request past the last
                # page) must not discard the real deals already fetched.
                print(f"Keeping the {len(all_deals)} deals fetched before the failure")
                break
            print("Falling back to sample deals data")
            return SAMPLE_DEALS

        all_deals.extend(deals)
        print(f"Fetched page {page} with {len(deals)} deals from DealsFinders API")

        # If we get fewer deals than requested, we've reached the end
        if len(deals) < per_page:
            print(f"Reached the end of available deals at page {page}")
            break

    # If no deals were fetched, use sample data
    if not all_deals:
        print("No deals fetched from API. Using sample deals data")
        return SAMPLE_DEALS

    return all_deals

# Function to process deals data (from shopping_assistant.py)
def process_deals_data(deals_data):
    """
    Flatten raw deal objects into plain dictionaries.

    Each output record carries 'id', 'title', 'link', 'date', plus 'content'
    and 'excerpt' with their HTML reduced to text. A deal that cannot be
    processed is reported on stdout and left out of the result.
    """
    def rendered(record, field):
        # Raw deals nest the HTML of a field under its 'rendered' key.
        return record.get(field, {}).get('rendered', '')

    structured = []

    for raw in deals_data:
        try:
            body_html = rendered(raw, 'content')
            summary_html = rendered(raw, 'excerpt')

            body_text = extract_text_from_html(body_html)
            summary_text = extract_text_from_html(summary_html)

            structured.append({
                'id': raw.get('id'),
                'title': rendered(raw, 'title'),
                'link': raw.get('link', ''),
                'date': raw.get('date', ''),
                'content': body_text,
                'excerpt': summary_text,
            })
        except Exception as err:
            print(f"Error processing deal: {str(err)}")

    return structured

# Product categories, each mapped to a natural-language description of what
# belongs in it. Insertion order defines the order of `categories` below.
category_descriptions = dict(
    electronics="Electronic devices like headphones, speakers, TVs, smartphones, and gadgets",
    computers="Laptops, desktops, computer parts, monitors, and computing accessories",
    mobile="Mobile phones, smartphones, phone cases, screen protectors, and chargers",
    audio="Headphones, earbuds, speakers, microphones, and audio equipment",
    clothing="Clothes, shirts, pants, dresses, and fashion items",
    footwear="Shoes, boots, sandals, slippers, and all types of footwear",
    home="Home decor, furniture, bedding, and household items",
    kitchen="Kitchen appliances, cookware, utensils, and kitchen gadgets",
    toys="Toys, games, and children's entertainment items",
    sports="Sports equipment, fitness gear, and outdoor recreation items",
    beauty="Beauty products, makeup, skincare, and personal care items",
    books="Books, e-books, audiobooks, and reading materials",
)

# Category names only, in the same order as the mapping above
categories = [*category_descriptions]

# Model setup, executed once at import time.
#
# Preferred path: a zero-shot classification pipeline (`classifier`) plus a
# sentence-embedding model (`sentence_model`, with `util` for similarity).
# If anything in that path raises, fall back to a sequence-classification
# model (`tokenizer` / `model`) loaded from the directory containing this file.
# `using_recommended_models` records which path succeeded; later code branches
# on it to decide which of these globals exist.
try:
    # 1. Load BART model for zero-shot classification
    from transformers import pipeline

    # Initialize the zero-shot classification pipeline
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    print("Using facebook/bart-large-mnli for classification")

    # 2. Load MPNet model for semantic search
    from sentence_transformers import SentenceTransformer, util

    # Load the sentence transformer model
    sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    print("Using sentence-transformers/all-mpnet-base-v2 for semantic search")

    # Pre-compute embeddings for category descriptions
    # NOTE(review): category_embeddings is not read anywhere in the visible
    # code - confirm whether it is still needed.
    category_texts = list(category_descriptions.values())
    category_embeddings = sentence_model.encode(category_texts, convert_to_tensor=True)

    # Using recommended models
    using_recommended_models = True
except Exception as e:
    # Fall back to local model if recommended models fail to load
    # (covers import errors as well as failures while loading the models).
    print(f"Error loading recommended models: {str(e)}")
    print("Falling back to local model")

    # The local model files are expected next to this source file.
    model_path = os.path.dirname(os.path.abspath(__file__))
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Load the local categories; this REPLACES the `categories` list defined
    # earlier. The fallback classifier indexes this list by output position.
    try:
        with open(os.path.join(model_path, "categories.json"), "r") as f:
            categories = json.load(f)
    except Exception as e:
        # No readable categories.json: use a built-in default list instead.
        print(f"Error loading categories: {str(e)}")
        categories = ["electronics", "clothing", "home", "kitchen", "toys", "other"]

    # Not using recommended models
    using_recommended_models = False

# File path for storing processed deals locally as JSON. The path is relative,
# so it is resolved against the process's current working directory.
DEALS_DATA_PATH = "deals_data.json"

# Function to fetch and save a large number of deals
def fetch_and_save_deals(max_deals=10000, per_page=100, timeout=10):
    """
    Fetch a large number of deals and save them to a local file.

    Pages are requested from the DealsFinders posts endpoint until
    ``max_deals`` have been collected, a short page signals the end of the
    data, a request fails, or 100 pages have been read. Whatever was collected
    is processed with ``process_deals_data`` and written to DEALS_DATA_PATH.

    Args:
        max_deals: Upper bound on the number of deals kept and saved.
        per_page: Number of posts requested per page; must be positive.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The list of processed deals (also returned when saving fails).

    Raises:
        ValueError: If ``per_page`` is not a positive number.
    """
    if per_page <= 0:
        raise ValueError("per_page must be a positive integer")

    print(f"Fetching up to {max_deals} deals...")

    # Ceiling division, capped at 100 pages max
    num_pages = min((max_deals + per_page - 1) // per_page, 100)

    # Add a user agent to avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
    }
    all_deals = []

    # Fetch from the DealsFinders API
    for page in range(1, num_pages + 1):
        try:
            response = requests.get(
                "https://www.dealsfinders.com/wp-json/wp/v2/posts",
                params={"page": page, "per_page": per_page},
                headers=headers,
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"Failed to fetch page {page} from DealsFinders API: {response.status_code}")
                break

            deals = response.json()
        except Exception as e:
            # Best-effort: keep whatever earlier pages produced.
            print(f"Error fetching page {page} from DealsFinders API: {str(e)}")
            break

        all_deals.extend(deals)
        print(f"Fetched page {page} with {len(deals)} deals from DealsFinders API")

        # If we've reached the maximum number of deals, stop
        if len(all_deals) >= max_deals:
            print(f"Reached the maximum number of deals ({max_deals})")
            break

        # If we get fewer deals than requested, we've reached the end
        if len(deals) < per_page:
            print(f"Reached the end of available deals at page {page}")
            break

    # Trim after the loop so the limit holds on every exit path, including
    # a short final page that pushes the total past max_deals.
    all_deals = all_deals[:max(max_deals, 0)]

    # Process the deals
    processed_deals = process_deals_data(all_deals)

    # Save the deals to a local file
    try:
        with open(DEALS_DATA_PATH, "w", encoding="utf-8") as f:
            json.dump(processed_deals, f)
        print(f"Saved {len(processed_deals)} deals to {DEALS_DATA_PATH}")
    except Exception as e:
        print(f"Error saving deals to file: {str(e)}")

    return processed_deals

# Function to load deals from the local file
def load_deals_from_file():
    """
    Read previously saved deals from DEALS_DATA_PATH.

    Returns the decoded JSON content, or None when the file is missing or
    cannot be read or parsed (the reason is printed in either case).
    """
    try:
        # Guard clause: nothing to load if the file was never written.
        if not os.path.exists(DEALS_DATA_PATH):
            print(f"Deals file {DEALS_DATA_PATH} does not exist")
            return None

        with open(DEALS_DATA_PATH, "r") as handle:
            stored_deals = json.load(handle)
        print(f"Loaded {len(stored_deals)} deals from {DEALS_DATA_PATH}")
        return stored_deals
    except Exception as err:
        print(f"Error loading deals from file: {str(err)}")
        return None

# Module-wide cache of processed deals shared by every query
deals_cache = None

# Populate the cache at import time: prefer the saved file, otherwise fall
# back to the bundled sample deals.
try:
    deals_cache = load_deals_from_file()

    # A missing file and an empty file are treated the same way
    cache_is_empty = deals_cache is None or len(deals_cache) == 0
    if cache_is_empty:
        print("No deals found in local file. Using sample data...")
        deals_cache = process_deals_data(SAMPLE_DEALS)

    loaded_count = len(deals_cache) if deals_cache else 0
    print(f"Initialized with {loaded_count} deals")
except Exception as init_error:
    print(f"Error initializing deals cache: {str(init_error)}")
    # Anything unexpected: start from the sample deals instead
    deals_cache = process_deals_data(SAMPLE_DEALS)
    print(f"Initialized with {len(deals_cache)} sample deals")

# Extra search terms the keyword fallback adds for each top category.
_CATEGORY_SEARCH_TERMS = {
    "electronics": ['electronic', 'device', 'gadget', 'tech', 'technology'],
    "kitchen": ['appliance', 'cookware', 'utensil', 'blender', 'mixer', 'toaster', 'microwave', 'oven'],
    "home": ['furniture', 'decor', 'decoration', 'bedding', 'household'],
    "clothing": ['clothes', 'shirt', 'pants', 'dress', 'fashion', 'wear', 'apparel'],
    "toys": ['game', 'play', 'children', 'kid', 'kids', 'fun'],
}

# For electronics queries only: (trigger substrings, extra terms). Only the
# first group whose trigger occurs in the query is applied.
_ELECTRONICS_SUBTOPIC_TERMS = (
    (('headphone', 'headphones'), ['earbuds', 'earphones', 'earpods', 'airpods', 'audio', 'bluetooth', 'wireless']),
    (('laptop', 'computer'), ['notebook', 'macbook', 'chromebook', 'pc']),
    (('tv', 'television'), ['smart tv', 'roku', 'streaming']),
)

# Embeddings of the most recently searched deals list, so repeated queries
# against an unchanged cache do not re-encode every deal.
_deal_embedding_memo = {"deals": None, "count": 0, "embeddings": None}


def _predict_categories(text):
    """Return the (category, score) pairs that pass the active model's threshold."""
    if using_recommended_models:
        # Using BART for zero-shot classification
        prediction = classifier(text, categories, multi_label=True)
        # Only the first three labels returned are considered; the lower
        # threshold suits zero-shot scores.
        leading = zip(prediction['labels'][:3], prediction['scores'][:3])
        return [(category, score) for category, score in leading if score > 0.1]

    # Using the original classification model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.sigmoid(outputs.logits)

    # Threshold for multi-label classification, then best score first
    selected = [
        (categories[i], score.item())
        for i, score in enumerate(predictions[0])
        if score > 0.5
    ]
    selected.sort(key=lambda pair: pair[1], reverse=True)
    return selected


def _format_classification(text, top_categories):
    """Render the classification outcome as the opening Markdown section."""
    if not top_categories:
        return f"No categories found for '{text}'. Please try a different query.\n\n"

    lines = [f"Top categories for '{text}':\n\n"]
    lines.extend(f"- {category}: {score:.4f}\n" for category, score in top_categories)
    lines.append(
        f"\nBased on your query, I would recommend looking for deals in the **{top_categories[0][0]}** category.\n\n"
    )
    return "".join(lines)


def _embed_deals(deals):
    """Return embeddings for *deals*, re-encoding only when the list changed."""
    memo = _deal_embedding_memo
    if memo["deals"] is not deals or memo["count"] != len(deals):
        # Combine title and excerpt for better matching
        deal_texts = [f"{deal['title']} {deal['excerpt']}" for deal in deals]
        memo["embeddings"] = sentence_model.encode(deal_texts, convert_to_tensor=True)
        memo["deals"] = deals
        memo["count"] = len(deals)
    return memo["embeddings"]


def _semantic_search(text, deals, limit=5):
    """Return up to *limit* deals ranked by embedding similarity to *text*."""
    if not deals:
        return []

    query_embedding = sentence_model.encode(text, convert_to_tensor=True)
    similarities = util.cos_sim(query_embedding, _embed_deals(deals))[0]
    top_indices = torch.topk(similarities, k=min(limit, len(deals))).indices
    return [deals[index] for index in top_indices.tolist()]


def _field_score(term, fields, weights):
    """Sum the weights of the fields in which *term* occurs as a substring."""
    return sum(weight for field, weight in zip(fields, weights) if term in field)


def _keyword_search(text, top_category, deals, limit=5):
    """Return up to *limit* deals ranked by category-aware keyword matching."""
    query_lower = text.lower()
    query_terms = query_lower.split()

    # Add category-specific terms
    extra_terms = list(_CATEGORY_SEARCH_TERMS.get(top_category, ()))
    if top_category == "electronics":
        for triggers, terms in _ELECTRONICS_SUBTOPIC_TERMS:
            if any(trigger in query_lower for trigger in triggers):
                extra_terms.extend(terms)
                break

    # Terms already in the query are scored at the higher weight only
    typed_terms = set(query_terms)
    extra_terms = [term for term in extra_terms if term not in typed_terms]
    category_term = top_category.lower() if top_category else None

    scored_deals = []
    for deal in deals:
        fields = (deal['title'].lower(), deal['content'].lower(), deal['excerpt'].lower())

        # Original query terms weigh more than expanded terms
        score = sum(_field_score(term, fields, (10, 3, 3)) for term in query_terms)
        score += sum(_field_score(term, fields, (5, 1, 1)) for term in extra_terms)

        # Boost score for deals matching the top category
        if category_term:
            score += _field_score(category_term, fields, (15, 5, 5))

        # Keep only deals with some relevance
        if score > 0:
            scored_deals.append((deal, score))

    # Sort by score (descending); the sort is stable, so ties keep input order
    scored_deals.sort(key=lambda pair: pair[1], reverse=True)
    return [deal for deal, _ in scored_deals[:limit]]


def classify_text(text, fetch_deals=True):
    """
    Classify a shopping query and, optionally, list matching deals.

    Args:
        text: The user's shopping query. A missing or blank query returns a
            short prompt instead of being classified.
        fetch_deals: When true, append a section with up to five deals from
            the shared ``deals_cache``.

    Returns:
        A Markdown string: the categories that passed the score threshold,
        followed (if requested) by a numbered list of deal links, a "no deals"
        notice, or an error line if the deal lookup raised.
    """
    global deals_cache

    # Nothing sensible can be classified or searched for a blank query
    if not text or not text.strip():
        return "Please enter a shopping query.\n\n"

    top_categories = _predict_categories(text)
    result = _format_classification(text, top_categories)

    if not fetch_deals:
        return result

    result += "## Relevant Deals from DealsFinders.com\n\n"

    try:
        # Fetch deals data if not already cached
        if deals_cache is None:
            # Use sample data for reliability in the hosted environment
            deals_data = fetch_deals_data(num_pages=2, use_sample_data=True)
            deals_cache = process_deals_data(deals_data)

        if using_recommended_models:
            # Using MPNet for semantic search
            relevant_deals = _semantic_search(text, deals_cache)
        else:
            top_category = top_categories[0][0] if top_categories else None
            relevant_deals = _keyword_search(text, top_category, deals_cache)

        if relevant_deals:
            result += "".join(
                f"{i}. [{deal['title']}]({deal['link']})\n\n"
                for i, deal in enumerate(relevant_deals, 1)
            )
        else:
            result += "No specific deals found for your query. Try a different search term or browse the recommended category.\n\n"
    except Exception as e:
        # Boundary of the UI callback: report the problem in the output
        # rather than letting the request fail.
        result += f"Error fetching deals: {str(e)}\n\n"

    return result

# Assemble the Gradio UI piece by piece, then wire it to classify_text
_query_input = gr.Textbox(
    lines=2,
    placeholder="Enter your shopping query here...",
    label="Shopping Query"
)

_fetch_deals_input = gr.Checkbox(
    label="Fetch Deals",
    value=True,
    info="Check to fetch and display deals from DealsFinders.com"
)

_demo_description = """
    This demo shows how to use the Shopping Assistant model to classify shopping queries into categories and find relevant deals.
    Enter a shopping query below to see which categories it belongs to and find deals from DealsFinders.com.
    
    Examples:
    - "I'm looking for headphones"
    - "Do you have any kitchen appliance deals?"
    - "Show me the best laptop deals"
    - "I need a new smart TV"
    """

# Each example row is [query, fetch-deals checkbox value]
_demo_examples = [
    ["I'm looking for headphones", True],
    ["Do you have any kitchen appliance deals?", True],
    ["Show me the best laptop deals", True],
    ["I need a new smart TV", True],
    ["headphone deals", True]
]

demo = gr.Interface(
    fn=classify_text,
    inputs=[_query_input, _fetch_deals_input],
    outputs=gr.Markdown(label="Results"),
    title="Shopping Assistant",
    description=_demo_description,
    examples=_demo_examples,
    theme=gr.themes.Soft()
)

# Start the server only when this file is run directly
if __name__ == "__main__":
    demo.launch()