selvaonline committed
Commit b1b6f63 · verified · 1 Parent(s): e667020

Upload app.py with huggingface_hub

Files changed (1):
  app.py +139 -98
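
The commit message indicates the upload was done programmatically. For reference, a minimal sketch of how such a push is typically made with the huggingface_hub client — the repo id, repo type, and token handling here are assumptions for illustration, not details recorded in this commit:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token stored by `huggingface-cli login`
api.upload_file(
    path_or_fileobj="app.py",                   # local file to push
    path_in_repo="app.py",                      # destination path inside the repo
    repo_id="selvaonline/shopping-assistant",   # assumed from the model id in app.py
    repo_type="space",                          # assumed: app.py looks like a Space app
    commit_message="Upload app.py with huggingface_hub",
)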
app.py CHANGED
@@ -83,25 +83,50 @@ def process_deals_data(deals_data):
 
     return processed_deals
 
-# Load the e-commerce specific model and tokenizer
+# Define product categories
+category_descriptions = {
+    "electronics": "Electronic devices like headphones, speakers, TVs, smartphones, and gadgets",
+    "computers": "Laptops, desktops, computer parts, monitors, and computing accessories",
+    "mobile": "Mobile phones, smartphones, phone cases, screen protectors, and chargers",
+    "audio": "Headphones, earbuds, speakers, microphones, and audio equipment",
+    "clothing": "Clothes, shirts, pants, dresses, and fashion items",
+    "footwear": "Shoes, boots, sandals, slippers, and all types of footwear",
+    "home": "Home decor, furniture, bedding, and household items",
+    "kitchen": "Kitchen appliances, cookware, utensils, and kitchen gadgets",
+    "toys": "Toys, games, and children's entertainment items",
+    "sports": "Sports equipment, fitness gear, and outdoor recreation items",
+    "beauty": "Beauty products, makeup, skincare, and personal care items",
+    "books": "Books, e-books, audiobooks, and reading materials"
+}
+
+# List of categories
+categories = list(category_descriptions.keys())
+
+# Try to load the recommended models
 try:
-    # Try to load the e-commerce BERT model
-    tokenizer = AutoTokenizer.from_pretrained("prithivida/ecommerce-bert-base-uncased")
-    model = AutoModelForSequenceClassification.from_pretrained("prithivida/ecommerce-bert-base-uncased")
+    # 1. Load BART model for zero-shot classification
+    from transformers import pipeline
+
+    # Initialize the zero-shot classification pipeline
+    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+    print("Using facebook/bart-large-mnli for classification")
+
+    # 2. Load MPNet model for semantic search
+    from sentence_transformers import SentenceTransformer, util
 
-    # E-commerce BERT categories
-    categories = [
-        "electronics", "computers", "mobile_phones", "accessories",
-        "clothing", "footwear", "watches", "jewelry",
-        "home", "kitchen", "furniture", "decor",
-        "beauty", "personal_care", "health", "wellness",
-        "toys", "games", "sports", "outdoors",
-        "books", "stationery", "music", "movies"
-    ]
-    print("Using e-commerce BERT model")
+    # Load the sentence transformer model
+    sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+    print("Using sentence-transformers/all-mpnet-base-v2 for semantic search")
+
+    # Pre-compute embeddings for category descriptions
+    category_texts = list(category_descriptions.values())
+    category_embeddings = sentence_model.encode(category_texts, convert_to_tensor=True)
+
+    # Using recommended models
+    using_recommended_models = True
 except Exception as e:
-    # Fall back to local model if e-commerce BERT fails to load
-    print(f"Error loading e-commerce BERT model: {str(e)}")
+    # Fall back to local model if recommended models fail to load
+    print(f"Error loading recommended models: {str(e)}")
     print("Falling back to local model")
 
     model_id = "selvaonline/shopping-assistant"
@@ -117,6 +142,9 @@ except Exception as e:
     except Exception as e:
         print(f"Error loading categories: {str(e)}")
         categories = ["electronics", "clothing", "home", "kitchen", "toys", "other"]
+
+    # Not using recommended models
+    using_recommended_models = False
 
 # Global variable to store deals data
 deals_cache = None
@@ -127,46 +155,37 @@ def classify_text(text, fetch_deals=True):
     """
    global deals_cache
 
-    # Prepare the input for classification
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
-
-    # Get the model prediction
-    with torch.no_grad():
-        outputs = model(**inputs)
+    # Get the top categories based on the model type
+    if using_recommended_models:
+        # Using BART for zero-shot classification
+        result = classifier(text, categories, multi_label=True)
 
-    # Handle different model output formats
-    if hasattr(outputs, 'logits'):
-        # For models that return logits
-        if outputs.logits.shape[1] == len(categories):
-            # Multi-label classification
-            predictions = torch.sigmoid(outputs.logits)
-
-            # Get the top categories
-            top_categories = []
-            for i, score in enumerate(predictions[0]):
-                if score > 0.3:  # Lower threshold for e-commerce model
-                    top_categories.append((categories[i], score.item()))
-        else:
-            # Single-label classification
-            probabilities = torch.softmax(outputs.logits, dim=1)
-            values, indices = torch.topk(probabilities, 3)
-
-            top_categories = []
-            for i, idx in enumerate(indices[0]):
-                if idx < len(categories):
-                    top_categories.append((categories[idx.item()], values[0][i].item()))
-    else:
-        # Fallback for other model formats
-        predictions = torch.sigmoid(outputs[0])
+        # Extract categories and scores
+        top_categories = []
+        for i, (category, score) in enumerate(zip(result['labels'], result['scores'])):
+            if score > 0.1:  # Lower threshold for zero-shot classification
+                top_categories.append((category, score))
 
-        # Get the top categories
-        top_categories = []
-        for i, score in enumerate(predictions[0]):
-            if score > 0.5:
-                top_categories.append((categories[i], score.item()))
-
-    # Sort by score
-    top_categories.sort(key=lambda x: x[1], reverse=True)
+            # Limit to top 3 categories
+            if i >= 2:
+                break
+    else:
+        # Using the original classification model
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+
+        # Get the model prediction
+        with torch.no_grad():
+            outputs = model(**inputs)
+        predictions = torch.sigmoid(outputs.logits)
+
+        # Get the top categories
+        top_categories = []
+        for i, score in enumerate(predictions[0]):
+            if score > 0.5:  # Threshold for multi-label classification
+                top_categories.append((categories[i], score.item()))
+
+        # Sort by score
+        top_categories.sort(key=lambda x: x[1], reverse=True)
 
     # Format the classification results
     if top_categories:
@@ -188,57 +207,79 @@ def classify_text(text, fetch_deals=True):
         deals_data = fetch_deals_data(num_pages=2)  # Limit to 2 pages for faster response
         deals_cache = process_deals_data(deals_data)
 
-        # Extract query terms and expand with related terms
-        query_terms = text.lower().split()
-        expanded_terms = list(query_terms)
-
-        # Add related terms based on the query
-        if any(term in text.lower() for term in ['headphone', 'headphones']):
-            expanded_terms.extend(['earbuds', 'earphones', 'earpods', 'airpods', 'audio', 'bluetooth', 'wireless'])
-        elif any(term in text.lower() for term in ['laptop', 'computer']):
-            expanded_terms.extend(['notebook', 'macbook', 'chromebook', 'pc'])
-        elif any(term in text.lower() for term in ['tv', 'television']):
-            expanded_terms.extend(['smart tv', 'roku', 'streaming'])
-        elif any(term in text.lower() for term in ['kitchen', 'appliance']):
-            expanded_terms.extend(['mixer', 'blender', 'toaster', 'microwave', 'oven'])
-
-        # Score deals based on relevance to the query
-        scored_deals = []
-        for deal in deals_cache:
-            title = deal['title'].lower()
-            content = deal['content'].lower()
-            excerpt = deal['excerpt'].lower()
+        # Using MPNet for semantic search if available
+        if using_recommended_models:
+            # Create deal texts for semantic search
+            deal_texts = []
+            for deal in deals_cache:
+                # Combine title and excerpt for better matching
+                deal_text = f"{deal['title']} {deal['excerpt']}"
+                deal_texts.append(deal_text)
+
+            # Encode the query and deals
+            query_embedding = sentence_model.encode(text, convert_to_tensor=True)
+            deal_embeddings = sentence_model.encode(deal_texts, convert_to_tensor=True)
 
-            score = 0
+            # Calculate semantic similarity
+            similarities = util.cos_sim(query_embedding, deal_embeddings)[0]
 
-            # Check original query terms (higher weight)
-            for term in query_terms:
-                if term in title:
-                    score += 10
-                if term in content:
-                    score += 3
-                if term in excerpt:
-                    score += 3
+            # Get top 5 most similar deals
+            top_indices = torch.topk(similarities, k=min(5, len(deals_cache))).indices
 
-            # Check expanded terms (lower weight)
-            for term in expanded_terms:
-                if term not in query_terms:  # Skip original terms
+            # Extract the relevant deals
+            relevant_deals = [deals_cache[idx] for idx in top_indices]
+        else:
+            # Fallback to keyword-based search
+            query_terms = text.lower().split()
+            expanded_terms = list(query_terms)
+
+            # Add related terms based on the query
+            if any(term in text.lower() for term in ['headphone', 'headphones']):
+                expanded_terms.extend(['earbuds', 'earphones', 'earpods', 'airpods', 'audio', 'bluetooth', 'wireless'])
+            elif any(term in text.lower() for term in ['laptop', 'computer']):
+                expanded_terms.extend(['notebook', 'macbook', 'chromebook', 'pc'])
+            elif any(term in text.lower() for term in ['tv', 'television']):
+                expanded_terms.extend(['smart tv', 'roku', 'streaming'])
+            elif any(term in text.lower() for term in ['kitchen', 'appliance']):
+                expanded_terms.extend(['mixer', 'blender', 'toaster', 'microwave', 'oven'])
+
+            # Score deals based on relevance to the query
+            scored_deals = []
+            for deal in deals_cache:
+                title = deal['title'].lower()
+                content = deal['content'].lower()
+                excerpt = deal['excerpt'].lower()
+
+                score = 0
+
+                # Check original query terms (higher weight)
+                for term in query_terms:
                 if term in title:
-                    score += 5
+                    score += 10
                 if term in content:
-                    score += 1
+                    score += 3
                 if term in excerpt:
-                    score += 1
+                    score += 3
+
+                # Check expanded terms (lower weight)
+                for term in expanded_terms:
+                    if term not in query_terms:  # Skip original terms
+                        if term in title:
+                            score += 5
+                        if term in content:
+                            score += 1
+                        if term in excerpt:
+                            score += 1
+
+                # Add to scored deals if it has any relevance
+                if score > 0:
+                    scored_deals.append((deal, score))
 
-            # Add to scored deals if it has any relevance
-            if score > 0:
-                scored_deals.append((deal, score))
-
-        # Sort by score (descending)
-        scored_deals.sort(key=lambda x: x[1], reverse=True)
-
-        # Extract the deals from the scored list
-        relevant_deals = [deal for deal, _ in scored_deals[:5]]
+            # Sort by score (descending)
+            scored_deals.sort(key=lambda x: x[1], reverse=True)
+
+            # Extract the deals from the scored list
+            relevant_deals = [deal for deal, _ in scored_deals[:5]]
 
         if relevant_deals:
            for i, deal in enumerate(relevant_deals, 1):
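
A note on the classification path added above: facebook/bart-large-mnli is an NLI model, and the zero-shot pipeline scores each candidate label as an entailment hypothesis. With multi_label=True each label is scored independently, and the returned labels come pre-sorted by descending score — which is why the `if i >= 2: break` in the diff amounts to keeping at most the top 3 labels above the 0.1 threshold. A standalone sketch (the query string is invented for illustration):

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

result = classifier(
    "looking for wireless headphones under $100",   # hypothetical user query
    ["electronics", "audio", "kitchen", "books"],   # candidate labels
    multi_label=True,  # score labels independently instead of softmax across them
)
# result["labels"] is sorted by descending score, so filtering by the
# threshold and slicing mirrors the app's top-3 logic.
top = [(l, s) for l, s in zip(result["labels"], result["scores"]) if s > 0.1][:3]
print(top)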
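The deal-retrieval path is standard embedding search with sentence-transformers: encode the query and the deal texts, rank by cosine similarity, keep the top k. A minimal sketch using the same model, with hypothetical deal texts:

import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

deals = [  # hypothetical "title + excerpt" strings, as built in the app
    "Sony WH-1000XM4 wireless headphones 40% off",
    "KitchenAid stand mixer deal of the day",
    "Dell XPS 13 laptop clearance sale",
]

query_embedding = model.encode("noise cancelling headphones", convert_to_tensor=True)
deal_embeddings = model.encode(deals, convert_to_tensor=True)

similarities = util.cos_sim(query_embedding, deal_embeddings)[0]  # one score per deal
top_indices = torch.topk(similarities, k=min(5, len(deals))).indices
print([deals[int(i)] for i in top_indices])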
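Finally, the keyword fallback kept in the else: branch reduces to a weighted term-match score — original query terms weigh 10/3/3 across title/content/excerpt, expansion terms 5/1/1. The same logic as a standalone helper (the function name is mine; the weights come from the diff):

def keyword_score(deal, query_terms, expanded_terms):
    """Weighted keyword match: original query terms count more than expansions."""
    title = deal["title"].lower()
    content = deal["content"].lower()
    excerpt = deal["excerpt"].lower()

    score = 0
    for term in query_terms:        # original terms: 10 / 3 / 3
        score += 10 * (term in title) + 3 * (term in content) + 3 * (term in excerpt)
    for term in expanded_terms:     # expansion terms: 5 / 1 / 1
        if term not in query_terms:
            score += 5 * (term in title) + 1 * (term in content) + 1 * (term in excerpt)
    return score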