from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import torch
import numpy as np
from typing import Dict, List


class MultiModelAnalyzer:
    def __init__(self):
        # Initialize different models for different tasks.
        # 1. Category understanding model
        self.category_model = AutoModelForSequenceClassification.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )
        self.category_tokenizer = AutoTokenizer.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )

        # 2. Semantic understanding model
        self.semantic_model = SentenceTransformer("all-mpnet-base-v2")

        # 3. Feature extraction model
        self.feature_model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-multilingual-uncased"
        )
        self.feature_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-multilingual-uncased"
        )

    def analyze_text(self, text: str) -> Dict:
        """Combine analysis from all models."""
        category = self._predict_category(text)
        embedding = self._get_semantic_embedding(text)
        features = self._extract_features(text)
        return {
            "category": category,
            "embedding": embedding,
            "features": features,
        }

    def _predict_category(self, text: str) -> int:
        """Predict the product category id."""
        inputs = self.category_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = self.category_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        return predictions.argmax().item()

    def _get_semantic_embedding(self, text: str) -> np.ndarray:
        """Get a unit-length semantic embedding of the text, so a dot
        product between two embeddings equals their cosine similarity."""
        return self.semantic_model.encode(text, normalize_embeddings=True)

    def _extract_features(self, text: str) -> List[str]:
        """Extract relevant features from text."""
        inputs = self.feature_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = self.feature_model(**inputs)
        predictions = outputs.logits.argmax(dim=-1)
        return self._convert_predictions_to_features(predictions, inputs)

    def _convert_predictions_to_features(
        self, predictions: torch.Tensor, inputs: Dict
    ) -> List[str]:
        """Turn token-level predictions back into feature strings.

        Tokens whose predicted label is non-zero (i.e. not "outside") are
        kept; consecutive feature tokens are merged into one feature.
        """
        tokens = self.feature_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = predictions[0].tolist()
        features, current = [], []
        for token, label in zip(tokens, labels):
            if token in self.feature_tokenizer.all_special_tokens:
                continue
            if label != 0:
                # Re-attach WordPiece continuations to the previous token.
                if token.startswith("##") and current:
                    current[-1] += token[2:]
                else:
                    current.append(token)
            elif current:
                features.append(" ".join(current))
                current = []
        if current:
            features.append(" ".join(current))
        return features


class ModelTrainer:
    def __init__(self, model_analyzer: MultiModelAnalyzer):
        self.analyzer = model_analyzer

    def prepare_training_data(self, product_data: List[Dict]) -> Dataset:
        """Prepare data for fine-tuning."""
        training_data = [
            {
                "text": product["description"],
                "category": product["category"],
                "features": product["features"],
                "price": product["price"],
            }
            for product in product_data
        ]
        return Dataset.from_list(training_data)

    def fine_tune_category_model(self, training_data: Dataset):
        """Fine-tune the category prediction model.

        The "category" column is assumed to already hold integer class ids
        matching the model's classification head.
        """
        def tokenize(batch):
            return self.analyzer.category_tokenizer(
                batch["text"], truncation=True, max_length=512
            )

        tokenized = training_data.map(tokenize, batched=True)
        tokenized = tokenized.rename_column("category", "labels")
        tokenized = tokenized.remove_columns(["text", "features", "price"])

        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=10,
        )

        # Passing the tokenizer makes Trainer pad each batch dynamically.
        trainer = Trainer(
            model=self.analyzer.category_model,
            args=training_args,
            train_dataset=tokenized,
            tokenizer=self.analyzer.category_tokenizer,
        )
        trainer.train()

    def fine_tune_feature_model(self, training_data: Dataset):
        """Fine-tune the feature extraction model.

        Token classification needs one label per wordpiece, so the feature
        phrases are aligned to token character offsets: tokens covered by a
        feature phrase get label 1, all other tokens label 0, and special
        tokens -100 so the loss ignores them.
        """
        def tokenize_and_align(batch):
            encodings = self.analyzer.feature_tokenizer(
                batch["text"],
                truncation=True,
                max_length=512,
                return_offsets_mapping=True,
            )
            all_labels = []
            for text, features, offsets in zip(
                batch["text"], batch["features"], encodings["offset_mapping"]
            ):
                # Character spans occupied by feature phrases in this text.
                spans = []
                for feature in features:
                    start = text.lower().find(feature.lower())
                    if start != -1:
                        spans.append((start, start + len(feature)))
                labels = []
                for token_start, token_end in offsets:
                    if token_start == token_end:  # special token
                        labels.append(-100)
                    elif any(s <= token_start < e for s, e in spans):
                        labels.append(1)
                    else:
                        labels.append(0)
                all_labels.append(labels)
            encodings.pop("offset_mapping")
            encodings["labels"] = all_labels
            return encodings

        tokenized = training_data.map(
            tokenize_and_align,
            batched=True,
            remove_columns=training_data.column_names,
        )

        training_args = TrainingArguments(
            output_dir="./results_feature",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir="./logs_feature",
            logging_steps=10,
        )

        # Token-classification batches need a collator that also pads labels.
        trainer = Trainer(
            model=self.analyzer.feature_model,
            args=training_args,
            train_dataset=tokenized,
            tokenizer=self.analyzer.feature_tokenizer,
            data_collator=DataCollatorForTokenClassification(
                self.analyzer.feature_tokenizer
            ),
        )
        trainer.train()


class ProductRecommender:
    def __init__(self):
        self.model_analyzer = MultiModelAnalyzer()
        self.trainer = ModelTrainer(self.model_analyzer)

    def train_on_product_data(self, product_data: List[Dict]):
        """Train models on product data."""
        training_dataset = self.trainer.prepare_training_data(product_data)
        self.trainer.fine_tune_category_model(training_dataset)
        self.trainer.fine_tune_feature_model(training_dataset)

    def get_recommendations(self, query: str, product_database: List[Dict]) -> List[Dict]:
        """Get product recommendations."""
        # Analyze the query once, then score every product against it.
        query_analysis = self.model_analyzer.analyze_text(query)

        matches = []
        for product in product_database:
            product_analysis = self.model_analyzer.analyze_text(product["description"])
            similarity = self._calculate_similarity(query_analysis, product_analysis)
            matches.append({"product": product, "similarity": similarity})

        # Sort by similarity, best match first, and return the top 5.
        matches.sort(key=lambda x: x["similarity"], reverse=True)
        return [match["product"] for match in matches[:5]]

    def _calculate_similarity(self, query_analysis: Dict, product_analysis: Dict) -> float:
        """Calculate similarity between query and product."""
        # Exact category match contributes 0 or 1.
        category_match = float(
            query_analysis["category"] == product_analysis["category"]
        )

        # Embeddings are unit-normalized, so the dot product is the cosine
        # similarity (range [-1, 1]).
        embedding_similarity = float(
            np.dot(query_analysis["embedding"], product_analysis["embedding"])
        )

        # Jaccard overlap of extracted features keeps this term in [0, 1],
        # on the same scale as the other two factors.
        query_features = set(query_analysis["features"])
        product_features = set(product_analysis["features"])
        union = query_features | product_features
        feature_overlap = (
            len(query_features & product_features) / len(union) if union else 0.0
        )

        # Weighted combination of the three signals.
        return (
            0.4 * category_match
            + 0.4 * embedding_similarity
            + 0.2 * feature_overlap
        )
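

# --- Usage sketch (illustrative only) --------------------------------------
# A minimal end-to-end example of the pipeline above. The product records
# below are made-up sample data, and the integer category ids are an
# assumption about how categories are encoded; adapt both to your catalog.
if __name__ == "__main__":
    sample_products = [
        {
            "description": "Wireless over-ear headphones with active noise cancellation",
            "category": 0,
            "features": ["wireless", "noise cancellation"],
            "price": 199.0,
        },
        {
            "description": "Mechanical keyboard with RGB backlight and hot-swappable switches",
            "category": 1,
            "features": ["mechanical", "RGB backlight"],
            "price": 89.0,
        },
    ]

    recommender = ProductRecommender()

    # Fine-tuning is optional for a quick smoke test; the pretrained models
    # can be queried directly. Uncomment to train on the sample data.
    # recommender.train_on_product_data(sample_products)

    recommendations = recommender.get_recommendations(
        "noise cancelling headphones", sample_products
    )
    for product in recommendations:
        print(product["description"])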