Spaces:
Sleeping
Sleeping
from transformers import ( | |
AutoModelForSequenceClassification, | |
AutoTokenizer, | |
AutoModelForTokenClassification, | |
TrainingArguments, | |
Trainer | |
) | |
from sentence_transformers import SentenceTransformer | |
from datasets import Dataset | |
import torch | |
import numpy as np | |
from typing import Dict, List, Optional | |
import json | |
class MultiModelAnalyzer:
    """Run three pretrained models over a piece of text.

    The analyzer owns a sequence-classification model (category), a
    sentence-embedding model (semantics) and a token-classification model
    (features). All of them are loaded once in ``__init__``.
    """

    def __init__(self):
        """Load every model and tokenizer used by the analyzer."""
        # 1. Category understanding model
        self.category_model = AutoModelForSequenceClassification.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )
        self.category_tokenizer = AutoTokenizer.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )
        # 2. Semantic understanding model
        self.semantic_model = SentenceTransformer('all-mpnet-base-v2')
        # 3. Feature extraction model
        self.feature_model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-multilingual-uncased"
        )
        self.feature_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-multilingual-uncased"
        )

    def analyze_text(self, text: str) -> Dict:
        """Combine the output of all three models for ``text``.

        Returns:
            A dict with the keys ``"category"`` (predicted class index),
            ``"embedding"`` (output of the sentence encoder) and
            ``"features"`` (list of token strings).
        """
        return {
            "category": self._predict_category(text),
            "embedding": self._get_semantic_embedding(text),
            "features": self._extract_features(text),
        }

    def _predict_category(self, text: str) -> int:
        """Return the index of the highest-scoring category for ``text``.

        The value is the raw class index produced by ``argmax``; it is not
        mapped to a label name.
        """
        inputs = self.category_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Fine-tuning leaves a model in training mode; switch to eval so
        # dropout does not make predictions non-deterministic.
        self.category_model.eval()
        # Inference only: no need to build an autograd graph.
        with torch.no_grad():
            outputs = self.category_model(**inputs)
        # argmax over raw logits equals argmax over softmax probabilities,
        # so the softmax is unnecessary here.
        return int(outputs.logits[0].argmax().item())

    def _get_semantic_embedding(self, text: str) -> np.ndarray:
        """Return the sentence embedding of ``text``."""
        return self.semantic_model.encode(text)

    def _extract_features(self, text: str) -> List[str]:
        """Return the tokens of ``text`` that the feature model tags."""
        inputs = self.feature_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        self.feature_model.eval()
        with torch.no_grad():
            outputs = self.feature_model(**inputs)
        # One predicted label id per token position.
        predictions = outputs.logits.argmax(dim=2)
        return self._convert_predictions_to_features(predictions, inputs)

    def _convert_predictions_to_features(self, predictions, inputs) -> List[str]:
        """Turn per-token label ids into a list of feature tokens.

        Args:
            predictions: Label ids, indexed as ``[batch, position]``; only
                the first batch row is read.
            inputs: Tokenizer output; ``inputs["input_ids"]`` is read with
                the same indexing.

        Returns:
            The token strings whose predicted label id is non-zero, with
            the tokenizer's special tokens removed. Word pieces are
            returned as produced by the tokenizer (not merged).

        NOTE(review): this assumes label id 0 means "not a feature" --
        confirm against the label scheme used when fine-tuning.
        """
        token_ids = inputs["input_ids"][0].tolist()
        label_ids = predictions[0].tolist()
        tokens = self.feature_tokenizer.convert_ids_to_tokens(token_ids)
        special_tokens = set(self.feature_tokenizer.all_special_tokens)
        return [
            token
            for token, label_id in zip(tokens, label_ids)
            if label_id != 0 and token not in special_tokens
        ]
class ModelTrainer:
    """Fine-tune the models held by a ``MultiModelAnalyzer``."""

    def __init__(self, model_analyzer: MultiModelAnalyzer):
        """Keep a reference to the analyzer whose models will be trained."""
        self.analyzer = model_analyzer

    def prepare_training_data(self, product_data: List[Dict]) -> Dataset:
        """Build a ``Dataset`` with one record per product.

        Each product must provide the keys ``"description"``,
        ``"category"``, ``"features"`` and ``"price"``; the description is
        stored under ``"text"`` and the other values keep their key.
        """
        records = [
            {
                "text": entry["description"],
                "category": entry["category"],
                "features": entry["features"],
                "price": entry["price"],
            }
            for entry in product_data
        ]
        return Dataset.from_list(records)

    def fine_tune_category_model(self, training_data: Dataset):
        """Fine-tune the analyzer's category model on ``training_data``.

        NOTE(review): the dataset built by ``prepare_training_data`` holds
        raw ``text``/``category``/``features``/``price`` columns. Confirm
        that tokenization and a ``labels`` column are added before this is
        called, since ``Trainer`` normally expects model-ready inputs.
        """
        self._run_training(
            model=self.analyzer.category_model,
            tokenizer=self.analyzer.category_tokenizer,
            dataset=training_data,
            output_dir="./results",
            logging_dir="./logs",
        )

    def fine_tune_feature_model(self, training_data: Dataset):
        """Fine-tune the analyzer's feature model on ``training_data``.

        NOTE(review): same caveat as the category model -- confirm the
        dataset is tokenized and carries per-token labels before training.
        """
        self._run_training(
            model=self.analyzer.feature_model,
            tokenizer=self.analyzer.feature_tokenizer,
            dataset=training_data,
            output_dir="./results_feature",
            logging_dir="./logs_feature",
        )

    def _run_training(self, model, tokenizer, dataset, output_dir, logging_dir):
        """Train ``model`` on ``dataset`` with the shared hyper-parameters.

        Only the output and logging directories differ between the two
        fine-tuning entry points; everything else is identical.
        """
        arguments = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=logging_dir,
            logging_steps=10,
        )
        Trainer(
            model=model,
            args=arguments,
            train_dataset=dataset,
            tokenizer=tokenizer,
        ).train()
class ProductRecommender:
    """Rank products against a free-text query using a ``MultiModelAnalyzer``."""

    def __init__(self):
        """Create the analyzer and a trainer bound to it."""
        self.model_analyzer = MultiModelAnalyzer()
        self.trainer = ModelTrainer(self.model_analyzer)

    def train_on_product_data(self, product_data: List[Dict]):
        """Fine-tune the category and feature models on ``product_data``."""
        training_dataset = self.trainer.prepare_training_data(product_data)
        self.trainer.fine_tune_category_model(training_dataset)
        self.trainer.fine_tune_feature_model(training_dataset)

    def get_recommendations(
        self, query: str, product_database: List[Dict], top_k: int = 5
    ) -> List[Dict]:
        """Return the products most similar to ``query``.

        Args:
            query: Free-text search query.
            product_database: Products to rank; each must have a
                ``"description"`` key.
            top_k: Maximum number of products to return (default 5, the
                previously hard-coded value).

        Returns:
            Up to ``top_k`` products, best match first. Products with
            equal scores keep their order from ``product_database``.
        """
        if top_k <= 0 or not product_database:
            return []

        # The query is analyzed once and reused for every product.
        query_analysis = self.model_analyzer.analyze_text(query)
        scored = [
            (
                self._calculate_similarity(
                    query_analysis,
                    self.model_analyzer.analyze_text(product['description']),
                ),
                product,
            )
            for product in product_database
        ]
        # list.sort is stable, so ties preserve database order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [product for _, product in scored[:top_k]]

    def _calculate_similarity(self, query_analysis: Dict, product_analysis: Dict) -> float:
        """Score how well a product matches a query.

        The score is ``0.4 * category + 0.4 * embedding + 0.2 * features``:

        * category: 1.0 when both category values are equal, else 0.0;
        * embedding: cosine similarity of the two embeddings (0.0 when
          either vector has zero length);
        * features: Jaccard overlap of the two feature sets (0.0 when both
          are empty).

        Every factor is normalized so that an unbounded raw dot product or
        feature count cannot swamp the weights.

        Args:
            query_analysis: Dict with ``"category"``, ``"embedding"`` and
                ``"features"`` keys.
            product_analysis: Dict with the same keys.

        Returns:
            The combined score as a plain Python ``float``.
        """
        category_match = float(
            query_analysis['category'] == product_analysis['category']
        )

        query_vec = np.asarray(query_analysis['embedding'], dtype=float)
        product_vec = np.asarray(product_analysis['embedding'], dtype=float)
        norm_product = np.linalg.norm(query_vec) * np.linalg.norm(product_vec)
        if norm_product > 0:
            embedding_similarity = float(np.dot(query_vec, product_vec) / norm_product)
        else:
            # Avoid dividing by zero for an all-zero embedding.
            embedding_similarity = 0.0

        query_features = set(query_analysis['features'])
        product_features = set(product_analysis['features'])
        all_features = query_features | product_features
        if all_features:
            feature_overlap = len(query_features & product_features) / len(all_features)
        else:
            feature_overlap = 0.0

        return float(
            0.4 * category_match
            + 0.4 * embedding_similarity
            + 0.2 * feature_overlap
        )