# app.py — multi-model product analysis and recommendation.
# (Removed Hugging Face file-viewer page residue that was pasted above the
# source: author avatar line, "Update app.py", commit e7b9fde, raw/history
# links, and the "7.04 kB" size marker — none of it is Python.)
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
AutoModelForTokenClassification,
TrainingArguments,
Trainer
)
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import torch
import numpy as np
from typing import Dict, List, Optional
import json
class MultiModelAnalyzer:
    """Runs three pretrained models over a piece of text.

    Combines a commerce-domain sequence classifier (category), a
    SentenceTransformer (semantic embedding), and a token-classification
    model (feature extraction). Model weights are downloaded on first use.
    """

    def __init__(self):
        # 1. Category understanding model (commerce-domain classifier).
        self.category_model = AutoModelForSequenceClassification.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )
        self.category_tokenizer = AutoTokenizer.from_pretrained(
            "EMBEDDIA/sloberta-commerce"
        )
        # 2. Semantic understanding model (sentence embeddings).
        self.semantic_model = SentenceTransformer('all-mpnet-base-v2')
        # 3. Feature extraction model (token classification).
        # NOTE(review): this is a base checkpoint, not one fine-tuned for
        # token classification — its classification head is untrained until
        # ModelTrainer.fine_tune_feature_model() is run; confirm intent.
        self.feature_model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-multilingual-uncased"
        )
        self.feature_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-multilingual-uncased"
        )

    def analyze_text(self, text: str) -> Dict:
        """Combine analysis from all models.

        Returns a dict with keys "category" (int class index),
        "embedding" (np.ndarray), and "features" (list of token strings).
        """
        return {
            "category": self._predict_category(text),
            "embedding": self._get_semantic_embedding(text),
            "features": self._extract_features(text),
        }

    def _predict_category(self, text: str) -> int:
        """Predict the product-category class index for *text*.

        Was annotated ``-> str`` but ``.item()`` on an argmax returns an
        ``int`` — annotation corrected to match the actual return value.
        """
        inputs = self.category_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Inference only: no_grad avoids building an autograd graph.
        with torch.no_grad():
            outputs = self.category_model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
        return predictions.argmax().item()

    def _get_semantic_embedding(self, text: str) -> np.ndarray:
        """Return the dense sentence embedding of *text*."""
        return self.semantic_model.encode(text)

    def _extract_features(self, text: str) -> List[str]:
        """Extract feature tokens from *text* via token classification."""
        inputs = self.feature_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        with torch.no_grad():
            outputs = self.feature_model(**inputs)
        # Per-token argmax over the label dimension: shape (1, seq_len).
        predictions = outputs.logits.argmax(dim=2)
        return self._convert_predictions_to_features(predictions, inputs)

    def _convert_predictions_to_features(self, predictions, inputs) -> List[str]:
        """Map token-level label predictions back to their surface tokens.

        This method was referenced by ``_extract_features`` but never
        defined anywhere in the file, so every call raised AttributeError.
        Keeps tokens whose predicted label id is non-zero, dropping the
        tokenizer's special tokens ([CLS], [SEP], ...).
        Assumes label id 0 is the "outside"/background class — TODO confirm
        against the label map of the fine-tuned checkpoint.
        """
        tokens = self.feature_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        special = set(self.feature_tokenizer.all_special_tokens)
        labels = predictions[0].tolist()
        return [
            token
            for token, label in zip(tokens, labels)
            if label != 0 and token not in special
        ]
class ModelTrainer:
    """Fine-tunes the models owned by a MultiModelAnalyzer."""

    def __init__(self, model_analyzer: MultiModelAnalyzer):
        self.analyzer = model_analyzer

    def prepare_training_data(self, product_data: List[Dict]) -> Dataset:
        """Convert raw product dicts into a HF ``Dataset``.

        Each input dict must provide "description", "category",
        "features", and "price".
        NOTE(review): rows keep "text" as a raw string; ``Trainer`` needs
        tokenized inputs, so a tokenization/label-encoding step is still
        required before these fine-tune calls can actually train — confirm.
        """
        records = [
            {
                "text": product["description"],
                "category": product["category"],
                "features": product["features"],
                "price": product["price"],
            }
            for product in product_data
        ]
        return Dataset.from_list(records)

    def _fine_tune(self, model, tokenizer, training_data: Dataset,
                   output_dir: str, logging_dir: str) -> None:
        """Shared fine-tuning driver.

        The two public ``fine_tune_*`` methods previously duplicated this
        entire TrainingArguments/Trainer setup verbatim, differing only in
        the output/logging directories — factored out so hyperparameters
        stay in one place.
        """
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir=logging_dir,
            logging_steps=10,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=training_data,
            tokenizer=tokenizer,
        )
        trainer.train()

    def fine_tune_category_model(self, training_data: Dataset):
        """Fine-tune the category prediction model."""
        self._fine_tune(
            self.analyzer.category_model,
            self.analyzer.category_tokenizer,
            training_data,
            output_dir="./results",
            logging_dir="./logs",
        )

    def fine_tune_feature_model(self, training_data: Dataset):
        """Fine-tune the feature extraction model."""
        self._fine_tune(
            self.analyzer.feature_model,
            self.analyzer.feature_tokenizer,
            training_data,
            output_dir="./results_feature",
            logging_dir="./logs_feature",
        )
class ProductRecommender:
    """End-to-end recommender: analyzes a text query and ranks products."""

    def __init__(self):
        self.model_analyzer = MultiModelAnalyzer()
        self.trainer = ModelTrainer(self.model_analyzer)

    def train_on_product_data(self, product_data: List[Dict]):
        """Fine-tune both trainable models on raw product records."""
        training_dataset = self.trainer.prepare_training_data(product_data)
        self.trainer.fine_tune_category_model(training_dataset)
        self.trainer.fine_tune_feature_model(training_dataset)

    def get_recommendations(self, query: str, product_database: List[Dict],
                            top_k: int = 5) -> List[Dict]:
        """Return the *top_k* products most similar to *query*.

        Each product dict must have a "description" key. ``top_k`` is new
        and defaults to the previous hard-coded value of 5, so existing
        callers are unaffected. Returns an empty list for an empty database.
        NOTE(review): this re-runs all three models on every product for
        every query — product analyses should be precomputed/cached if the
        database is large.
        """
        query_analysis = self.model_analyzer.analyze_text(query)
        matches = []
        for product in product_database:
            product_analysis = self.model_analyzer.analyze_text(product['description'])
            similarity = self._calculate_similarity(query_analysis, product_analysis)
            matches.append({"product": product, "similarity": similarity})
        # Highest similarity first.
        matches.sort(key=lambda x: x['similarity'], reverse=True)
        return [match['product'] for match in matches[:top_k]]

    def _calculate_similarity(self, query_analysis: Dict, product_analysis: Dict) -> float:
        """Weighted similarity score between a query and a product analysis.

        Bug fixed: the original summed a 0/1 category flag, a RAW dot
        product of unnormalized embeddings (unbounded), and a RAW feature
        overlap count (unbounded) under 0.4/0.4/0.2 weights, so the terms
        had incomparable scales and the weights were meaningless. Now the
        embedding term is cosine similarity and the feature term is Jaccard
        overlap, putting every component in a bounded, comparable range.
        """
        category_match = float(
            query_analysis['category'] == product_analysis['category']
        )

        q_emb = np.asarray(query_analysis['embedding'], dtype=float)
        p_emb = np.asarray(product_analysis['embedding'], dtype=float)
        denom = np.linalg.norm(q_emb) * np.linalg.norm(p_emb)
        # Guard zero-norm embeddings instead of dividing by zero.
        embedding_similarity = float(np.dot(q_emb, p_emb) / denom) if denom else 0.0

        q_feats = set(query_analysis['features'])
        p_feats = set(product_analysis['features'])
        union = q_feats | p_feats
        feature_overlap = (len(q_feats & p_feats) / len(union)) if union else 0.0

        return (
            0.4 * category_match
            + 0.4 * embedding_similarity
            + 0.2 * feature_overlap
        )