import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Candidate embedding models: Russian-specific and multilingual sentence encoders
MODELS = {
    "rubert-tiny2": "cointegrated/rubert-tiny2",
    "sbert": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "LaBSE": "sentence-transformers/LaBSE",
    "ruRoberta": "sberbank-ai/ruRoberta-large"
}

def get_embeddings(model, tokenizer, texts):
    # Tokenize the batch and run the encoder without tracking gradients
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token embedding as a simple sentence representation
    return outputs.last_hidden_state[:, 0].numpy()

def classify(model_name: str, item: str, categories: str) -> str:
    # Load the model and tokenizer (note: reloaded on every call)
    tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
    model = AutoModel.from_pretrained(MODELS[model_name])
    
    # Prepare the texts: the item first, then each comma-separated category
    texts = [item] + [c.strip() for c in categories.split(",")]
    
    # Embed the item and all categories in a single batch
    embeddings = get_embeddings(model, tokenizer, texts)
    
    # Compare the item embedding against each category embedding
    item_embedding = embeddings[0].reshape(1, -1)
    category_embeddings = embeddings[1:]
    
    similarities = cosine_similarity(item_embedding, category_embeddings)[0]
    best_idx = int(np.argmax(similarities))
    
    # Report the best-matching category together with its cosine similarity
    return f"{texts[best_idx + 1]} ({similarities[best_idx]:.2f})"

iface = gr.Interface(
    fn=classify,
    inputs=[
        gr.Dropdown(list(MODELS.keys()), label="Model"),
        gr.Textbox(label="Item"),
        # Default example categories are in Russian to match the Russian-language models
        gr.Textbox(label="Categories", value="Инструменты, Овощи, Техника")
    ],
    outputs=gr.Textbox(label="Result")
)

iface.launch()
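
# Note: classify() above re-downloads and re-instantiates the selected model on
# every request. A minimal sketch of one way to avoid that (an assumption, not
# part of the original app) is to cache each tokenizer/model pair by name:
from functools import lru_cache

@lru_cache(maxsize=None)
def load_model(model_name: str):
    # Load once per model name; later calls return the cached pair
    tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
    model = AutoModel.from_pretrained(MODELS[model_name])
    model.eval()
    return tokenizer, model

# classify() could then begin with: tokenizer, model = load_model(model_name)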