Spaces:
Sleeping
Sleeping
import streamlit as st | |
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity | |
from scipy.spatial import distance | |
import pandas as pd | |
import numpy as np | |
import torch | |
from transformers import AutoTokenizer, AutoModel | |
from joblib import load | |
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2") | |
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2") | |
films = pd.read_csv('movies_2.csv').dropna() | |
films['description'] = films['description'].astype(str) | |
def embed_bert_cls(text, model, tokenizer): | |
t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=1024) | |
with torch.no_grad(): | |
model_output = model(**{k: v.to(model.device) for k, v in t.items()}) | |
embeddings = model_output.last_hidden_state[:, 0, :] | |
embeddings = torch.nn.functional.normalize(embeddings) | |
return embeddings[0].cpu().numpy() | |
# @st.cache_resource | |
# def for_embeded_list(series: pd.Series) -> np.array: | |
# return np.array([embed_bert_cls(i.replace('\xa0', ' '), model, tokenizer) for i in series]) | |
embeded_list = load('embeded_list.joblib') | |
# embeded_list = for_embeded_list(films['description']) | |
text = st.text_input('Введите текст') | |
count_visible = st.number_input("Введите количество отображаемых элементов", 1, 10, 5, step=1) | |
if st.button("Найти", type="primary"): | |
if text and count_visible: | |
embeded_text = embed_bert_cls(text, model, tokenizer).reshape(1,-1) | |
cossim = pairwise_distances(embeded_text, embeded_list)[0] | |
for i in range(count_visible): | |
col1, col2 = st.columns(2) | |
with col1: | |
st.header(films.iloc[cossim.argsort()].iloc[i][2]) | |
st.write(films.iloc[cossim.argsort()].iloc[i][3].replace('\xa0', ' ')) | |
with col2: | |
try: | |
st.image(films.iloc[cossim.argsort()].iloc[i][1]) | |
except: | |
st.write('Нет картинки') | |
st.header('Самый не подходящий запрос') | |
col3, col4 = st.columns(2) | |
with col3: | |
st.header(films.iloc[cossim.argsort()].iloc[-1][2]) | |
st.write(films.iloc[cossim.argsort()].iloc[-1][3].replace('\xa0', ' ')) | |
with col4: | |
st.image(films.iloc[cossim.argsort()].iloc[-1][1]) |