import streamlit as st
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from scipy.spatial import distance
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel


@st.cache_resource
def _load_tokenizer_and_model():
    """Load the rubert-tiny2 tokenizer and model once per server process.

    Streamlit re-executes the whole script on every widget interaction;
    caching here avoids re-instantiating the model on each rerun.
    """
    name = "cointegrated/rubert-tiny2"
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


tokenizer, model = _load_tokenizer_and_model()

films = pd.read_csv('Films_finder/movies_2.csv')
films['description'] = films['description'].astype(str)


def embed_bert_cls(text, model, tokenizer):
    """Return the L2-normalised first-token embedding of *text*.

    The text is tokenized (truncated to at most 1024 tokens), passed through
    *model* without gradient tracking, and the hidden state at sequence
    position 0 is normalised to unit length.

    Only the first row of the batch is returned, so passing a list of texts
    yields the embedding of its first element only.

    Returns:
        A 1-D ``numpy.ndarray`` holding the embedding.
    """
    t = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=1024,
    )
    with torch.no_grad():
        # Move the inputs to whatever device the model lives on.
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()


@st.cache_resource
def for_embeded_list(series: pd.Series) -> np.ndarray:
    """Embed every text in *series* and stack the results into one array.

    Non-breaking spaces (``\\xa0``) are replaced with regular spaces before
    embedding. The result is cached by Streamlit, so the embeddings are not
    recomputed on every script rerun for the same input.

    Returns:
        An array with one embedding row per element of *series*.
    """
    return np.array(
        [embed_bert_cls(i.replace('\xa0', ' '), model, tokenizer) for i in series]
    )


embeded_list = for_embeded_list(films['description'])

text = st.text_input('Введите текст')
count_visible = st.number_input(
    "Введите количество отображаемых элементов", 1, 10, step=1
)

if text and count_visible:
    embeded_text = embed_bert_cls(text, model, tokenizer).reshape(1, -1)
    # NOTE: despite the name, this holds distances (pairwise_distances with
    # its default metric), so smaller means a closer match. Both sides are
    # unit-normalised by embed_bert_cls.
    cossim = pairwise_distances(embeded_text, embeded_list)[0]
    # Sort once: positions of films from closest to farthest.
    order = cossim.argsort()

    # Columns are addressed by position; from how they are rendered below,
    # 1 is presumably the poster, 2 the title and 3 the description —
    # TODO confirm against the CSV schema.
    # Never ask for more rows than the catalogue contains.
    shown = min(int(count_visible), len(order))
    for rank in range(shown):
        row = order[rank]
        col1, col2 = st.columns(2)
        with col1:
            st.header(films.iloc[row, 2])
            st.write(films.iloc[row, 3].replace('\xa0', ' '))
            # Report the score of the film actually displayed, not of the
            # film that happens to sit at position `rank` in the CSV.
            st.write(f'Уверенность состовляет {cossim[row]}')
        with col2:
            st.image(films.iloc[row, 1])

    st.header('Самый не подходящий запрос')
    worst = order[-1]
    col3, col4 = st.columns(2)
    with col3:
        st.header(films.iloc[worst, 2])
        st.write(films.iloc[worst, 3].replace('\xa0', ' '))
    with col4:
        st.image(films.iloc[worst, 1])