Teery committed on
Commit
901c6d9
·
1 Parent(s): c1efda8

app upgrade

Browse files
Files changed (1) hide show
  1. app.py +29 -24
app.py CHANGED
@@ -5,11 +5,12 @@ import pandas as pd
5
  import numpy as np
6
  import torch
7
  from transformers import AutoTokenizer, AutoModel
 
8
 
9
  tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
10
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
11
 
12
- films = pd.read_csv('Films_finder/movies_2.csv')
13
  films['description'] = films['description'].astype(str)
14
 
15
  def embed_bert_cls(text, model, tokenizer):
@@ -19,27 +20,31 @@ def embed_bert_cls(text, model, tokenizer):
19
  embeddings = model_output.last_hidden_state[:, 0, :]
20
  embeddings = torch.nn.functional.normalize(embeddings)
21
  return embeddings[0].cpu().numpy()
22
- @st.cache_resource
23
- def for_embeded_list(series: pd.Series) -> list:
24
- return np.array([embed_bert_cls(i.replace('\xa0', ' '), model, tokenizer) for i in series])
25
- embeded_list = for_embeded_list(films['description'])
 
26
  text = st.text_input('Введите текст')
27
- count_visible = st.number_input("Введите количество отображаемых элементов", 1, 10, step=1)
28
- if text and count_visible:
29
- embeded_text = embed_bert_cls(text, model, tokenizer).reshape(1,-1)
30
- cossim = pairwise_distances(embeded_text, embeded_list)[0]
31
- for i in range(count_visible):
32
- col1, col2 = st.columns(2)
33
- with col1:
34
- st.header(films.iloc[cossim.argsort()].iloc[i][2])
35
- st.write(films.iloc[cossim.argsort()].iloc[i][3].replace('\xa0', ' '))
36
- st.write(f'Уверенность состовляет {cossim[i]}')
37
- with col2:
38
- st.image(films.iloc[cossim.argsort()].iloc[i][1])
39
- st.header('Самый не подходящий запрос')
40
- col3, col4 = st.columns(2)
41
- with col3:
42
- st.header(films.iloc[cossim.argsort()].iloc[-1][2])
43
- st.write(films.iloc[cossim.argsort()].iloc[-1][3].replace('\xa0', ' '))
44
- with col4:
45
- st.image(films.iloc[cossim.argsort()].iloc[-1][1])
 
 
 
 
5
  import numpy as np
6
  import torch
7
  from transformers import AutoTokenizer, AutoModel
8
+ from joblib import load
9
 
10
  tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
11
  model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
12
 
13
+ films = pd.read_csv('movies_2.csv').dropna()
14
  films['description'] = films['description'].astype(str)
15
 
16
  def embed_bert_cls(text, model, tokenizer):
 
20
  embeddings = model_output.last_hidden_state[:, 0, :]
21
  embeddings = torch.nn.functional.normalize(embeddings)
22
  return embeddings[0].cpu().numpy()
23
+ # @st.cache_resource
24
+ # def for_embeded_list(series: pd.Series) -> np.array:
25
+ # return np.array([embed_bert_cls(i.replace('\xa0', ' '), model, tokenizer) for i in series])
26
+ embeded_list = load('embeded_list.joblib')
27
+ # embeded_list = for_embeded_list(films['description'])
28
  text = st.text_input('Введите текст')
29
+ count_visible = st.number_input("Введите количество отображаемых элементов", 1, 10, 5, step=1)
30
+ if st.button("Найти", type="primary"):
31
+ if text and count_visible:
32
+ embeded_text = embed_bert_cls(text, model, tokenizer).reshape(1,-1)
33
+ cossim = pairwise_distances(embeded_text, embeded_list)[0]
34
+ for i in range(count_visible):
35
+ col1, col2 = st.columns(2)
36
+ with col1:
37
+ st.header(films.iloc[cossim.argsort()].iloc[i][2])
38
+ st.write(films.iloc[cossim.argsort()].iloc[i][3].replace('\xa0', ' '))
39
+ with col2:
40
+ try:
41
+ st.image(films.iloc[cossim.argsort()].iloc[i][1])
42
+ except:
43
+ st.write('Нет картинки')
44
+ st.header('Самый не подходящий запрос')
45
+ col3, col4 = st.columns(2)
46
+ with col3:
47
+ st.header(films.iloc[cossim.argsort()].iloc[-1][2])
48
+ st.write(films.iloc[cossim.argsort()].iloc[-1][3].replace('\xa0', ' '))
49
+ with col4:
50
+ st.image(films.iloc[cossim.argsort()].iloc[-1][1])