File size: 3,143 Bytes
7b86ace
 
 
 
 
 
 
 
3326af6
50ed058
7b86ace
50ed058
7b86ace
 
 
 
3326af6
50ed058
7b86ace
 
 
030dff0
7b86ace
 
 
 
 
 
 
50ed058
7b86ace
 
 
 
 
 
 
 
 
 
 
 
 
 
d5175dd
 
 
7b86ace
d5175dd
 
 
 
 
 
 
 
 
 
 
 
 
7b86ace
d5175dd
 
 
 
 
7b86ace
d5175dd
 
7b86ace
 
 
 
 
 
50ed058
7b86ace
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#%%
import pandas as pd
import numpy as np
import torch
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
import gradio as gr
#%%
# --- Data and model loaded once at import time -----------------------------
# All paths are relative, so the files are resolved against the current
# working directory.
etalon = pd.read_csv("etalon_prod.csv")
df = pd.read_csv("preprocessed_complaints.csv")

# Sentence encoder used to embed the user's free-text query.
model = SentenceTransformer("sentence-transformers/multi-qa-distilbert-cos-v1")

# Plain Python list of the values in the complaints column of `df`.
unique_complaints = df["Жалобы"].values.tolist()

# Precomputed embedding matrix compared against the query embedding.
# NOTE(review): presumably one row per row of `df`, in the same order — confirm.
embeddings = np.load("embeddings.npy")

def _top_values(series: pd.Series, k: int) -> list:
    """Return the ``k`` most frequent values of ``series``, most frequent first."""
    # ``iloc`` keeps the slice positional regardless of the index dtype.
    return series.value_counts().iloc[:k].index.tolist()


def _top_services_by_category(frame: pd.DataFrame, k: int) -> list:
    """Return one ``{category: top-k examinations}`` dict per service category."""
    categories = ['Инструментальная диагностика', 'Лабораторная диагностика']
    return [
        {
            category: _top_values(
                frame.loc[frame['service_name_category'] == category,
                          'Рекомендации по обследованию'],
                k,
            )
        }
        for category in categories
    ]


def get_recommend(user_input: str,
                  top_k_spec: int = 3,
                  top_k_services: int = 5,
                  treshold: float = 0.8) -> dict:
    """Build recommendations from the rows most similar to ``user_input``.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the module-level ``embeddings``. Rows of the
    module-level ``df`` whose similarity is strictly greater than
    ``treshold`` are kept, and the most frequent values of several columns
    among those rows are reported.

    Args:
        user_input: Text to encode and match.
        top_k_spec: Number of most frequent values returned for each of the
            two specialist columns.
        top_k_services: Number of most frequent values returned for the
            complaints, diagnoses and examination lists.
        treshold: Similarity cut-off; rows must score strictly above it.
            (The spelling is kept so existing keyword callers keep working.)

    Returns:
        A dict with, in order, the keys "Специальность врача",
        "Рекомендуемые специалисты", "Жалобы", "Диагноз МКБ",
        "Рекомендации по обследованию" (a list of ``{category: [...]}``
        dicts) and "Рекомендации по обследованию по МКБ" (a list of
        ``{diagnosis: [{category: [...]}, ...]}`` dicts). Lists are empty
        when no row passes the threshold.
    """
    cols_for_top_k = ["Специальность врача",
                      "Рекомендуемые специалисты"]

    usr_embeddings = model.encode(user_input)

    # Only the first row of the similarity matrix is used.
    similarities = cos_sim(usr_embeddings, embeddings).detach().numpy()[0]
    # Row positions ordered from most to least similar.
    order = similarities.argsort()[::-1]

    # ``order`` holds positions, not index labels, hence ``iloc``.
    ranked = df.iloc[order].copy()
    ranked['cos_sim'] = similarities[order]
    ranked = ranked[ranked['cos_sim'] > treshold]

    result = {col: _top_values(ranked[col], top_k_spec)
              for col in cols_for_top_k}
    result['Жалобы'] = _top_values(ranked['Жалобы'], top_k_services)

    top_k_mkb = _top_values(ranked['Диагноз МКБ'], top_k_services)
    result['Диагноз МКБ'] = top_k_mkb

    # Examinations over all matched rows, then the same breakdown restricted
    # to each of the top diagnoses.
    result['Рекомендации по обследованию'] = _top_services_by_category(
        ranked, top_k_services)
    result['Рекомендации по обследованию по МКБ'] = [
        {mkb: _top_services_by_category(ranked[ranked['Диагноз МКБ'] == mkb],
                                        top_k_services)}
        for mkb in top_k_mkb
    ]

    return result
#%%
# Web UI: one text input is passed to get_recommend and the returned dict is
# rendered as JSON. No title is set; only the description is configured.
gradio_app = gr.Interface(
    fn=get_recommend,
    inputs="text",
    outputs=gr.JSON(label="Рекомендации: "),
    description="Введите услугу:",
)

if __name__ == "__main__":
    # Launch only when run as a script, not when the module is imported.
    gradio_app.launch()
# %%