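# Auto-completion demo: a fill-mask model proposes next tokens, and the candidates are
# re-ranked using the user's keyword history and semantic history.
# Run with: streamlit run app.py  (assuming this file is saved as app.py)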
import streamlit as st
import pandas as pd

from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
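# sentence-embedding model used to compare predictions with the user's semantic history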
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# cache the pipeline so the model is loaded only once per session
@st.cache(allow_output_mutation=True)
def get_model(model):
    return pipeline("fill-mask", model=model, top_k=100)  # set the maximum number of tokens retrieved per inference


HISTORY_WEIGHT = 100  # predictions matching a keyword from the user's history are boosted by this weight

st.caption("This is a simple auto-completion demo where the next token is predicted by probability, plus a weight if it appears in the user's history")

history_keyword_text = st.text_input("Enter the user's history keywords (optional, e.g., 'Gates')", value="")

text = st.text_input("Enter a text for auto completion...", value='Where is Bill')
semantic_text = st.text_input("Enter the user's semantic history (optional, e.g., 'Microsoft')", value="Microsoft")

model = st.selectbox("Choose a model", ["roberta-base", "bert-base-uncased"])

data_load_state = st.text('Loading model...')
nlp = get_model(model)

if text:
    data_load_state = st.text('Running inference...')
    # the fill-mask pipeline returns a list of dicts with keys 'sequence', 'score', 'token', and 'token_str'
    result = nlp(text + ' ' + nlp.tokenizer.mask_token)
    data_load_state.text('')
    
    # embed each predicted sequence and each comma-separated term of the semantic history
    predicted_embeddings = semantic_model.encode([r['sequence'] for r in result], convert_to_tensor=True)
    semantic_history_embeddings = semantic_model.encode(semantic_text.split(','), convert_to_tensor=True)

    # cosine similarity between every prediction and every history term
    cosine_scores = util.cos_sim(predicted_embeddings, semantic_history_embeddings)
    
    for index, r in enumerate(result):
        # replace the raw model score with the best similarity against any history term
        result[index]['score'] = float(cosine_scores[index].max())
        if r['token_str'].lower().strip() in history_keyword_text.lower().strip() and len(r['token_str'].lower().strip()) > 1:
            # the token appears in the history keywords, so boost its score
            result[index]['score'] *= HISTORY_WEIGHT

    # sort the predictions by their adjusted score, best first
    df = pd.DataFrame(result).sort_values(by='score', ascending=False)

    # show the results as a table
    st.table(df)
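
# with the defaults ('Where is Bill' + semantic history 'Microsoft'), completions
# semantically close to Microsoft should rank higher in the table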