File size: 4,066 Bytes
f51e197
 
c90e96d
59dea20
 
 
 
 
 
 
 
 
 
 
 
ea4c492
59dea20
ea4c492
 
59dea20
 
 
 
 
 
 
 
 
 
 
4132514
ea4c492
4132514
f51e197
 
 
 
 
b1a0d5b
ea4c492
f51e197
59dea20
 
d38ab04
4132514
 
c90e96d
 
 
 
 
d38ab04
4132514
c90e96d
d38ab04
c90e96d
4132514
c90e96d
 
4132514
ea4c492
c90e96d
d38ab04
59dea20
 
 
 
4132514
 
59dea20
 
 
 
 
 
 
 
 
 
 
 
 
4132514
ea4c492
 
 
 
 
 
 
 
4132514
59dea20
 
 
4132514
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity

# ALBERT checkpoint size: 'base' is the smaller variant, 'large' the bigger one.
model_size = 'base'
tokenizer = AlbertTokenizer.from_pretrained(f'albert-{model_size}-v2')
model = AlbertModel.from_pretrained(f'albert-{model_size}-v2')

def get_embedding(input_text):
    """Return a mean-pooled ALBERT embedding for ``input_text``.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` with gradient tracking disabled, and the last
    hidden states of all tokens are averaged into one vector. The tokens are
    also printed to stdout.

    Args:
        input_text: The text to embed.

    Returns:
        The embedding as a plain Python list of floats.
    """
    # truncation=True caps the input at the tokenizer's maximum length so an
    # unusually long string cannot exceed what the model accepts.
    encoded_input = tokenizer(input_text, return_tensors='pt', truncation=True)
    input_ids = encoded_input.input_ids

    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
    print( "Tokens : " + ' '.join(list_of_tokens))

    with torch.no_grad():
        outputs = model(**encoded_input)
        # outputs[0] holds the last hidden states; index 0 picks the single
        # sequence in the batch, and the mean over dim 0 averages its tokens.
        last_hidden_states = outputs[0]
        sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
        return sentence_embedding.tolist()

# Page chrome: wide layout, a title, and a one-line instruction.
st.set_page_config(layout="wide")
st.title('Upload the Address Dataset')
st.markdown('Upload an Excel file to view the data in a table.')

# Restrict the upload widget to .xlsx workbooks; the script below only
# proceeds once this value is not None.
uploaded_file = st.file_uploader('Choose a file', type='xlsx')



if uploaded_file is not None:
    # Parse the workbook once and pull both sheets out of the result; every
    # column is read as text (dtype=str).
    sheets = pd.read_excel(uploaded_file, sheet_name=['CAQH', 'NDB'], dtype=str)
    data_caqh = sheets['CAQH']
    data_ndb = sheets['NDB']

    # --- Data cleaning: CAQH ---
    # Insert a dash after the first five characters of postal codes longer
    # than five characters that do not already contain one.
    data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(
        lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and '-' not in x else x
    )

    # address2 is optional. When present it needs its own trailing ', ' so it
    # does not run straight into the city name.
    caqh_address2 = data_caqh['address2']
    address2_part = np.where(
        caqh_address2.isnull(), '', caqh_address2.astype(str) + ', '
    )
    data_caqh['full-addr'] = (
        data_caqh['address1'].astype(str) + ', '
        + address2_part
        + data_caqh['city'].astype(str) + ', '
        + data_caqh['state'].astype(str) + ', '
        + data_caqh['postalcode'].astype(str)
    )

    # --- Data cleaning: NDB ---
    # Missing plus-4 values become '' (instead of the text 'nan') so they are
    # treated as "no suffix" below. Values ending in '0' or '1' are blanked,
    # as before; str.endswith is also safe on an empty string.
    data_ndb['zip_pls_4_cd'] = (
        data_ndb['zip_pls_4_cd']
        .fillna('')
        .astype(str)
        .apply(lambda x: '' if x.endswith(('0', '1')) else x)
    )

    # ZIP, plus '-<plus4>' only when a plus-4 value survived the cleaning.
    data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) + np.where(
        data_ndb['zip_pls_4_cd'] == '',
        '',
        '-' + data_ndb['zip_pls_4_cd'].astype(str),
    )

    data_ndb['full-addr'] = (
        data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', '
        + data_ndb['cty_nm'].astype(str) + ', '
        + data_ndb['st_cd'].astype(str) + ', '
        + data_ndb['zip_cd_zip_pls_4_cd']
    )

    # --- Matching ---
    # Embed every address once. The vectors stay in local lists, so there is
    # no temporary 'embedding' column to drop before display.
    caqh_embeddings = [get_embedding(addr) for addr in data_caqh['full-addr']]
    ndb_embeddings = [get_embedding(addr) for addr in data_ndb['full-addr']]

    # A CAQH address is matched only if its best NDB candidate reaches this
    # cosine similarity.
    match_threshold = 0.98

    # Defaults cover the empty-sheet case and rows with no match; both
    # columns always exist so the displayed table has a stable shape.
    data_caqh['matched-addr'] = 'No Matches'
    data_caqh['similarity-score'] = np.nan

    if caqh_embeddings and ndb_embeddings:
        # One (n_caqh x n_ndb) similarity matrix replaces the pairwise calls.
        similarity = cosine_similarity(caqh_embeddings, ndb_embeddings)
        best_idx = similarity.argmax(axis=1)   # first best NDB row per CAQH row
        best_score = similarity.max(axis=1)    # plain float scores
        is_match = best_score >= match_threshold

        ndb_addresses = data_ndb['full-addr'].to_numpy()
        data_caqh['matched-addr'] = np.where(
            is_match, ndb_addresses[best_idx], 'No Matches'
        )
        data_caqh['similarity-score'] = np.where(is_match, best_score, np.nan)

    # --- Display ---
    st.header('CAQH addresses and matches')
    st.dataframe(data_caqh, use_container_width=True)
    st.header('NDB data')
    st.dataframe(data_ndb, use_container_width=True)