Spaces:
Runtime error
Runtime error
File size: 4,066 Bytes
f51e197 c90e96d 59dea20 ea4c492 59dea20 ea4c492 59dea20 4132514 ea4c492 4132514 f51e197 b1a0d5b ea4c492 f51e197 59dea20 d38ab04 4132514 c90e96d d38ab04 4132514 c90e96d d38ab04 c90e96d 4132514 c90e96d 4132514 ea4c492 c90e96d d38ab04 59dea20 4132514 59dea20 4132514 ea4c492 4132514 59dea20 4132514 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
# Checkpoint size: 'base' is the smaller variant, 'large' the bigger one.
model_size = 'base'

# Tokenizer and encoder are loaded from the same ALBERT v2 checkpoint.
_checkpoint_name = f'albert-{model_size}-v2'
tokenizer = AlbertTokenizer.from_pretrained(_checkpoint_name)
model = AlbertModel.from_pretrained(_checkpoint_name)
def get_embedding(input_text):
    """Return a mean-pooled ALBERT embedding of ``input_text``.

    The text is tokenized with the module-level ``tokenizer`` and passed
    through the module-level ``model`` with gradient tracking disabled. The
    last hidden states of the first sequence in the batch are averaged over
    the token axis and returned as a plain Python list. As a side effect
    the tokens are printed to stdout.
    """
    encoding = tokenizer(input_text, return_tensors='pt')

    # Flatten the id tensor so the ids can be mapped back to token strings.
    flat_ids = encoding.input_ids.view(-1).tolist()
    print("Tokens : " + ' '.join(tokenizer.convert_ids_to_tokens(flat_ids)))

    with torch.no_grad():
        model_output = model(**encoding)

    # Element 0 of the model output holds the last hidden states; index 0 of
    # that selects the first sequence in the batch.
    token_vectors = model_output[0][0]
    return token_vectors.mean(dim=0).tolist()
# Page chrome: wide layout, a title, and a short instruction line.
st.set_page_config(layout="wide")
st.title('Upload the Address Dataset')
st.markdown('Upload an Excel file to view the data in a table.')

# Upload widget restricted to .xlsx; the code below checks the result
# against None before processing.
uploaded_file = st.file_uploader(label='Choose a file', type='xlsx')
# Minimum cosine similarity for an NDB address to be reported as a match.
MATCH_THRESHOLD = 0.98


def _text_column(frame, column):
    """Return ``frame[column]`` as strings, with missing cells as ''.

    Without the ``fillna`` an empty cell would be rendered as the literal
    text 'nan' and end up inside the assembled address.
    """
    return frame[column].fillna('').astype(str)


if uploaded_file is not None:
    # Parse the workbook once: a list of sheet names yields a dict of frames.
    # dtype=str reads every cell as text.
    sheets = pd.read_excel(uploaded_file, sheet_name=['CAQH', 'NDB'], dtype=str)
    data_caqh = sheets['CAQH']
    data_ndb = sheets['NDB']

    # Data cleaning CAQH
    # Insert the ZIP+4 hyphen when more than five characters arrive
    # without one.
    postal = _text_column(data_caqh, 'postalcode')
    needs_hyphen = (postal.str.len() > 5) & ~postal.str.contains(
        '-', regex=False
    ).astype(bool)
    postal = postal.mask(needs_hyphen, postal.str[:5] + '-' + postal.str[5:])
    data_caqh['postalcode'] = postal

    # address2 is optional; when present it needs its own ', ' separator so
    # it does not run into the city name.
    address2 = _text_column(data_caqh, 'address2')
    address2_part = (address2 + ', ').where(address2 != '', '')
    data_caqh['full-addr'] = (
        _text_column(data_caqh, 'address1') + ', '
        + address2_part
        + _text_column(data_caqh, 'city') + ', '
        + _text_column(data_caqh, 'state') + ', '
        + postal
    )

    # Data cleaning NDB
    # Rule kept from the original code: a ZIP+4 suffix whose last character
    # is '0' or '1' is discarded.
    # NOTE(review): the intent of this rule is not visible here - confirm.
    plus4 = _text_column(data_ndb, 'zip_pls_4_cd')
    plus4 = plus4.mask(plus4.str[-1:].isin(['0', '1']), '')
    data_ndb['zip_pls_4_cd'] = plus4

    # Append '-<plus4>' only when a suffix survived the cleaning above.
    data_ndb['zip_cd_zip_pls_4_cd'] = (
        _text_column(data_ndb, 'zip_cd') + ('-' + plus4).where(plus4 != '', '')
    )
    data_ndb['full-addr'] = (
        _text_column(data_ndb, 'adr_ln_1_txt').str.strip() + ', '
        + _text_column(data_ndb, 'cty_nm') + ', '
        + _text_column(data_ndb, 'st_cd') + ', '
        + data_ndb['zip_cd_zip_pls_4_cd']
    )

    # App
    if len(data_caqh) > 0 and len(data_ndb) > 0:
        # One embedding per address (CAQH first, then NDB).
        caqh_vectors = np.array(
            [get_embedding(addr) for addr in data_caqh['full-addr']]
        )
        ndb_vectors = np.array(
            [get_embedding(addr) for addr in data_ndb['full-addr']]
        )

        # A single pairwise call gives the (n_caqh, n_ndb) similarity matrix;
        # argmax returns the first best NDB candidate for every CAQH row.
        similarity = cosine_similarity(caqh_vectors, ndb_vectors)
        best_idx = similarity.argmax(axis=1)
        best_score = similarity.max(axis=1)
        is_match = best_score >= MATCH_THRESHOLD

        ndb_addresses = data_ndb['full-addr'].to_numpy()
        data_caqh['matched-addr'] = np.where(
            is_match, ndb_addresses[best_idx], 'No Matches'
        )
        # Scores are plain floats; rows without a match stay NaN.
        data_caqh['similarity-score'] = np.where(is_match, best_score, np.nan)
    else:
        # Nothing to compare against: every CAQH row is unmatched.
        data_caqh['matched-addr'] = 'No Matches'
        data_caqh['similarity-score'] = np.nan

    st.header('CAQH addresses and matches')
    st.dataframe(data_caqh, use_container_width=True)
    st.header('NDB data')
    st.dataframe(data_ndb, use_container_width=True)
# calculate the embedding of each item.
#st.dataframe(data_caqh)
# Do some matching
#data_caqh.loc[data_caqh['full-addr'] == '1000 Vale Terrace, Vista, CA, 92084', 'matched-addr'] = '456 Main St'
#time.sleep(10)
#st.dataframe(data_caqh)
|