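# Streamlit demo: tokenize two sentences with an ALBERT tokenizer and check
# whether they yield the same number of tokens.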
import streamlit as st
from transformers import AlbertTokenizer

# Cache the tokenizer so it is only downloaded/loaded once per session
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer
    raise ValueError(f'Unsupported model: {model_name}')

if __name__ == '__main__':
    # Config: page width and padding, injected as custom CSS
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    # Hide the index column that Streamlit adds to tables
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)
    # Title
    st.markdown("<p style='text-align:center; color:black; font-family:Arial; font-size:32px;'>Tokenizer Demo</p>", unsafe_allow_html=True)
    tokenizer = load_model('albert-xxlarge-v2')

    # Two side-by-side text boxes, one per sentence
    sent_cols = st.columns(2)
    num_tokens = {}
    sents = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            sents[f'sent_{sent_id+1}'] = sentence
            if len(sentence) > 0:
                # Token ids for the full sequence, including special tokens
                input_sent = tokenizer(sentence)['input_ids']
                encoded_sent = [str(token) for token in input_sent]
                # Decode each token separately, skipping [CLS] and [SEP]
                decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
                num_tokens[f'sent_{sent_id+1}'] = len(decoded_sent)
                # Disabled: lay each token out in its own column
                #char_nums = [len(word)+2 for word in decoded_sent]
                #word_cols = st.columns(char_nums)
                #for word_col,word in zip(word_cols,decoded_sent):
                #    with word_col:
                #        st.write(word)
                st.write(' '.join(encoded_sent))
                st.write(' '.join(decoded_sent))
                st.markdown(f"<p style='text-align: center; color: black; font-family:Arial; font-size:20px;'>{len(decoded_sent)} tokens </p>", unsafe_allow_html=True)

    # Compare token counts once both sentences have been entered
    if len(sents['sent_1']) > 0 and len(sents['sent_2']) > 0:
        st.markdown("<p style='text-align:center; color:black; font-family:Arial; font-size:16px;'>Result: </p>", unsafe_allow_html=True)
        if num_tokens['sent_1'] == num_tokens['sent_2']:
            st.markdown("<p style='text-align:center; color:MediumAquamarine; font-family:Arial; font-size:20px;'>Matched! </p>", unsafe_allow_html=True)
        else:
            st.markdown("<p style='text-align:center; color:Salmon; font-family:Arial; font-size:20px;'>Not Matched... </p>", unsafe_allow_html=True)