import io
import time

import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AlbertTokenizer


@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    # Load (and cache) the tokenizer; only ALBERT checkpoints are used in this demo.
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer


if __name__ == '__main__':
    # Page layout config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    # Widen the main block container and trim its padding
    # (standard Streamlit CSS override; selectors may need adjusting for your Streamlit version).
    define_margins = f"""
    <style>
        .main .block-container {{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-bottom: {padding_bottom}rem;
            padding-left: {padding_left}rem;
        }}
    </style>
    """
    # Hide the index column of st.table output.
    hide_table_row_index = """
    <style>
        thead tr th:first-child {display: none}
        tbody th {display: none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.markdown("<h1>Tokenizer Demo</h1>", unsafe_allow_html=True)

    tokenizer = load_model('albert-xxlarge-v2')

    # Two side-by-side text inputs, one per sentence.
    sent_cols = st.columns(2)
    num_tokens = {}
    sents = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            sents[f'sent_{sent_id+1}'] = sentence
            if len(sentence) > 0:
                # Token ids for the sentence, including the special tokens at both ends.
                input_sent = tokenizer(sentence)['input_ids']
                encoded_sent = [str(token) for token in input_sent]
                # Decode each id back to its piece, dropping the special tokens.
                decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
                num_tokens[f'sent_{sent_id+1}'] = len(decoded_sent)

                #char_nums = [len(word)+2 for word in decoded_sent]
                #word_cols = st.columns(char_nums)
                #for word_col, word in zip(word_cols, decoded_sent):
                    #with word_col:
                        #st.write(word)
                st.write(' '.join(encoded_sent))
                st.write(' '.join(decoded_sent))
                st.markdown(f"<p>{len(decoded_sent)} tokens</p>", unsafe_allow_html=True)

    # Compare token counts once both sentences have been entered.
    if len(sents['sent_1']) > 0 and len(sents['sent_2']) > 0:
        st.markdown("<h3>Result:</h3>", unsafe_allow_html=True)
        if num_tokens['sent_1'] == num_tokens['sent_2']:
            st.markdown("<h2>Matched!</h2>", unsafe_allow_html=True)
        else:
            st.markdown("<h2>Not Matched...</h2>", unsafe_allow_html=True)