Spaces: taka-yamakoshi (Running)
Commit 21c2f11 · 1 Parent(s): a999c8e · committed by taka-yamakoshi
include detokenize
app.py CHANGED
@@ -24,11 +24,14 @@ def load_model(model_name):
 def generate_markdown(text,color='black',font='Arial',size=20):
     return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"
 
-def TokenizeText(sentence):
+def TokenizeText(sentence,tokenizer_name):
     if len(sentence)>0:
-        input_sent = tokenizer(sentence)['input_ids'][1:-1]
-        encoded_sent = [str(token) for token in input_sent]
-        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
+        if tokenizer_name.startswith('gpt2'):
+            input_sent = tokenizer(sentence)['input_ids']
+        else:
+            input_sent = tokenizer(sentence)['input_ids'][1:-1]
+        encoded_sent = [str(token) for token in input_sent]
+        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
         num_tokens = len(decoded_sent)
 
         #char_nums = [len(word)+2 for word in decoded_sent]
@@ -44,6 +47,22 @@ def TokenizeText(sentence):
 
         return num_tokens
 
+def DeTokenizeText(input_str):
+    if len(input_str)>0:
+        input_sent = [int(element) for element in input_str.strip().split(' ')]
+        encoded_sent = [str(token) for token in input_sent]
+        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
+        num_tokens = len(decoded_sent)
+
+        #char_nums = [len(word)+2 for word in decoded_sent]
+        #word_cols = st.columns(char_nums)
+        #for word_col,word in zip(word_cols,decoded_sent):
+            #with word_col:
+                #st.write(word)
+        #st.write(' '.join(encoded_sent))
+        #st.write(' '.join(decoded_sent))
+        st.markdown(generate_markdown(' '.join(decoded_sent)), unsafe_allow_html=True)
+        return num_tokens
 
 if __name__=='__main__':
 
@@ -76,7 +95,7 @@ if __name__=='__main__':
 
     # Title
     st.markdown(generate_markdown('Tokenizer Demo:',size=32), unsafe_allow_html=True)
-    st.markdown(generate_markdown('
+    st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
     tokenizer_name = st.sidebar.selectbox('Choose the tokenizer from below',
@@ -87,6 +106,7 @@ if __name__=='__main__':
     tokenizer = load_model(tokenizer_name)
 
     comparison_mode = st.sidebar.checkbox('Compare two texts')
+    detokenize = st.sidebar.checkbox('de-tokenize')
     if comparison_mode:
         sent_cols = st.columns(2)
         num_tokens = {}
@@ -95,7 +115,10 @@ if __name__=='__main__':
             with sent_col:
                 sentence = st.text_input(f'Text {sent_id+1}')
                 sents[f'sent_{sent_id+1}'] = sentence
-                num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence)
+                if detokenize:
+                    num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
+                else:
+                    num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence,tokenizer_name)
 
         if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
             st.markdown(generate_markdown('Result: ',size=16), unsafe_allow_html=True)
@@ -106,4 +129,7 @@ if __name__=='__main__':
 
     else:
        sentence = st.text_input(f'Text')
-        num_tokens = TokenizeText(sentence)
+        if detokenize:
+            num_tokens = DeTokenizeText(sentence)
+        else:
+            num_tokens = TokenizeText(sentence,tokenizer_name)
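The tokenizer_name branch added to TokenizeText encodes a real difference between tokenizer families: GPT-2's byte-level BPE adds no special tokens, while BERT-style tokenizers wrap every input in [CLS] and [SEP], which the [1:-1] slice strips before display. A minimal sketch of that difference, assuming the transformers library and the checkpoint names 'gpt2' and 'bert-base-uncased' (the app itself loads its tokenizer through the load_model helper):

    from transformers import AutoTokenizer

    gpt2_tok = AutoTokenizer.from_pretrained('gpt2')
    bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')

    # GPT-2 adds no special tokens, so every ID is a real subword
    print(gpt2_tok('hello world')['input_ids'])    # e.g. [31373, 995]

    # BERT wraps the text in [CLS] ... [SEP]; the first and last IDs are
    # special tokens, which is what the [1:-1] slice strips
    print(bert_tok('hello world')['input_ids'])    # e.g. [101, 7592, 2088, 102]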
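The new DeTokenizeText goes the other way: it treats the text box contents as a space-separated list of token IDs, parses them back to integers, and decodes each ID on its own. A sketch of the round trip this enables, outside Streamlit and assuming a GPT-2 tokenizer (in the app the ID string comes from st.text_input and the result is rendered with st.markdown):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('gpt2')

    # Tokenize a sentence and format the IDs the way DeTokenizeText expects
    ids = tokenizer('The quick brown fox')['input_ids']
    id_str = ' '.join(str(i) for i in ids)    # what you would paste into the text box

    # The parsing and per-token decoding done inside DeTokenizeText
    input_sent = [int(element) for element in id_str.strip().split(' ')]
    decoded_sent = [tokenizer.decode([token]) for token in input_sent]
    print(' '.join(decoded_sent))    # the original words, each token keeping its leading space

Note that DeTokenizeText reads tokenizer from module scope rather than taking it as an argument; this works because the __main__ block assigns tokenizer before either function is called.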