taka-yamakoshi committed

Commit ef0b5c6 · 1 Parent(s): 145f48c

add default inputs
app.py
CHANGED
@@ -94,7 +94,7 @@ if __name__=='__main__':
     st.markdown(hide_table_row_index, unsafe_allow_html=True)
 
     # Title
-    st.markdown(generate_markdown('
+    st.markdown(generate_markdown('WordPiece Explorer',size=32), unsafe_allow_html=True)
     st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
@@ -135,8 +135,12 @@ if __name__=='__main__':
 
     else:
         if detokenize:
-            sentence = st.text_input(f'Tokenized IDs')
+            if tokenizer_name.startswith('gpt2'):
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            else:
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            sentence = st.text_input(f'Tokenized IDs',value=' '.join(default_tokens))
             num_tokens = DeTokenizeText(sentence)
         else:
-            sentence = st.text_input(f'Text')
+            sentence = st.text_input(f'Text',value='Tokenizers decompose bigger words into smaller tokens')
             num_tokens = TokenizeText(sentence,tokenizer_name)
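For context, here is a minimal standalone sketch of what the new default inputs would produce, assuming the Hugging Face transformers AutoTokenizer API (the Space's own loading code may differ). Note that 'input_ids' is a list of integers, so building a default string for the text box needs an explicit str conversion, which ' '.join(default_tokens) in the added line does not perform; the sketch below includes it.

# Standalone sketch (hypothetical helper, not part of app.py), assuming the
# Hugging Face transformers AutoTokenizer API.
from transformers import AutoTokenizer

def default_token_ids(tokenizer_name, text='Tokenizers decompose bigger words into smaller tokens'):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    ids = tokenizer(text)['input_ids']
    # GPT-2 adds no special tokens, so keep every ID; BERT-style tokenizers
    # wrap the text in [CLS] ... [SEP], so drop the first and last ID,
    # mirroring the [1:-1] slice in the commit.
    if not tokenizer_name.startswith('gpt2'):
        ids = ids[1:-1]
    # input_ids are ints, so convert each to str before joining them into
    # the default value for st.text_input.
    return ' '.join(str(i) for i in ids)

print(default_token_ids('gpt2'))
print(default_token_ids('bert-base-uncased'))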