taka-yamakoshi committed
Commit ef0b5c6 · 1 Parent(s): 145f48c

add default inputs
app.py CHANGED
@@ -94,7 +94,7 @@ if __name__=='__main__':
     st.markdown(hide_table_row_index, unsafe_allow_html=True)
 
     # Title
-    st.markdown(generate_markdown('
+    st.markdown(generate_markdown('WordPiece Explorer',size=32), unsafe_allow_html=True)
     st.markdown(generate_markdown('quick and easy way to explore how tokenizers work',size=24), unsafe_allow_html=True)
 
     # Select and load the tokenizer
@@ -135,8 +135,12 @@ if __name__=='__main__':
 
     else:
         if detokenize:
-
+            if tokenizer_name.startswith('gpt2'):
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
+            else:
+                default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
+            sentence = st.text_input(f'Tokenized IDs',value=' '.join(default_tokens))
             num_tokens = DeTokenizeText(sentence)
         else:
-            sentence = st.text_input(f'Text')
+            sentence = st.text_input(f'Text',value='Tokenizers decompose bigger words into smaller tokens')
             num_tokens = TokenizeText(sentence,tokenizer_name)
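In short, the commit seeds both Streamlit inputs with the default sentence "Tokenizers decompose bigger words into smaller tokens". For the detokenize box it pre-encodes the sentence and, for non-GPT-2 tokenizers, strips the first and last IDs, since BERT-style tokenizers wrap input in special tokens ([CLS] ... [SEP]) while GPT-2 adds none. Below is a minimal standalone sketch of that logic, assuming a Hugging Face transformers tokenizer; default_token_string is a hypothetical helper, not part of app.py. Note that input_ids is a list of ints, so the sketch converts each ID to str before joining, whereas the diff joins the raw list, which would raise a TypeError on a plain Python list.

# Minimal sketch of the default-input logic in this commit.
# `default_token_string` is a hypothetical helper name; assumes the
# Hugging Face `transformers` package is installed.
from transformers import AutoTokenizer

DEFAULT_SENTENCE = 'Tokenizers decompose bigger words into smaller tokens'

def default_token_string(tokenizer, tokenizer_name):
    # Encode the default sentence into token IDs (a list of ints).
    token_ids = tokenizer(DEFAULT_SENTENCE)['input_ids']
    if not tokenizer_name.startswith('gpt2'):
        # BERT-style tokenizers add special tokens around the input
        # ([CLS] ... [SEP]); drop them, as the diff's [1:-1] slice does.
        token_ids = token_ids[1:-1]
    # Convert ints to str before joining (the diff joins the raw list).
    return ' '.join(str(t) for t in token_ids)

if __name__ == '__main__':
    for name in ('bert-base-uncased', 'gpt2'):
        tok = AutoTokenizer.from_pretrained(name)
        print(name, '->', default_token_string(tok, name))

The resulting space-separated ID string becomes the text_input default, presumably so DeTokenizeText can split it back into IDs and round-trip the default sentence on first load.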