byt5-tokenizer-utils

Sleeping

File size: 1,626 Bytes

a1ad15c
d459757
a1ad15c
 
a0f9f54
a1ad15c
 
d459757
a1ad15c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a0f9f54
 
 
 
a1ad15c
 
cd3f827
a1ad15c
 
 
 
6f7a741
a1ad15c

import gradio as gr
from transformers import ByT5Tokenizer
import json

# Labels offered in the ACTIONS dropdown. translate() matches the selected
# label against this list by position, so the order of the entries matters.
ACTIONS = [
    "text2ids",
    "text2tokens",
    "ids2tokens",
    "tokens2ids and JSON requires double quotes",
    "ids2text",
]

# Tokenizers already loaded in this process, keyed by model name, so that
# repeated requests for the same model do not reload it every time.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model):
    """Return the ByT5 tokenizer for *model*, loading it only on first use."""
    tokenizer = _TOKENIZER_CACHE.get(model)
    if tokenizer is None:
        tokenizer = ByT5Tokenizer.from_pretrained(model)
        _TOKENIZER_CACHE[model] = tokenizer
    return tokenizer


def _parse_json(raw):
    """Decode *raw* as JSON, re-raising decode errors with a usage hint."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        # Same exception type as before; only the message is more helpful.
        raise json.JSONDecodeError(
            f'INPUTS must be valid JSON, e.g. [104, 108] or ["a", "b"] ({err.msg})',
            err.doc,
            err.pos,
        ) from err


def translate(model: str, action: str, inputs: str) -> str:
    """Run one tokenizer conversion and format the result as text.

    Args:
        model: Name passed to ``ByT5Tokenizer.from_pretrained``.
        action: One of the labels in ``ACTIONS``. Any other value leaves the
            conversion output as an empty string.
        inputs: Plain text for the two ``text2*`` actions; a JSON document
            (decoded with ``json.loads``) for the remaining actions.

    Returns:
        A string holding the conversion output, followed by its JSON
        encoding, the tokenizer's ``vocab_size`` and ``len(tokenizer)``.

    Raises:
        json.JSONDecodeError: If a JSON-based action receives invalid JSON.
    """
    tokenizer = _get_tokenizer(model)
    vocab_size = tokenizer.vocab_size
    len_tokenizer = len(tokenizer)

    # The actions are mutually exclusive, so at most one branch runs.
    output = ''
    if action == ACTIONS[0]:  # text -> ids
        output = tokenizer(inputs)['input_ids']
    elif action == ACTIONS[1]:  # text -> tokens
        input_ids = tokenizer(inputs)['input_ids']
        output = tokenizer.convert_ids_to_tokens(input_ids)
    elif action == ACTIONS[2]:  # ids -> tokens
        output = tokenizer.convert_ids_to_tokens(_parse_json(inputs))
    elif action == ACTIONS[3]:  # tokens -> ids
        output = tokenizer.convert_tokens_to_ids(_parse_json(inputs))
    elif action == ACTIONS[4]:  # ids -> text
        output = tokenizer.decode(_parse_json(inputs))

    return f'{output}\n\n\n\nother infos:\njson:{json.dumps(output)} \nvocab_size: {vocab_size}\nlen(tokenizer): {len_tokenizer}'

# Input widgets, in the same order as translate()'s parameters:
# model name, action, raw inputs.
_input_components = [
    gr.components.Textbox(
        label="MODEL NAME, eg: google/byt5-small",
        value="google/byt5-small",
    ),
    gr.components.Dropdown(label="ACTIONS", choices=ACTIONS),
    gr.components.Textbox(label="INPUTS"),
]

# Single-function UI: the three inputs are passed to translate() and its
# returned string is shown in one text output.
demo = gr.Interface(
    title="Test T5Tokenizer",
    description="▁Test, ▁To, ken, izer, s, ▁happily, !, </s>",
    fn=translate,
    inputs=_input_components,
    outputs=["text"],
    cache_examples=False,
)

# Start the app as soon as this file is executed.
demo.launch(debug=True)