import gradio as gr
import aranizer
from transformers import AutoTokenizer
import codecs
# Load the Hugging Face tokenizers via Transformers once at startup so they can be reused.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
# The aranizer package is assumed to expose a get_tokenizer() method for each model variant.
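# Each value is a zero-argument callable: the Transformers tokenizers above are reused,
# while the Aranizer tokenizers are constructed on demand when selected.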
tokenizers = {
    "aranizer_bpe50k": lambda: aranizer.aranizer_bpe50k.get_tokenizer(),
    "aranizer_bpe64k": lambda: aranizer.aranizer_bpe64k.get_tokenizer(),
    "aranizer_bpe86k": lambda: aranizer.aranizer_bpe86k.get_tokenizer(),
    "aranizer_sp32k": lambda: aranizer.aranizer_sp32k.get_tokenizer(),
    "aranizer_sp50k": lambda: aranizer.aranizer_sp50k.get_tokenizer(),
    "aranizer_sp64k": lambda: aranizer.aranizer_sp64k.get_tokenizer(),
    "aranizer_sp86k": lambda: aranizer.aranizer_sp86k.get_tokenizer(),
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
}
# Tokenizer options for the dropdown menu.
tokenizer_options = list(tokenizers.keys())
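# Tokenize the input with the selected tokenizer and return a single row:
# (tokenizer name, tokens, encoded IDs, round-tripped decoded text).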
def compare_tokenizers(tokenizer_name, text):
    # Round-trip the input through UTF-8 to confirm it is valid Unicode.
    text = codecs.decode(text.encode('utf-8'), 'utf-8')
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
    decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
    # Normalize each token back through UTF-8, ignoring characters that cannot be decoded.
    tokens_utf8 = [codecs.decode(token.encode('utf-8'), 'utf-8', errors='ignore') for token in tokens]
    # Return one row matching the Dataframe headers.
    results = [(tokenizer_name, tokens_utf8, encoded_output.tolist(), decoded_text)]
    return results
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
    gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text", value="مثال بالعربية"),
]
outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
    label="Results",
)
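# The Dataframe columns mirror the tuple returned by compare_tokenizers.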
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Tokenizer Comparison",
    live=True,
)
iface.launch()