import gradio as gr
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
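# Each imported aranizer_* module exposes a get_tokenizer() factory; those
# factories are registered in the tokenizers dict below.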
from transformers import AutoTokenizer
# Load additional tokenizers from transformers
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
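# Note: AutoTokenizer.from_pretrained downloads and caches each tokenizer from
# the Hugging Face Hub on first use, so a cold start of the app may be slow.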
# List of available tokenizers and a dictionary to load them
tokenizer_options = [
    "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
    "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    "inception-mbzuai/jais-13b"
]
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer
}
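# Wrapping the preloaded transformers tokenizers in lambdas keeps every dict
# value a zero-argument callable, so any entry can be resolved uniformly with
# tokenizers[name]().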
def compare_tokenizers(tokenizer_name, text):
    # Handle the transformers tokenizers separately due to small API differences
    if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
        tokenizer = tokenizers[tokenizer_name]()
        tokens = tokenizer.tokenize(text)
        # Encode to a plain list of token ids (rather than a tensor) so the
        # values display cleanly in the results Dataframe
        encoded_output = tokenizer.encode(text, add_special_tokens=True)
        decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
    else:
        # AraNizer tokenizers
        tokenizer = tokenizers[tokenizer_name]()
        tokens = tokenizer.tokenize(text)
        encoded_output = tokenizer.encode(text, add_special_tokens=True)
        decoded_text = tokenizer.decode(encoded_output)
    # Prepare a single result row to be displayed
    results = [(tokenizer_name, tokens, encoded_output, decoded_text)]
    return results
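# Illustrative call (token and id values depend on the chosen vocabulary):
#   compare_tokenizers("aranizer_bpe50k", "مرحبا بالعالم")
# returns one row of (tokenizer name, tokens, token ids, round-tripped text).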
# Define the Gradio interface components with a dropdown for model selection
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
    gr.Textbox(lines=2, placeholder="اكتب النص هنا...", label="Input Text")  # placeholder: "Type the text here..."
]
outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
    label="Results",
    type="pandas"
)
# Set up the interface; live=True re-runs the comparison on every input change
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Arabic Tokenizer Arena",
    live=True
)
# Launching the Gradio app
iface.launch()
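# When running outside Hugging Face Spaces, iface.launch(share=True) would
# additionally expose a temporary public URL for the app.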