import gradio as gr
import aranizer
from transformers import AutoTokenizer

# Load the Hugging Face tokenizers once at startup so requests can reuse them.
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")

# Map UI names to tokenizer factories. The aranizer package is assumed to expose
# each model through a get_tokenizer() helper.
tokenizers = {
    "aranizer_bpe50k": lambda: aranizer.aranizer_bpe50k.get_tokenizer(),
    "aranizer_bpe64k": lambda: aranizer.aranizer_bpe64k.get_tokenizer(),
    "aranizer_bpe86k": lambda: aranizer.aranizer_bpe86k.get_tokenizer(),
    "aranizer_sp32k": lambda: aranizer.aranizer_sp32k.get_tokenizer(),
    "aranizer_sp50k": lambda: aranizer.aranizer_sp50k.get_tokenizer(),
    "aranizer_sp64k": lambda: aranizer.aranizer_sp64k.get_tokenizer(),
    "aranizer_sp86k": lambda: aranizer.aranizer_sp86k.get_tokenizer(),
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
}
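
# Each entry is a zero-argument factory, so an aranizer tokenizer is only built when
# its option is actually selected in the UI, while the pre-loaded Hugging Face
# tokenizers above are simply returned.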

# Define tokenizer options for dropdown menu.
tokenizer_options = list(tokenizers.keys())

def compare_tokenizers(tokenizer_name, text):
    # Input text arrives as a Unicode str, so it can be tokenized directly.
    
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
    decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
    
    # Tokens are returned as Unicode strings and can be reported as-is; flatten the
    # (1, seq_len) tensor of ids into a plain list for display.
    token_ids = encoded_output[0].tolist()

    results = [(tokenizer_name, tokens, token_ids, decoded_text)]
    return results
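
# Quick sanity check (hypothetical example; assumes the models above were downloaded
# successfully): calling the function directly returns a single row, e.g.
#   compare_tokenizers("inception-mbzuai/jais-13b", "مرحبا بالعالم")
#   -> [("inception-mbzuai/jais-13b", [...tokens...], [...token ids...], "مرحبا بالعالم")]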

inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
    gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text", value="مثال بالعربية")
]

outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"], 
    label="Results",
)

iface = gr.Interface(
    fn=compare_tokenizers, 
    inputs=inputs_component, 
    outputs=outputs_component, 
    title="Tokenizer Comparison",
    live=True,
)
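
# launch() starts a local Gradio server; passing share=True (optional) creates a
# temporary public link for quick sharing.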

iface.launch()