HeshamHaroon committed
Commit c8d4230 · verified · 1 Parent(s): 81d456a

Update app.py

Files changed (1)
  1. app.py +27 -43
app.py CHANGED
@@ -1,79 +1,63 @@
 from gradio import Interface
 import gradio as gr
 import aranizer
-from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
 from transformers import AutoTokenizer
 import codecs
 
-# Load additional tokenizers from transformers
+# Loading tokenizer instances from Transformers.
 gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
 gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
 jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
 
-# List of available tokenizers and a dictionary to load them
-tokenizer_options = [
-    "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
-    "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
-    "FreedomIntelligence/AceGPT-13B",  # Previously added GPT tokenizer
-    "FreedomIntelligence/AceGPT-7B",  # Another previously added GPT tokenizer
-    "inception-mbzuai/jais-13b"  # Adding the new tokenizer to the options
-]
-
+# Assuming the existence of a get_tokenizer() method for aranizer models in your setup.
 tokenizers = {
-    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
-    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
-    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
-    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
-    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
-    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
-    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
+    "aranizer_bpe50k": lambda: aranizer.aranizer_bpe50k.get_tokenizer(),
+    "aranizer_bpe64k": lambda: aranizer.aranizer_bpe64k.get_tokenizer(),
+    "aranizer_bpe86k": lambda: aranizer.aranizer_bpe86k.get_tokenizer(),
+    "aranizer_sp32k": lambda: aranizer.aranizer_sp32k.get_tokenizer(),
+    "aranizer_sp50k": lambda: aranizer.aranizer_sp50k.get_tokenizer(),
+    "aranizer_sp64k": lambda: aranizer.aranizer_sp64k.get_tokenizer(),
+    "aranizer_sp86k": lambda: aranizer.aranizer_sp86k.get_tokenizer(),
     "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
     "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
-    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer  # Adding the new Jais tokenizer
+    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
 }
 
+# Define tokenizer options for the dropdown menu.
+tokenizer_options = list(tokenizers.keys())
+
 def compare_tokenizers(tokenizer_name, text):
-    # Handle the transformer tokenizers separately due to API differences
-    if tokenizer_name in ["FreedomIntelligence/AceGPT-13B", "FreedomIntelligence/AceGPT-7B", "inception-mbzuai/jais-13b"]:
-        tokenizer = tokenizers[tokenizer_name]()
-        tokens = tokenizer.tokenize(text)
-        encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
-        decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
-        # Decoding using codecs to force UTF-8 and handle potential encoding issues
-        decoded_text_utf8 = codecs.decode(decoded_text.encode('utf-8'), 'utf-8', errors='ignore')
-    else:
-        # AraNizer tokenizers
-        tokenizer = tokenizers[tokenizer_name]()
-        tokens = tokenizer.tokenize(text)
-        encoded_output = tokenizer.encode(text, add_special_tokens=True)
-        decoded_text = tokenizer.decode(encoded_output)
-        # Same codecs handling for AraNizer tokenizers
-        decoded_text_utf8 = codecs.decode(decoded_text.encode('utf-8'), 'utf-8', errors='ignore')
-
-    # Prepare the results to be displayed, using UTF-8 decoded text
-    results = [(tokenizer_name, tokens, encoded_output, decoded_text_utf8)]
+    # Normalize the input text to UTF-8
+    text = codecs.decode(text.encode('utf-8'), 'utf-8')
+
+    tokenizer = tokenizers[tokenizer_name]()
+    tokens = tokenizer.tokenize(text)
+    encoded_output = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt")
+    decoded_text = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
+
+    # Ensure each token is decoded to a valid UTF-8 string
+    tokens_utf8 = [codecs.decode(token.encode('utf-8'), 'utf-8', errors='ignore') for token in tokens]
+
+    # Prepare and return the results in UTF-8
+    results = [(tokenizer_name, tokens_utf8, encoded_output.tolist(), decoded_text)]
     return results
 
-# Define the Gradio interface components with a dropdown for model selection
 inputs_component = [
     gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer"),
-    gr.Textbox(lines=2, placeholder="Enter text here...", label="Input Text")
+    gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text", value="مثال بالعربية")
 ]
 
 outputs_component = gr.Dataframe(
     headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
     label="Results",
-    type="pandas"
 )
 
-# Setting up the interface
 iface = Interface(
     fn=compare_tokenizers,
     inputs=inputs_component,
     outputs=outputs_component,
     title="Tokenizer Comparison",
-    live=True
+    live=True,
 )
 
-# Launching the Gradio app
 iface.launch()
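
For a quick sanity check of the unified code path, compare_tokenizers can be exercised directly before iface.launch(). A minimal sketch, assuming the aranizer package and the Hugging Face checkpoints above are available locally; the sample string and printed fields are illustrative only:

    # Hypothetical smoke test (not part of the commit): run two tokenizers
    # through the same unified path and print a summary of each result row.
    sample = "مثال بالعربية"
    for name in ("aranizer_bpe50k", "inception-mbzuai/jais-13b"):
        tokenizer_name, tokens, ids, decoded = compare_tokenizers(name, sample)[0]
        print(tokenizer_name, tokens[:10], decoded)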