Walid-Ahmed committed on
Commit
82fa7ce
·
verified ·
1 Parent(s): 348dd8a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ from transformers import AutoTokenizer
4
+
5
# Human-readable dropdown labels mapped to Hugging Face Hub model IDs for
# the tokenizers offered in the UI (one English, two Arabic).
tokenizers = {
    label: model_id
    for label, model_id in [
        ("English - BERT (bert-base-uncased)", "bert-base-uncased"),
        ("Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)", "CAMeL-Lab/bert-base-arabic-camelbert-ca"),
        ("Arabic - AraBERT (asafaya/bert-base-arabic)", "asafaya/bert-base-arabic"),
    ]
}
11
+
12
# Cache of already-loaded tokenizers, keyed by Hub model ID, so repeated
# requests for the same model do not re-instantiate (and potentially
# re-download) it on every UI interaction.
_tokenizer_cache = {}


def tokenize_text(text, model_name):
    """Tokenize *text* with the tokenizer selected in the UI.

    Parameters
    ----------
    text : str
        The sentence to tokenize.
    model_name : str
        One of the keys of the module-level ``tokenizers`` mapping.

    Returns
    -------
    tuple[str, str]
        Two display strings: the token list and the token count.

    Raises
    ------
    KeyError
        If ``model_name`` is not a known tokenizer label.
    """
    model_id = tokenizers[model_name]

    tokenizer = _tokenizer_cache.get(model_id)
    if tokenizer is None:
        # First use of this model: load once, then reuse for later calls.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        _tokenizer_cache[model_id] = tokenizer

    # Tokenize the input text and report tokens plus their count.
    tokens = tokenizer.tokenize(text)
    return f"Tokens: {tokens}", f"Number of tokens: {len(tokens)}"
22
+
23
# --- Gradio UI -------------------------------------------------------------

# Input components: a dropdown of tokenizer labels and a free-text sentence.
model_choice = gr.Dropdown(
    choices=list(tokenizers.keys()),
    label="Select Tokenizer",
    value="English - BERT (bert-base-uncased)",
)
text_input = gr.Textbox(label="Enter a sentence to tokenize")

# Predefined example rows. Each row must supply a value for EVERY input
# component (text first, then model choice) — the interface has two inputs,
# so single-element rows would be rejected by Gradio's examples validation.
examples = [
    ["The quick brown fox jumps over the lazy dog.",
     "English - BERT (bert-base-uncased)"],                    # English sentence
    ["القمر جميل في السماء.",
     "Arabic - CAMeL BERT (bert-base-arabic-camelbert-ca)"],   # Arabic sentence
]

# Set up the Gradio interface. Note: ``allow_flagging`` expects one of the
# strings "never"/"auto"/"manual", not a boolean — recent Gradio releases
# reject a bool here.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[text_input, model_choice],
    outputs=[gr.Textbox(label="Tokens"), gr.Textbox(label="Number of Tokens")],
    title="Hugging Face Tokenizer Explorer",
    description="Enter a sentence or use one of the example sentences below to see how different tokenizers work.",
    examples=examples,
    allow_flagging="never",
)

# Launch when executed as a script (e.g. ``python app.py``); guarded so that
# importing this module does not start a server.
if __name__ == "__main__":
    demo.launch()