Update app.py
app.py CHANGED
@@ -10,6 +10,7 @@ import os
 HF_TOKEN = os.getenv('HF_TOKEN')
 
 if HF_TOKEN:
+    HF_TOKEN = HF_TOKEN.strip()  # Remove any leading or trailing whitespace/newlines
     login(token=HF_TOKEN)
 
 # Load additional tokenizers from transformers
@@ -63,29 +64,16 @@ if meta_llama_tokenizer:
     tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer
 
 def compare_tokenizers(tokenizer_name, text):
-    if tokenizer_name in [
-        ...
-    ]:
-        tokenizer = tokenizers[tokenizer_name]()
-        tokens = tokenizer.tokenize(text)
-        encoded_output = tokenizer.encode(text, add_special_tokens=True)
-        decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
-    else:
-        # AraNizer tokenizers
-        tokenizer = tokenizers[tokenizer_name]()
-        tokens = tokenizer.tokenize(text)
-        encoded_output = tokenizer.encode(text, add_special_tokens=True)
-        decoded_text = tokenizer.decode(encoded_output)
-
+    tokenizer = tokenizers[tokenizer_name]()
+    tokens = tokenizer.tokenize(text)
+    encoded_output = tokenizer.encode(text, add_special_tokens=True)
+    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)
+
     # Prepare the results to be displayed in HTML format
-    tokens_arabic = [token.encode('utf-8').decode('utf-8') if isinstance(token, bytes) else token for token in tokens]
     results_html = f"""
     <div>
         <h3>Tokenizer: {tokenizer_name}</h3>
-        <p><strong>Tokens:</strong> {tokens_arabic}</p>
+        <p><strong>Tokens:</strong> {tokens}</p>
         <p><strong>Encoded:</strong> {encoded_output}</p>
         <p><strong>Decoded:</strong> {decoded_text}</p>
     </div>
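
The first hunk guards against a common Spaces pitfall: a token pasted into the Space's secret settings can pick up a trailing newline or stray whitespace, which can break authentication even though the value looks correct. A minimal sketch of the fixed flow, assuming huggingface_hub's login; 'hf_xxx' is a placeholder, not a real token:

import os
from huggingface_hub import login

# Simulate a secret that picked up a stray newline (hf_xxx is a placeholder).
os.environ['HF_TOKEN'] = 'hf_xxx\n'

HF_TOKEN = os.getenv('HF_TOKEN')
if HF_TOKEN:
    HF_TOKEN = HF_TOKEN.strip()  # Remove any leading or trailing whitespace/newlines
    assert HF_TOKEN == 'hf_xxx'  # padded value is now normalized
    # login(token=HF_TOKEN)  # uncomment with a real token; requires network access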
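For context, this is roughly what compare_tokenizers reduces to after the second hunk: a single tokenize/encode/decode path for every entry in the tokenizers dict, with skip_special_tokens=True now applied uniformly (including the AraNizer tokenizers that previously decoded without it). A standalone sketch, assuming transformers' AutoTokenizer, a placeholder model name ('bert-base-uncased'), and that the function returns results_html; the function's tail falls outside the hunk, so the return is an assumption:

from transformers import AutoTokenizer

# Placeholder registry; the real app also registers AraNizer and Llama tokenizers.
tokenizers = {
    'bert-base-uncased': lambda: AutoTokenizer.from_pretrained('bert-base-uncased'),
}

def compare_tokenizers(tokenizer_name, text):
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)

    # Prepare the results to be displayed in HTML format
    results_html = f"""
    <div>
        <h3>Tokenizer: {tokenizer_name}</h3>
        <p><strong>Tokens:</strong> {tokens}</p>
        <p><strong>Encoded:</strong> {encoded_output}</p>
        <p><strong>Decoded:</strong> {decoded_text}</p>
    </div>
    """
    return results_html  # assumed; the diff does not show the function's tail

print(compare_tokenizers('bert-base-uncased', 'Hello, tokenizers!'))

Decoding with skip_special_tokens=True strips markers such as [CLS] and [SEP] from the round-tripped text, which is what lets the single code path produce a clean Decoded field for every tokenizer in the comparison display.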