from transformers import AutoTokenizer
import logging
import os

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


def get_tokenizer(model_id):
    """Load a tokenizer from the Hugging Face Hub, authenticating via HF_TOKEN if set."""
    logging.debug(f"Loading tokenizer: {model_id}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, token=os.getenv("HF_TOKEN"))
        logging.debug(f"Tokenizer loaded: {tokenizer}")
        return tokenizer
    except Exception as e:
        logging.error(f"Error loading tokenizer {model_id}: {e}")
        raise


def get_tokenization(tokenizer, text):
    """Encode text and return the corresponding string tokens."""
    logging.debug(f"Tokenizing text: {text}")
    ids = tokenizer.encode(text)
    string_tokens = tokenizer.convert_ids_to_tokens(ids)
    logging.debug(f"Tokens: {string_tokens}")
    return string_tokens


def get_vocab_size(tokenizer):
    """Return the size of the tokenizer's vocabulary."""
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    vocab_size = len(tokenizer.get_vocab())
    logging.debug(f"Vocabulary size: {vocab_size}")
    return vocab_size


def check_latin_support(tokenizer):
    """Check whether the tokenizer handles Latin text without producing unknown tokens."""
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)
        # If the tokenizer can tokenize the Latin text without returning unknown
        # tokens, we consider it as supporting Latin.
        if all(token != tokenizer.unk_token for token in tokens):
            logging.debug("Latin support: ✅")
            return "✅"
        else:
            logging.debug("Latin support: ❌")
            return "❌"
    except Exception as e:
        logging.error(f"Error checking latin support: {e}")
        return "❌"