"""Train, load, save, and test a SentencePiece or Hugging Face tokenizer."""

import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, LlamaTokenizer


class TokenizerSetup:
    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use."""
        self.model_path = model_path
        self.model_type = model_type.lower()
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

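        # Guard against unsupported SentencePiece model types and fall back to BPE.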
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with specified settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return

        try:
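            # Reserve IDs 0-3 for the special tokens and name the pieces to match what
            # load_tokenizer() expects; the *_piece flags replace user_defined_symbols,
            # which would clash with the built-in <pad>/<unk> pieces.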
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                f"--pad_piece=<pad> --unk_piece=<unk> --bos_piece=<bos> --eos_piece=<eos>"
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return

                # PreTrainedTokenizerFast cannot wrap a raw SentencePieceProcessor, so use
                # LlamaTokenizer as a generic sentencepiece-backed wrapper that exposes the
                # standard Hugging Face tokenizer API for the trained model.
                self.tokenizer = LlamaTokenizer(
                    vocab_file=sp_model,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<bos>",
                    eos_token="<eos>",
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return

        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:
                # Copy the raw SentencePiece artifacts alongside the Hugging Face files.
                for ext in [".model", ".vocab"]:
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        shutil.copy(src, save_dir)
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output."""
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None

        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            # input_ids is a 2D tensor when return_tensors="pt", otherwise a flat list of IDs.
            ids = tokens["input_ids"]
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None

if __name__ == "__main__":
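    # Configure a fresh BPE tokenizer; pass hf_model to reuse a pretrained tokenizer instead.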
    tokenizer_setup = TokenizerSetup(
        model_path="tokenizer",
        model_type="bpe",
        vocab_size=32000,
        hf_model=None
    )

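    # Train a SentencePiece model only if one does not already exist.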
    input_file = "../datasets/eclipse_corpuz_1.1.txt"
    if not os.path.exists(f"{tokenizer_setup.model_path}.model"):
        tokenizer_setup.train_sentencepiece(input_file)

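    # Load the trained model (or the configured hf_model) behind the Hugging Face tokenizer API.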
    tokenizer_setup.load_tokenizer()

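    # Persist the tokenizer files alongside the fine-tuned model.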
    tokenizer_setup.save_tokenizer("../finetuned_charm15/")

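    # Smoke test: encode a sample sentence and decode it back.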
    sample_text = "Charm 15 is an AI model optimized for deep learning and security."
    tokenizer_setup.tokenize_text(sample_text)