# Charm_15/tokenizer_setup.py
import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, LlamaTokenizer

class TokenizerSetup:
    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use."""
        self.model_path = model_path
        self.model_type = model_type.lower()  # Normalize: bpe, unigram, char, word
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

        # Validate model_type, falling back to BPE on bad input
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with the configured settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return
        try:
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                # Name the special pieces explicitly; listing them again under
                # --user_defined_symbols would fail, since reserved pieces like
                # <unk> cannot be redefined as user symbols.
                f"--pad_piece=<pad> --unk_piece=<unk> --bos_piece=<bos> --eos_piece=<eos>"
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return
                # PreTrainedTokenizerFast cannot wrap a raw SentencePieceProcessor,
                # so wrap the .model file in a slow LlamaTokenizer, which reads
                # SentencePiece files directly via vocab_file.
                self.tokenizer = LlamaTokenizer(
                    vocab_file=sp_model,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<bos>",
                    eos_token="<eos>"
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return
        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:  # Copy the raw SentencePiece files alongside
                for ext in [".model", ".vocab"]:
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        shutil.copy(src, save_dir)  # Portable, unlike os.system("cp ...")
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output."""
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None
        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            # input_ids is a 2D tensor with return_tensors="pt", else a flat list of ints
            ids = tokens["input_ids"]
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None
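

# Hypothetical helper (not part of the original class): prints basic stats for a
# trained SentencePiece model so the vocab can be eyeballed before use. It relies
# only on the standard SentencePieceProcessor API.
def preview_sentencepiece_vocab(model_path="tokenizer", n=10):
    """Print vocab size and the first n pieces of a trained .model file."""
    sp_model = f"{model_path}.model"
    if not os.path.exists(sp_model):
        print(f"⚠️ {sp_model} not found! Train it first.")
        return
    sp = spm.SentencePieceProcessor(model_file=sp_model)
    print(f"🔹 Vocab size: {sp.get_piece_size()}")
    for i in range(min(n, sp.get_piece_size())):
        print(f"  {i}: {sp.id_to_piece(i)}")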


if __name__ == "__main__":
    # Setup with Charm 15 context
    tokenizer_setup = TokenizerSetup(
        model_path="tokenizer",
        model_type="bpe",    # Matches the earlier BPE config
        vocab_size=32000,    # Matches Mistral/Charm 15
        hf_model=None        # Custom training; set to "mistralai/Mixtral-8x7B-Instruct-v0.1" for pretrained
    )

    # Train on Eclipse Corpuz (or another corpus)
    input_file = "../datasets/eclipse_corpuz_1.1.txt"  # Adjust to your dataset
    if not os.path.exists(f"{tokenizer_setup.model_path}.model"):
        tokenizer_setup.train_sentencepiece(input_file)

    # Load tokenizer
    tokenizer_setup.load_tokenizer()

    # Save for Charm 15 use
    tokenizer_setup.save_tokenizer("../finetuned_charm15/")  # Match the training dir

    # Test with a sample
    sample_text = "Charm 15 is an AI model optimized for deep learning and security."
    tokenizer_setup.tokenize_text(sample_text)
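
    # Optional: inspect the trained vocab with the helper defined above.
    #   preview_sentencepiece_vocab(tokenizer_setup.model_path)

    # Hedged sketch of the pretrained path mentioned in the constructor comment:
    # the same class can wrap a Hub tokenizer instead of training one. Commented
    # out because it downloads from the Hub; the model id is the one noted above.
    #
    #   hf_setup = TokenizerSetup(hf_model="mistralai/Mixtral-8x7B-Instruct-v0.1")
    #   hf_setup.load_tokenizer()
    #   hf_setup.tokenize_text(sample_text)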