import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, LlamaTokenizer


class TokenizerSetup:
    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use."""
        self.model_path = model_path
        self.model_type = model_type.lower()  # Normalize: bpe, unigram, char, word
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

        # Validate model_type
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with specified settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return
        try:
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                f"--pad_piece=<pad> --unk_piece=<unk> --bos_piece=<s> --eos_piece=</s>"  # Explicit special tokens
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return
                # PreTrainedTokenizerFast cannot wrap a raw SentencePieceProcessor,
                # so wrap the trained .model file with a SentencePiece-backed slow tokenizer.
                self.tokenizer = LlamaTokenizer(
                    vocab_file=sp_model,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<s>",
                    eos_token="</s>",
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return
        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:
                # Copy SentencePiece files alongside the Hugging Face files
                for ext in [".model", ".vocab"]:
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        shutil.copy(src, save_dir)
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output."""
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None
        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            ids = tokens["input_ids"]
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None


if __name__ == "__main__":
    # Setup with Charm 15 context
    tokenizer_setup = TokenizerSetup(
        model_path="tokenizer",
        model_type="bpe",  # Matches your earlier BPE config
        vocab_size=32000,  # Matches Mistral/Charm 15
        hf_model=None,  # Custom training; set to "mistralai/Mixtral-8x7B-Instruct-v0.1" for pretrained
    )

    # Train on Eclipse Corpuz (or other corpus)
    input_file = "../datasets/eclipse_corpuz_1.1.txt"  # Adjust to your dataset
    if not os.path.exists(f"{tokenizer_setup.model_path}.model"):
        tokenizer_setup.train_sentencepiece(input_file)

    # Load tokenizer
    tokenizer_setup.load_tokenizer()

    # Save for Charm 15 use
    tokenizer_setup.save_tokenizer("../finetuned_charm15/")  # Match your training dir

    # Test with sample
    sample_text = "Charm 15 is an AI model optimized for deep learning and security."
    tokenizer_setup.tokenize_text(sample_text)