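"""Tokenizer setup for Charm 15.

Trains a SentencePiece tokenizer (BPE by default) on a text corpus, or loads a
pretrained Hugging Face tokenizer, then saves the result next to the fine-tuned
model and runs a quick round-trip tokenization test.
"""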
import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, LlamaTokenizer

class TokenizerSetup:
    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use."""
        self.model_path = model_path
        self.model_type = model_type.lower()  # Normalize: bpe, unigram, char, word
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None
        
        # Validate model_type
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with specified settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return
        
        try:
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                f"--user_defined_symbols=<pad>,<unk>,<bos>,<eos>"  # Explicit special tokens
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return
                
                # LlamaTokenizer wraps a raw SentencePiece .model file via vocab_file;
                # PreTrainedTokenizerFast cannot take a SentencePieceProcessor as its
                # tokenizer_object (that argument expects a tokenizers.Tokenizer).
                self.tokenizer = LlamaTokenizer(
                    vocab_file=sp_model,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<bos>",
                    eos_token="<eos>"
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return
        
        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:  # Copy SentencePiece files
                for ext in [".model", ".vocab"]:
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        os.system(f"cp {src} {save_dir}")
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output."""
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None
        
        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            ids = tokens["input_ids"] if return_tensors else tokens
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None

if __name__ == "__main__":
    # Setup with Charm 15 context
    tokenizer_setup = TokenizerSetup(
        model_path="tokenizer",
        model_type="bpe",           # Matches your earlier BPE config
        vocab_size=32000,           # Matches Mistral/Charm 15
        hf_model=None               # Custom training; set to "mistralai/Mixtral-8x7B-Instruct-v0.1" for pretrained
    )

    # Train on Eclipse Corpuz (or other corpus)
    input_file = "../datasets/eclipse_corpuz_1.1.txt"  # Adjust to your dataset
    if not os.path.exists(f"{tokenizer_setup.model_path}.model"):
        tokenizer_setup.train_sentencepiece(input_file)

    # Load tokenizer
    tokenizer_setup.load_tokenizer()

    # Save for Charm 15 use
    tokenizer_setup.save_tokenizer("../finetuned_charm15/")  # Match your training dir

    # Test with sample
    sample_text = "Charm 15 is an AI model optimized for deep learning and security."
    tokenizer_setup.tokenize_text(sample_text)
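
    # Optional round-trip check (a minimal sketch): reload the saved tokenizer the
    # way downstream training code would. Assumes save_tokenizer() above succeeded
    # and "../finetuned_charm15/" now contains the tokenizer files.
    save_dir = "../finetuned_charm15/"
    if os.path.exists(os.path.join(save_dir, "tokenizer_config.json")):
        reloaded = AutoTokenizer.from_pretrained(save_dir)
        print(f"🔹 Reloaded tokenizer vocab size: {reloaded.vocab_size}")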