GeminiFan207 committed
Commit c59a5cf · verified · 1 Parent(s): e75fa15

Create tokenizer_setup.py

Files changed (1)
  1. tokenizer_setup.py +120 -0
tokenizer_setup.py ADDED
@@ -0,0 +1,120 @@
import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, LlamaTokenizer


class TokenizerSetup:
    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use."""
        self.model_path = model_path
        self.model_type = model_type.lower()  # Normalize: bpe, unigram, char, word
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

        # Validate model_type
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with the configured settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return

        try:
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                # Name the built-in control pieces explicitly instead of re-adding them
                # via --user_defined_symbols, which would collide with the default pieces.
                f"--pad_piece=<pad> --unk_piece=<unk> --bos_piece=<bos> --eos_piece=<eos>"
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return

                # PreTrainedTokenizerFast cannot wrap a SentencePieceProcessor directly,
                # so load the trained .model file through LlamaTokenizer instead.
                self.tokenizer = LlamaTokenizer(
                    vocab_file=sp_model,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<bos>",
                    eos_token="<eos>",
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return

        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:  # Also copy the raw SentencePiece files
                for ext in [".model", ".vocab"]:
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        shutil.copy(src, save_dir)  # Portable replacement for os.system("cp ...")
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output."""
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None

        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            ids = tokens["input_ids"]
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None


if __name__ == "__main__":
    # Setup with Charm 15 context
    tokenizer_setup = TokenizerSetup(
        model_path="tokenizer",
        model_type="bpe",    # Matches the earlier BPE config
        vocab_size=32000,    # Matches Mistral/Charm 15
        hf_model=None        # Custom training; set to "mistralai/Mixtral-8x7B-Instruct-v0.1" for pretrained
    )

    # Train on Eclipse Corpuz (or another corpus)
    input_file = "../datasets/eclipse_corpuz_1.1.txt"  # Adjust to your dataset
    if not os.path.exists(f"{tokenizer_setup.model_path}.model"):
        tokenizer_setup.train_sentencepiece(input_file)

    # Load tokenizer
    tokenizer_setup.load_tokenizer()

    # Save for Charm 15 use
    tokenizer_setup.save_tokenizer("../finetuned_charm15/")  # Match the training dir

    # Test with a sample
    sample_text = "Charm 15 is an AI model optimized for deep learning and security."
    tokenizer_setup.tokenize_text(sample_text)
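
For downstream use, the directory written by save_tokenizer can be reloaded with AutoTokenizer. A minimal sketch, assuming the ../finetuned_charm15/ output path used in the __main__ block above:

from transformers import AutoTokenizer

# Reload the tokenizer saved by TokenizerSetup.save_tokenizer(); the path below is
# taken from the script above and should be adjusted to wherever the files were written.
tokenizer = AutoTokenizer.from_pretrained("../finetuned_charm15/")
print(tokenizer("Charm 15 test sentence.")["input_ids"])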