Update HROM_Trainer.py
HROM_Trainer.py CHANGED (+18 -15)
@@ -28,30 +28,33 @@ logging.basicConfig(
 
 # Configuration
 CONFIG = {
+    # --- Scaled Parameters ---
     "dim": 768,
-    "n_layers": …,
-    "n_heads": …,
-    "ff_dim": …,
+    "n_layers": 16,
+    "n_heads": 16,
+    "ff_dim": 3072, # Explicitly set to 4 * dim
+
+    # --- Kept Parameters ---
     "dropout": 0.1,
     "max_seq_len": 512,
-    "…
+    "vocab_size": 32000, # Fixed by tokenizer
+
+    # --- Training/Dataset Parameters ---
+    "batch_size": 12,
     "checkpoint_interval": 2000,
     "debug_interval": 400,
-    # …
+    # --- ADDED CoQA and QuAC ---
     "datasets": ["daily_dialog", "empathetic_dialogues", "blended_skill_talk", "AlekseyKorshuk/persona-chat"],
-    # …
-    "…
-    # …
-    "…
-    "…
-    # Adjusted samples per dataset: with 4 datasets, 50k each gives 200k total samples
-    "tokenizer_train_samples_per_dataset": 50000,
-    "learning_rate": 2e-5,
+    "tokenizer_name": "hrom_tokenizer.json", # New name for expanded tokenizer
+    "checkpoint_dir": "checkpoints", # Separate directory for expanded data model
+    # --- Increased samples per dataset slightly for tokenizer ---
+    "tokenizer_train_samples_per_dataset": 100000, # Use same limit for all, incl. new ones
+    "learning_rate": 1e-5,
     "warmup_steps": 1000,
-    "max_turns": 8, # …
+    "max_turns": 8, # Keep max_turns limit for Q&A datasets too
     "max_checkpoints": 5,
     "num_epochs": 30,
-    "grad_accum_steps": …
+    "grad_accum_steps": 16
 }
 
 # --- Model Definition (HROM, HROMBlock, HROMAttention, SwiGLU, RoPE) ---
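For quick reference, the scaled values stay internally consistent. The short sketch below is illustrative only and is not part of the commit; it simply recomputes the relationships implied by the updated CONFIG, assuming the values shown in the diff above.

# Standalone sanity check for the updated CONFIG (illustrative sketch, not part of HROM_Trainer.py).
CONFIG = {
    "dim": 768,
    "n_layers": 16,
    "n_heads": 16,
    "ff_dim": 3072,
    "max_seq_len": 512,
    "vocab_size": 32000,
    "batch_size": 12,
    "grad_accum_steps": 16,
}

# The model dimension must split evenly across attention heads: 768 / 16 = 48 per head.
assert CONFIG["dim"] % CONFIG["n_heads"] == 0

# The feed-forward width follows the conventional 4x expansion: 3072 = 4 * 768.
assert CONFIG["ff_dim"] == 4 * CONFIG["dim"]

# Effective batch size seen by the optimizer per update step: 12 * 16 = 192 sequences.
effective_batch = CONFIG["batch_size"] * CONFIG["grad_accum_steps"]
print(f"Effective batch size: {effective_batch}")

The larger gradient-accumulation value compensates for the small per-device batch size, so the lowered learning rate (1e-5) is applied to updates computed from 192 sequences at a time.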