elapt1c committed
Commit 2307f6e · verified · 1 Parent(s): cd4a9d3

Update HROM_Trainer.py

Files changed (1)
  1. HROM_Trainer.py +18 -15
HROM_Trainer.py CHANGED
@@ -28,30 +28,33 @@ logging.basicConfig(
 
 # Configuration
 CONFIG = {
+    # --- Scaled Parameters ---
     "dim": 768,
-    "n_layers": 8,
-    "n_heads": 8,
-    "ff_dim": 2048,
+    "n_layers": 16,
+    "n_heads": 16,
+    "ff_dim": 3072, # Explicitly set to 4 * dim
+
+    # --- Kept Parameters ---
     "dropout": 0.1,
     "max_seq_len": 512,
-    "batch_size": 16, # Keep batch size reasonable
+    "vocab_size": 32000, # Fixed by tokenizer
+
+    # --- Training/Dataset Parameters ---
+    "batch_size": 12,
     "checkpoint_interval": 2000,
     "debug_interval": 400,
-    # Reverted to training on all four datasets, using correct persona_chat identifier
+    # --- Dialogue datasets (unchanged) ---
     "datasets": ["daily_dialog", "empathetic_dialogues", "blended_skill_talk", "AlekseyKorshuk/persona-chat"],
-    # Reverted to combined tokenizer name
-    "tokenizer_name": "hrom_tokenizer.json",
-    # Reverted to combined checkpoint dir
-    "checkpoint_dir": "checkpoints",
-    "vocab_size": 32000,
-    # Adjusted samples per dataset: with 4 datasets, 50k each gives 200k total samples
-    "tokenizer_train_samples_per_dataset": 50000,
-    "learning_rate": 2e-5,
+    "tokenizer_name": "hrom_tokenizer.json", # Tokenizer file name unchanged
+    "checkpoint_dir": "checkpoints", # Checkpoint directory unchanged
+    # --- Doubled samples per dataset for tokenizer training ---
+    "tokenizer_train_samples_per_dataset": 100000, # Same limit applied to every dataset
+    "learning_rate": 1e-5,
     "warmup_steps": 1000,
-    "max_turns": 8, # Max turns applied per dialogue
+    "max_turns": 8, # Max turns applied per dialogue (unchanged)
     "max_checkpoints": 5,
     "num_epochs": 30,
-    "grad_accum_steps": 8 # Keep grad accum reasonable
+    "grad_accum_steps": 16
 }
 
 # --- Model Definition (HROM, HROMBlock, HROMAttention, SwiGLU, RoPE) ---
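
A quick sanity check of the scaled configuration is sketched below. It copies the new CONFIG values from the diff and assumes a conventional layout for the blocks named in the file (four attention projections per HROMAttention, three weight matrices for SwiGLU, biases/norms/RoPE tables ignored); the helper rough_param_count is hypothetical and not part of HROM_Trainer.py.

# Back-of-envelope check of the scaled config (values copied from the new
# CONFIG above). The per-block layout is an assumption, not taken from
# this commit: 4 attention projections (Q, K, V, output) plus 3 SwiGLU
# matrices (gate, up, down); biases, norms and RoPE buffers are ignored.

cfg = {"dim": 768, "n_layers": 16, "n_heads": 16, "ff_dim": 3072,
       "vocab_size": 32000, "batch_size": 12, "grad_accum_steps": 16}

def rough_param_count(c):
    d, ff = c["dim"], c["ff_dim"]
    attn = 4 * d * d             # Q, K, V and output projections
    swiglu = 3 * d * ff          # gate, up and down projections
    embed = c["vocab_size"] * d  # token embeddings (an untied LM head would add the same again)
    return c["n_layers"] * (attn + swiglu) + embed

effective_batch = cfg["batch_size"] * cfg["grad_accum_steps"]
print(f"~{rough_param_count(cfg) / 1e6:.0f}M parameters (rough)")  # ~176M vs ~81M before
print(f"effective batch: {effective_batch} sequences/step")        # 192 vs 128 before

Under these assumptions the commit roughly doubles the model size (about 81M to 176M parameters), raises the effective batch from 128 to 192 sequences per optimizer step via the smaller per-device batch_size and larger grad_accum_steps, and halves the learning rate.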