beeper-ascii-v1 / training_config.json
AbstractPhil's picture
Crystal-Beeper-Harmony-v5 export @ 2025-08-19 01:35:41
1470196 verified
{
"batch_size": 128,
"grad_accum_steps": 1,
"epochs": 10,
"lr": 0.0003,
"betas": [
0.9,
0.95
],
"weight_decay": 0.1,
"warmup_steps": 500,
"max_steps": null,
"clip_grad": 1.0,
"min_lr": 1e-06,
"label_smoothing": 0.0,
"mixed_precision": "bf16",
"log_dir": "./runs/crystal_beeper",
"log_interval": 50,
"ckpt_dir": "./checkpoints_crystal",
"export_dir": "./export_crystal",
"resume": true,
"resume_strict": false,
"resume_tag": "best_model.safetensors",
"hf_repo": "AbstractPhil/beeper-ascii-v1",
"upload_to_hub": true,
"add_bos_eos": true,
"span_corrupt_frac": 0.0,
"val_ratio": 0.01,
"test_ratio": 0.01,
"max_rows_per_dataset": null,
"dataset_cache_verbose": true,
"lambda_route": 0.2,
"route_topk": 32,
"lambda_geom": 0.3,
"lambda_geom_angle": 0.8,
"lambda_geom_var": 0.3,
"lambda_geom_edge": 0.3,
"lambda_geom_vol": 0.6,
"lambda_geom_minrel": 1.0,
"geom_min_edge_rel": 0.6,
"geom_vol_lower_frac": 0.85,
"geom_sample_classes": 64,
"lambda_rose": 0.1,
"rose_scale": 1.8,
"contrast_warmup": 800,
"pent_temp": 0.1,
"lambda_contrast": 0.25,
"punctuation": {
"enable": true,
"chars": [
".",
",",
";",
":",
"!",
"?",
"'",
"\"",
"(",
")",
"[",
"]",
"{",
"}",
"-",
"\u2014",
"\u2026"
],
"alpha_soft": 0.6,
"hard_mask_gate": false,
"apply_to_coarse_ids": "ALL"
},
"harmony": {
"apply": true,
"system": "You are Crystal-Beeper, a helpful, honest, precise assistant.",
"style": "concise"
},
"stages": [
{
"name": "bootstrap",
"epochs": 1,
"lambda_route": 0.05,
"lambda_geom": 0.2,
"gate_tau": 0.1,
"punct_alpha": 0.6,
"hard_mask_gate": false,
"mix_sdpa": 1.0
},
{
"name": "crystal_warmup",
"epochs": 2,
"lambda_route": 0.2,
"lambda_geom": 0.3,
"gate_tau": 0.08,
"punct_alpha": 0.7,
"hard_mask_gate": false,
"mix_sdpa": 1.0
},
{
"name": "dictionary_crystals",
"epochs": 2,
"lambda_route": 0.25,
"lambda_geom": 0.35,
"gate_tau": 0.06,
"punct_alpha": 0.85,
"hard_mask_gate": false,
"mix_sdpa": 0.9
},
{
"name": "stability_tuning",
"epochs": 3,
"lambda_route": 0.3,
"lambda_geom": 0.4,
"gate_tau": 0.05,
"punct_alpha": 1.0,
"hard_mask_gate": false,
"mix_sdpa": 0.8
}
],
"corpus": [
{
"name": "TinyStories",
"path": "roneneldan/TinyStories",
"split": "train[30%:50%]",
"weight": 0.1,
"dialect": [
0.6,
0.1,
0.05,
0.05,
0.2
]
},
{
"name": "WikipediaEN",
"path": "wikimedia/wikipedia",
"config": "20231101.en",
"split": "train[5%:15%]",
"weight": 0.5,
"dialect": [
0.12,
0.58,
0.1,
0.1,
0.1
]
},
{
"name": "AGNews",
"path": "ag_news",
"split": "train[:]",
"weight": 0.1,
"dialect": [
0.2,
0.5,
0.1,
0.1,
0.1
]
},
{
"name": "GSM8K",
"path": "openai/gsm8k",
"config": "main",
"split": "train[40%:60%]",
"weight": 0.6,
"dialect": [
0.1,
0.15,
0.5,
0.15,
0.1
]
},
{
"name": "AI2-ARC-Easy",
"path": "allenai/ai2_arc",
"config": "ARC-Easy",
"split": "train[30%:60%]",
"weight": 0.6,
"dialect": [
0.05,
0.15,
0.4,
0.25,
0.15
]
},
{
"name": "HH-RLHF",
"path": "Anthropic/hh-rlhf",
"split": "train[5%:10%]",
"weight": 0.5,
"dialect": [
0.1,
0.25,
0.2,
0.25,
0.2
]
},
{
"name": "SVAMP",
"path": "ChilleD/SVAMP",
"split": "train",
"weight": 0.25,
"dialect": [
0.1,
0.15,
0.55,
0.15,
0.05
]
},
{
"name": "MATH-500",
"path": "HuggingFaceH4/MATH-500",
"split": "test",
"weight": 0.25,
"dialect": [
0.05,
0.15,
0.6,
0.15,
0.05
]
},
{
"name": "SEP",
"path": "AiresPucrs/stanford-encyclopedia-philosophy",
"split": "train",
"weight": 0.3,
"dialect": [
0.05,
0.45,
0.18,
0.22,
0.1
]
}
],
"_alive_entries": [
{
"name": "TinyStories",
"path": "roneneldan/TinyStories",
"split": "train[30%:50%]",
"weight": 0.1,
"dialect": [
0.6000000238418579,
0.10000000149011612,
0.05000000074505806,
0.05000000074505806,
0.20000000298023224
],
"class_id": 0,
"p": 0.03125000000000001
},
{
"name": "WikipediaEN",
"path": "wikimedia/wikipedia",
"config": "20231101.en",
"split": "train[5%:15%]",
"weight": 0.5,
"dialect": [
0.11999999731779099,
0.5799999833106995,
0.10000000149011612,
0.10000000149011612,
0.10000000149011612
],
"class_id": 1,
"p": 0.15625
},
{
"name": "AGNews",
"path": "ag_news",
"split": "train[:]",
"weight": 0.1,
"dialect": [
0.20000000298023224,
0.5,
0.10000000149011612,
0.10000000149011612,
0.10000000149011612
],
"class_id": 2,
"p": 0.03125000000000001
},
{
"name": "GSM8K",
"path": "openai/gsm8k",
"config": "main",
"split": "train[40%:60%]",
"weight": 0.6,
"dialect": [
0.10000000149011612,
0.15000000596046448,
0.5,
0.15000000596046448,
0.10000000149011612
],
"class_id": 3,
"p": 0.1875
},
{
"name": "AI2-ARC-Easy",
"path": "allenai/ai2_arc",
"config": "ARC-Easy",
"split": "train[30%:60%]",
"weight": 0.6,
"dialect": [
0.05000000074505806,
0.15000000596046448,
0.4000000059604645,
0.25,
0.15000000596046448
],
"class_id": 4,
"p": 0.1875
},
{
"name": "HH-RLHF",
"path": "Anthropic/hh-rlhf",
"split": "train[5%:10%]",
"weight": 0.5,
"dialect": [
0.10000000149011612,
0.25,
0.20000000298023224,
0.25,
0.20000000298023224
],
"class_id": 5,
"p": 0.15625
},
{
"name": "SVAMP",
"path": "ChilleD/SVAMP",
"split": "train",
"weight": 0.25,
"dialect": [
0.10000000149011612,
0.15000000596046448,
0.550000011920929,
0.15000000596046448,
0.05000000074505806
],
"class_id": 6,
"p": 0.078125
},
{
"name": "MATH-500",
"path": "HuggingFaceH4/MATH-500",
"split": "test",
"weight": 0.25,
"dialect": [
0.05000000074505806,
0.15000000596046448,
0.6000000238418579,
0.15000000596046448,
0.05000000074505806
],
"class_id": 7,
"p": 0.078125
},
{
"name": "SEP",
"path": "AiresPucrs/stanford-encyclopedia-philosophy",
"split": "train",
"weight": 0.3,
"dialect": [
0.05000000074505806,
0.44999998807907104,
0.18000000715255737,
0.2199999988079071,
0.10000000149011612
],
"class_id": 8,
"p": 0.09375
}
]
}