| { | |
| "batch_size": 128, | |
| "grad_accum_steps": 1, | |
| "epochs": 10, | |
| "lr": 0.0003, | |
| "betas": [ | |
| 0.9, | |
| 0.95 | |
| ], | |
| "weight_decay": 0.1, | |
| "warmup_steps": 500, | |
| "max_steps": null, | |
| "clip_grad": 1.0, | |
| "min_lr": 1e-06, | |
| "label_smoothing": 0.0, | |
| "mixed_precision": "bf16", | |
| "log_dir": "./runs/crystal_beeper", | |
| "log_interval": 50, | |
| "ckpt_dir": "./checkpoints_crystal", | |
| "export_dir": "./export_crystal", | |
| "resume": true, | |
| "resume_strict": false, | |
| "resume_tag": "best_model.safetensors", | |
| "hf_repo": "AbstractPhil/beeper-ascii-v1", | |
| "upload_to_hub": true, | |
| "add_bos_eos": true, | |
| "span_corrupt_frac": 0.0, | |
| "val_ratio": 0.01, | |
| "test_ratio": 0.01, | |
| "max_rows_per_dataset": null, | |
| "dataset_cache_verbose": true, | |
| "lambda_route": 0.2, | |
| "route_topk": 32, | |
| "lambda_geom": 0.3, | |
| "lambda_geom_angle": 0.8, | |
| "lambda_geom_var": 0.3, | |
| "lambda_geom_edge": 0.3, | |
| "lambda_geom_vol": 0.6, | |
| "lambda_geom_minrel": 1.0, | |
| "geom_min_edge_rel": 0.6, | |
| "geom_vol_lower_frac": 0.85, | |
| "geom_sample_classes": 64, | |
| "lambda_rose": 0.1, | |
| "rose_scale": 1.8, | |
| "contrast_warmup": 800, | |
| "pent_temp": 0.1, | |
| "lambda_contrast": 0.25, | |
| "punctuation": { | |
| "enable": true, | |
| "chars": [ | |
| ".", | |
| ",", | |
| ";", | |
| ":", | |
| "!", | |
| "?", | |
| "'", | |
| "\"", | |
| "(", | |
| ")", | |
| "[", | |
| "]", | |
| "{", | |
| "}", | |
| "-", | |
| "\u2014", | |
| "\u2026" | |
| ], | |
| "alpha_soft": 0.6, | |
| "hard_mask_gate": false, | |
| "apply_to_coarse_ids": "ALL" | |
| }, | |
| "harmony": { | |
| "apply": true, | |
| "system": "You are Crystal-Beeper, a helpful, honest, precise assistant.", | |
| "style": "concise" | |
| }, | |
| "stages": [ | |
| { | |
| "name": "bootstrap", | |
| "epochs": 1, | |
| "lambda_route": 0.05, | |
| "lambda_geom": 0.2, | |
| "gate_tau": 0.1, | |
| "punct_alpha": 0.6, | |
| "hard_mask_gate": false, | |
| "mix_sdpa": 1.0 | |
| }, | |
| { | |
| "name": "crystal_warmup", | |
| "epochs": 2, | |
| "lambda_route": 0.2, | |
| "lambda_geom": 0.3, | |
| "gate_tau": 0.08, | |
| "punct_alpha": 0.7, | |
| "hard_mask_gate": false, | |
| "mix_sdpa": 1.0 | |
| }, | |
| { | |
| "name": "dictionary_crystals", | |
| "epochs": 2, | |
| "lambda_route": 0.25, | |
| "lambda_geom": 0.35, | |
| "gate_tau": 0.06, | |
| "punct_alpha": 0.85, | |
| "hard_mask_gate": false, | |
| "mix_sdpa": 0.9 | |
| }, | |
| { | |
| "name": "stability_tuning", | |
| "epochs": 3, | |
| "lambda_route": 0.3, | |
| "lambda_geom": 0.4, | |
| "gate_tau": 0.05, | |
| "punct_alpha": 1.0, | |
| "hard_mask_gate": false, | |
| "mix_sdpa": 0.8 | |
| } | |
| ], | |
| "corpus": [ | |
| { | |
| "name": "TinyStories", | |
| "path": "roneneldan/TinyStories", | |
| "split": "train[30%:50%]", | |
| "weight": 0.1, | |
| "dialect": [ | |
| 0.6, | |
| 0.1, | |
| 0.05, | |
| 0.05, | |
| 0.2 | |
| ] | |
| }, | |
| { | |
| "name": "WikipediaEN", | |
| "path": "wikimedia/wikipedia", | |
| "config": "20231101.en", | |
| "split": "train[5%:15%]", | |
| "weight": 0.5, | |
| "dialect": [ | |
| 0.12, | |
| 0.58, | |
| 0.1, | |
| 0.1, | |
| 0.1 | |
| ] | |
| }, | |
| { | |
| "name": "AGNews", | |
| "path": "ag_news", | |
| "split": "train[:]", | |
| "weight": 0.1, | |
| "dialect": [ | |
| 0.2, | |
| 0.5, | |
| 0.1, | |
| 0.1, | |
| 0.1 | |
| ] | |
| }, | |
| { | |
| "name": "GSM8K", | |
| "path": "openai/gsm8k", | |
| "config": "main", | |
| "split": "train[40%:60%]", | |
| "weight": 0.6, | |
| "dialect": [ | |
| 0.1, | |
| 0.15, | |
| 0.5, | |
| 0.15, | |
| 0.1 | |
| ] | |
| }, | |
| { | |
| "name": "AI2-ARC-Easy", | |
| "path": "allenai/ai2_arc", | |
| "config": "ARC-Easy", | |
| "split": "train[30%:60%]", | |
| "weight": 0.6, | |
| "dialect": [ | |
| 0.05, | |
| 0.15, | |
| 0.4, | |
| 0.25, | |
| 0.15 | |
| ] | |
| }, | |
| { | |
| "name": "HH-RLHF", | |
| "path": "Anthropic/hh-rlhf", | |
| "split": "train[5%:10%]", | |
| "weight": 0.5, | |
| "dialect": [ | |
| 0.1, | |
| 0.25, | |
| 0.2, | |
| 0.25, | |
| 0.2 | |
| ] | |
| }, | |
| { | |
| "name": "SVAMP", | |
| "path": "ChilleD/SVAMP", | |
| "split": "train", | |
| "weight": 0.25, | |
| "dialect": [ | |
| 0.1, | |
| 0.15, | |
| 0.55, | |
| 0.15, | |
| 0.05 | |
| ] | |
| }, | |
| { | |
| "name": "MATH-500", | |
| "path": "HuggingFaceH4/MATH-500", | |
| "split": "test", | |
| "weight": 0.25, | |
| "dialect": [ | |
| 0.05, | |
| 0.15, | |
| 0.6, | |
| 0.15, | |
| 0.05 | |
| ] | |
| }, | |
| { | |
| "name": "SEP", | |
| "path": "AiresPucrs/stanford-encyclopedia-philosophy", | |
| "split": "train", | |
| "weight": 0.3, | |
| "dialect": [ | |
| 0.05, | |
| 0.45, | |
| 0.18, | |
| 0.22, | |
| 0.1 | |
| ] | |
| } | |
| ], | |
| "_alive_entries": [ | |
| { | |
| "name": "TinyStories", | |
| "path": "roneneldan/TinyStories", | |
| "split": "train[30%:50%]", | |
| "weight": 0.1, | |
| "dialect": [ | |
| 0.6000000238418579, | |
| 0.10000000149011612, | |
| 0.05000000074505806, | |
| 0.05000000074505806, | |
| 0.20000000298023224 | |
| ], | |
| "class_id": 0, | |
| "p": 0.03125000000000001 | |
| }, | |
| { | |
| "name": "WikipediaEN", | |
| "path": "wikimedia/wikipedia", | |
| "config": "20231101.en", | |
| "split": "train[5%:15%]", | |
| "weight": 0.5, | |
| "dialect": [ | |
| 0.11999999731779099, | |
| 0.5799999833106995, | |
| 0.10000000149011612, | |
| 0.10000000149011612, | |
| 0.10000000149011612 | |
| ], | |
| "class_id": 1, | |
| "p": 0.15625 | |
| }, | |
| { | |
| "name": "AGNews", | |
| "path": "ag_news", | |
| "split": "train[:]", | |
| "weight": 0.1, | |
| "dialect": [ | |
| 0.20000000298023224, | |
| 0.5, | |
| 0.10000000149011612, | |
| 0.10000000149011612, | |
| 0.10000000149011612 | |
| ], | |
| "class_id": 2, | |
| "p": 0.03125000000000001 | |
| }, | |
| { | |
| "name": "GSM8K", | |
| "path": "openai/gsm8k", | |
| "config": "main", | |
| "split": "train[40%:60%]", | |
| "weight": 0.6, | |
| "dialect": [ | |
| 0.10000000149011612, | |
| 0.15000000596046448, | |
| 0.5, | |
| 0.15000000596046448, | |
| 0.10000000149011612 | |
| ], | |
| "class_id": 3, | |
| "p": 0.1875 | |
| }, | |
| { | |
| "name": "AI2-ARC-Easy", | |
| "path": "allenai/ai2_arc", | |
| "config": "ARC-Easy", | |
| "split": "train[30%:60%]", | |
| "weight": 0.6, | |
| "dialect": [ | |
| 0.05000000074505806, | |
| 0.15000000596046448, | |
| 0.4000000059604645, | |
| 0.25, | |
| 0.15000000596046448 | |
| ], | |
| "class_id": 4, | |
| "p": 0.1875 | |
| }, | |
| { | |
| "name": "HH-RLHF", | |
| "path": "Anthropic/hh-rlhf", | |
| "split": "train[5%:10%]", | |
| "weight": 0.5, | |
| "dialect": [ | |
| 0.10000000149011612, | |
| 0.25, | |
| 0.20000000298023224, | |
| 0.25, | |
| 0.20000000298023224 | |
| ], | |
| "class_id": 5, | |
| "p": 0.15625 | |
| }, | |
| { | |
| "name": "SVAMP", | |
| "path": "ChilleD/SVAMP", | |
| "split": "train", | |
| "weight": 0.25, | |
| "dialect": [ | |
| 0.10000000149011612, | |
| 0.15000000596046448, | |
| 0.550000011920929, | |
| 0.15000000596046448, | |
| 0.05000000074505806 | |
| ], | |
| "class_id": 6, | |
| "p": 0.078125 | |
| }, | |
| { | |
| "name": "MATH-500", | |
| "path": "HuggingFaceH4/MATH-500", | |
| "split": "test", | |
| "weight": 0.25, | |
| "dialect": [ | |
| 0.05000000074505806, | |
| 0.15000000596046448, | |
| 0.6000000238418579, | |
| 0.15000000596046448, | |
| 0.05000000074505806 | |
| ], | |
| "class_id": 7, | |
| "p": 0.078125 | |
| }, | |
| { | |
| "name": "SEP", | |
| "path": "AiresPucrs/stanford-encyclopedia-philosophy", | |
| "split": "train", | |
| "weight": 0.3, | |
| "dialect": [ | |
| 0.05000000074505806, | |
| 0.44999998807907104, | |
| 0.18000000715255737, | |
| 0.2199999988079071, | |
| 0.10000000149011612 | |
| ], | |
| "class_id": 8, | |
| "p": 0.09375 | |
| } | |
| ] | |
| } | 
