{
  "activation_checkpoint_impl": "per-iteration",
  "architecture_class_name": "RecurrentGPT",
  "architectures": [
    "RavenForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "raven_config_minimal.RavenConfig",
    "AutoModelForCausalLM": "raven_modeling_minimal.RavenForCausalLM"
  },
  "bias": false,
  "block_class_name": "SandwichBlock",
  "block_size": 4096,
  "effective_expected_depth": 132,
  "head_dim": 96,
  "init_orthogonal": false,
  "init_strategy": "takase",
  "init_values": {
    "embed_scale": 72.6636084983398,
    "embedding": 0.008703882797784892,
    "out_proj": 0.0005356869554443541,
    "std": 0.008703882797784892
  },
  "injection_type": "linear",
  "intermediate_size": 17920,
  "mean_backprop_depth": 8,
  "mean_recurrence": 32,
  "mlp_class_name": "GatedMLP",
  "model_type": "huginn_raven",
  "n_embd": 5280,
  "n_heads": 55,
  "n_layers": 8,
  "n_layers_in_coda": 2,
  "n_layers_in_prelude": 2,
  "n_layers_in_recurrent_block": 4,
  "nonlin_name": "SiLU",
  "norm_class_name": "RMSNorm_llama",
  "norm_eps": 1e-06,
  "num_key_value_heads": 55,
  "padded_vocab_size": 65536,
  "padding_multiple": 4096,
  "qk_bias": true,
  "rope_base": 50000,
  "sampling_scheme": "poisson-lognormal-filling",
  "state_init": "like-init",
  "tie_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "vocab_size": 65536
}