{ "activation_checkpoint_impl": "per-iteration", "architecture_class_name": "RecurrentGPT", "architectures": [ "RavenForCausalLM" ], "auto_map": { "AutoConfig": "raven_config_minimal.RavenConfig", "AutoModelForCausalLM": "raven_modeling_minimal.RavenForCausalLM" }, "bias": false, "block_class_name": "SandwichBlock", "block_size": 4096, "effective_expected_depth": 132, "head_dim": 96, "init_orthogonal": false, "init_strategy": "takase", "init_values": { "embed_scale": 72.6636084983398, "embedding": 0.008703882797784892, "out_proj": 0.0005356869554443541, "std": 0.008703882797784892 }, "injection_type": "linear", "intermediate_size": 17920, "mean_backprop_depth": 8, "mean_recurrence": 32, "mlp_class_name": "GatedMLP", "model_type": "huginn_raven", "n_embd": 5280, "n_heads": 55, "n_layers": 8, "n_layers_in_coda": 2, "n_layers_in_prelude": 2, "n_layers_in_recurrent_block": 4, "nonlin_name": "SiLU", "norm_class_name": "RMSNorm_llama", "norm_eps": 1e-06, "num_key_value_heads": 55, "padded_vocab_size": 65536, "padding_multiple": 4096, "qk_bias": true, "rope_base": 50000, "sampling_scheme": "poisson-lognormal-filling", "state_init": "like-init", "tie_embeddings": true, "torch_dtype": "float32", "transformers_version": "4.44.2", "vocab_size": 65536, "bos_token_id": 65504, "eos_token_id": 65505, "pad_token_id": 65509 }