# huginn_swa_75_7_ema_0.9_merge / raven_config_minimal.py
"""A HuggingFace-style model configuration."""
from transformers import PretrainedConfig
from math import sqrt
class RavenConfig(PretrainedConfig):
    model_type = "huginn_raven"
    keys_to_ignore_at_inference = [""]
    # Map the standard HuggingFace attribute names onto this config's field names.
    attribute_map = {"num_attention_heads": "n_heads", "hidden_size": "n_embd", "num_hidden_layers": "n_layers"}

    def __init__(
        self,
        n_embd: int = 5280,
        n_heads: int = 55,
        n_layers: int = 8,  # total of prelude + recurrent + coda
        block_size: int = 4096,
        vocab_size: int = 65536,
        padding_multiple: int = 4096,
        tie_embeddings: bool = True,
        intermediate_size: int = 17920,
        bias: bool = False,
        architecture_class_name: str = "RecurrentGPT",
        block_class_name: str = "SandwichBlock",
        norm_class_name: str = "RMSNorm_llama",
        norm_eps: float = 0.000001,
        mlp_class_name: str = "GatedMLP",
        nonlin_name: str = "SiLU",
        init_strategy: str = "takase",
        init_orthogonal: bool = False,
        state_init: str = "like-init",
        injection_type: str = "linear",
        n_layers_in_recurrent_block: int = 4,
        mean_recurrence: int = 32,
        sampling_scheme: str = "poisson-lognormal-filling",
        mean_backprop_depth: int = 8,
        n_layers_in_prelude: int = 2,
        n_layers_in_coda: int = 2,
        qk_bias: bool = True,
        activation_checkpoint_impl: str = "per-iteration",
        rope_base: float = 50_000,
        torch_dtype: str = "bfloat16",
        transformers_version: str = "4.47.1",
        **kwargs,
    ):
        self.n_embd = n_embd
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.block_size = block_size
        self.vocab_size = self.padded_vocab_size = vocab_size
        self.padding_multiple = padding_multiple
        self.tie_embeddings = tie_embeddings
        self.intermediate_size = intermediate_size
        self.bias = bias
        self.architecture_class_name = architecture_class_name
        self.block_class_name = block_class_name
        self.norm_class_name = norm_class_name
        self.norm_eps = norm_eps
        self.mlp_class_name = mlp_class_name
        self.nonlin_name = nonlin_name
        self.init_strategy = init_strategy
        self.init_orthogonal = init_orthogonal
        self.state_init = state_init
        self.injection_type = injection_type
        self.n_layers_in_recurrent_block = n_layers_in_recurrent_block
        self.mean_recurrence = mean_recurrence
        self.sampling_scheme = sampling_scheme
        self.mean_backprop_depth = mean_backprop_depth
        self.n_layers_in_prelude = n_layers_in_prelude
        self.n_layers_in_coda = n_layers_in_coda
        self.qk_bias = qk_bias
        self.activation_checkpoint_impl = activation_checkpoint_impl
        self.rope_base = rope_base
        self.torch_dtype = torch_dtype  # Added from JSON
        self.transformers_version = transformers_version  # Added from JSON
        # Derived
        self.num_key_value_heads = n_heads
        self.num_attention_heads = n_heads
        self.head_dim = n_embd // n_heads
        # Expected number of layer evaluations per token:
        # prelude + coda + (layers per recurrent block) * (mean recurrence).
        self.effective_expected_depth = (
            self.n_layers_in_prelude + self.n_layers_in_coda + self.n_layers_in_recurrent_block * self.mean_recurrence
        )
        # Init scales ("takase"-style small init); output projections are additionally
        # scaled down by sqrt(2 * effective_expected_depth).
        self.init_values = {
            "std": sqrt(2 / (5 * self.n_embd)),
            "out_proj": sqrt(2 / (5 * self.n_embd)) / sqrt(2 * self.effective_expected_depth),
            "embedding": sqrt(2 / (5 * self.n_embd)),
            "embed_scale": sqrt(self.n_embd),
        }
        super().__init__(
            # pad_token_id=65509,
            # bos_token_id=65504,
            # eos_token_id=65505,
            tie_word_embeddings=tie_embeddings,
            **kwargs,
        )
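

# A minimal usage sketch, assuming only that `transformers` is installed and this
# module is importable: build the config with its defaults and inspect the derived
# fields computed above.
if __name__ == "__main__":
    config = RavenConfig()
    # 55 heads of size 5280 // 55 = 96 over a 5280-dim residual stream.
    print(config.num_attention_heads, config.head_dim)
    # Expected unrolled depth: 2 (prelude) + 2 (coda) + 4 * 32 (recurrent) = 132.
    print(config.effective_expected_depth)
    # Round-trips through the standard HuggingFace config serialization.
    print(config.to_json_string()[:200])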