from typing import Optional

import attrs

from .ar_config_tokenizer import TokenizerConfig


@attrs.define
class ModelConfig:
""" |
|
A class to hold model configuration arguments. |
|
|
|
Args: |
|
dim (int): The dimensionality of the input and output of each transformer block. |
|
n_layers (int): Number of layers in the transformer. |
|
n_heads (int): Number of attention heads. |
|
n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to |
|
`num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention. |
|
head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads. |
|
vocab_size (int): Vocabulary size. |
|
ffn_hidden_size (int): Hidden size for feedforward network. |
|
norm_eps (float): Epsilon value for normalization. |
|
rope_theta (float): Theta value for rotary positional embeddings. |
|
apply_abs_pos_emb (bool): Whether to apply absolute position embeddings. |
|
max_batch_size (int): Maximum batch size for inference. |
|
max_seq_len (int): Maximum sequence length for input text. |
|
fuse_qkv (bool): Whether to fuse QKV in attention. Defaults to True. |
|
causal_mask (bool): Whether to use causal mask. Defaults to True. |
|
norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm". |
|
precision (str): Data type for the model. |
|
use_qk_normalization (bool): Whether to enable QK normalization. |
|
ckpt_dir (str): Checkpoint directory. |
|
ckpt_path (str): Checkpoint path. |
|
apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension). |
|
yarn_scale (Optional[float]): Scale factor for YaRN. |
|
yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code) |
|
yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code) |
|
original_seq_len (Optional[int]): Original sequence length. |
|
vision_encoder (Optional[str]): Vision encoder name. |
|
mm_projector (Optional[str]): Multi-modal projector name. |
|
vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3, you can specify to int larger than 3. E.g. if you have 4-channel images with the last channel as the alpha channel, set this to 4. |
|
rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "3D". |
|
pytorch_rope_version (Optional[str]): Version of the PyTorch RoPE implementation. Choices: "v1", "v2". |
|
original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension. |
|
pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value. |
|
vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3. |
|
insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer. |
|
insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers. |
|
context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim. |
|
num_video_frames (Optional[int]): Number of video frames. |
|
video_height (Optional[int]): Raw video pixel height dimension. |
|
video_width (Optional[int]): Raw video pixel width dimension. |
|
video_latent_shape (Optional[list]): Video tokenizer output dimension, in (T,H,W). |
|
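
    Example:
        A minimal construction sketch (hypothetical values, for illustration only):

        >>> config = ModelConfig(dim=2048, n_layers=24, n_heads=16)
        >>> config.dim
        2048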
""" |
|
|
|
dim: int = attrs.field(default=4096) |
|
n_layers: int = attrs.field(default=32) |
|
n_heads: int = attrs.field(default=32) |
|
n_kv_heads: Optional[int] = attrs.field(default=8) |
|
head_dim: Optional[int] = attrs.field(default=None) |
|
vocab_size: int = attrs.field(default=128256) |
|
ffn_hidden_size: int = attrs.field(default=14336) |
|
norm_eps: float = attrs.field(default=1e-5) |
|
rope_theta: float = attrs.field(default=500000) |
|
apply_abs_pos_emb: bool = attrs.field(default=False) |
|
max_batch_size: int = attrs.field(default=1) |
|
max_seq_len: int = attrs.field(default=8192) |
|
fuse_qkv: bool = attrs.field(default=False) |
|
causal_mask: bool = attrs.field(default=True) |
|
norm_type: str = attrs.field(default="rmsnorm") |
|
precision: str = attrs.field(default="bfloat16") |
|
use_qk_normalization: bool = False |
|
tokenizer: Optional[TokenizerConfig] = None |
|
ckpt_dir: Optional[str] = attrs.field(default=None) |
|
ckpt_path: Optional[str] = attrs.field( |
|
default=None |
|
) |
|
apply_yarn: Optional[bool] = attrs.field(default=False) |
|
yarn_scale: Optional[float] = attrs.field(default=None) |
|
yarn_beta_fast: Optional[int] = attrs.field(default=None) |
|
yarn_beta_slow: Optional[int] = attrs.field(default=None) |
|
original_seq_len: Optional[int] = attrs.field(default=None) |
|
vision_encoder: Optional[str] = attrs.field(default=None) |
|
vision_encoder_in_channels: Optional[int] = attrs.field(default=3) |
|
mm_projector: Optional[str] = attrs.field(default=None) |
|
rope_dim: Optional[str] = attrs.field(default="1D") |
|
pytorch_rope_version: Optional[str] = attrs.field(default="v2") |
|
original_latent_shape: Optional[list] = None |
|
pad_to_multiple_of: Optional[int] = None |
|
vision_encoder_in_channels: Optional[int] = attrs.field(default=3) |
|
insert_cross_attn: bool = False |
|
insert_cross_attn_every_k_layers: int = 1 |
|
context_dim: Optional[int] = attrs.field(default=1024) |
|
|
|
num_video_frames: Optional[int] = None |
|
|
|
video_height: Optional[int] = None |
|
video_width: Optional[int] = None |
|
|
|
video_latent_shape: Optional[list] = None |
|
|
|
def __getitem__(self, item): |
|
return getattr(self, item) |
|
|
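# A minimal usage sketch, not part of the config API. Assumptions: this module is
# executed with `python -m <package>.<module>` so the relative import above resolves;
# the derivations below only mirror the fallback rules described in the docstring,
# which downstream model code is assumed to implement.
if __name__ == "__main__":
    config = ModelConfig()
    # head_dim falls back to dim // n_heads when left as None (4096 // 32 = 128).
    head_dim = config.head_dim if config.head_dim is not None else config.dim // config.n_heads
    # With grouped-query attention, each KV head serves n_heads // n_kv_heads query heads.
    n_kv_heads = config.n_kv_heads if config.n_kv_heads is not None else config.n_heads
    queries_per_kv_head = config.n_heads // n_kv_heads
    print(f"head_dim={head_dim}, queries per KV head={queries_per_kv_head}")
    # __getitem__ allows dict-style access to any field.
    assert config["vocab_size"] == config.vocab_size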