from transformers import PretrainedConfig


class ByteGPTConfig(PretrainedConfig):
    """Configuration for a small byte-level GPT causal language model."""

    model_type = "ijk_byte_gpt"

    def __init__(
        self,
        vocab_size: int = 259,  # 256 byte values plus 3 extra tokens
        block_size: int = 128,  # maximum sequence length
        n_embd: int = 64,       # embedding / hidden size
        n_head: int = 4,        # attention heads per layer
        n_layer: int = 4,       # number of transformer blocks
        dropout: float = 0.1,
        use_flash_attention: bool = False,
        _attn_implementation_autoset: bool = False,  # absorbed, not stored; some transformers versions pass this when reloading saved configs
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Map the Auto* classes to the custom code files so that
        # trust_remote_code loading resolves this architecture.
        self.auto_map = {
            "AutoConfig": "configuration_bytegpt.ByteGPTConfig",
            "AutoModelForCausalLM": "modeling_bytegpt.ByteGPTForCausalLM",
        }
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout
        self.use_flash_attention = use_flash_attention
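

# A minimal usage sketch (not part of the module's API): registers the config
# with AutoConfig and round-trips it through save_pretrained/from_pretrained.
# The directory name "./bytegpt-tiny" and the overridden hyperparameters are
# illustrative only.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Register the custom config so AutoConfig can resolve "ijk_byte_gpt"
    # locally in this process, without trust_remote_code.
    AutoConfig.register("ijk_byte_gpt", ByteGPTConfig)

    config = ByteGPTConfig(n_embd=128, n_layer=6)
    config.save_pretrained("./bytegpt-tiny")                 # writes config.json
    reloaded = AutoConfig.from_pretrained("./bytegpt-tiny")  # resolves back to ByteGPTConfig
    assert isinstance(reloaded, ByteGPTConfig)
    print(reloaded)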