from transformers import PretrainedConfig, AutoConfig


class SPTConfig(PretrainedConfig):
    """Configuration class for the SPT model.

    Subclassing PretrainedConfig plugs the model into the standard
    Hugging Face machinery (config.json serialization, from_pretrained, etc.).
    """

    model_type = "spt"

    def __init__(
        self,
        vocab_size=97,
        hidden_size=512,
        n_layers=12,
        n_attn_heads=16,  # query heads
        n_kv_heads=16,    # key/value heads (== n_attn_heads, i.e. standard MHA)
        intermediate_size=2048,
        max_len=2048,
        residual=True,    # toggle residual connections
        normalise=True,   # toggle normalisation layers
        pad_token_id=95,
        bos_token_id=95,
        eos_token_id=95,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_attn_heads = n_attn_heads
        self.n_kv_heads = n_kv_heads
        self.intermediate_size = intermediate_size
        self.max_len = max_len
        self.residual = residual
        self.normalise = normalise
        # Special-token ids are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


# Register under the "spt" model type so AutoConfig can resolve this class.
AutoConfig.register("spt", SPTConfig)
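

# A minimal usage sketch (not part of the original snippet): round-trip the
# config through save_pretrained / AutoConfig.from_pretrained to confirm the
# registration works. The "spt-config" directory name is just an illustrative
# choice.
if __name__ == "__main__":
    config = SPTConfig(n_layers=6, hidden_size=256)
    config.save_pretrained("spt-config")  # writes spt-config/config.json
    reloaded = AutoConfig.from_pretrained("spt-config")
    assert isinstance(reloaded, SPTConfig)
    print(reloaded.model_type, reloaded.n_layers)  # -> spt 6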