JingzeShi committed on
Commit c5541fa (verified)
1 Parent(s): 66a8e0b

Upload DogeForCausalLM

Files changed (4):
  1. config.json +43 -43
  2. configuration_doge.py +228 -228
  3. generation_config.json +7 -7
  4. modeling_doge.py +0 -0
config.json CHANGED
@@ -1,43 +1,43 @@
{
  "_name_or_path": "SmallDoge/Doge-160M-Instruct-SFT",
  "architectures": [
    "DogeForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_doge.DogeConfig",
    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
  },
  "bos_token_id": 0,
  "dynamic_mask_ratio": 0.0,
  "eos_token_id": 1,
  "expert_retrieval_size": 64,
  "hidden_act": "silu",
  "hidden_bias": false,
  "hidden_dropout": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "is_moe": false,
  "max_position_embeddings": 2048,
  "model_type": "doge",
  "num_attention_heads": 6,
  "num_cdmoe_experts": 16348,
  "num_cdmoe_experts_per_head": 8,
  "num_cdmoe_heads": 4,
  "num_hidden_layers": 24,
  "num_key_value_heads": 3,
  "pad_token_id": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "original_max_position_embeddings": 2048,
    "rope_type": "dynamic"
  },
  "rope_theta": 10000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "vocab_size": 32768
}
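Because `auto_map` points the Auto classes at the bundled `configuration_doge.py` and `modeling_doge.py`, loading this checkpoint through `transformers` requires `trust_remote_code=True`. A minimal loading sketch, assuming the repository id matches the `_name_or_path` above (substitute the actual repo id; the `revision` pin with the short hash from this page is optional and only valid on the repo this commit belongs to):

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "SmallDoge/Doge-160M-Instruct-SFT"  # assumed from "_name_or_path"; replace with the actual repo id

# trust_remote_code is required so the custom DogeConfig / DogeForCausalLM shipped in the repo are used
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # doge 768 24

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    revision="c5541fa",  # optionally pin the commit shown on this page
)
```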
configuration_doge.py CHANGED
@@ -1,228 +1,228 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/doge/modular_doge.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_doge.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on the Wonderful Matrices paper implementation.
# The Doge family of small language models is trained by Jingze Shi.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation


class DogeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate a Doge
    model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-20M](https://huggingface.co/SmallDoge/Doge-20M).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`].
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        hidden_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the hidden layers.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for each sequence transformation and state transformation module.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        pad_token_id (`int`, *optional*, defaults to 2):
            Padding token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
            NOTE: if you apply a new rope type and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this value accordingly.
            The Doge family of small models uses `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                    In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'.
                    The original max position embeddings used during pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation.
                    If unspecified, it defaults to the value recommended by the implementation, using the `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>`original_max_position_embeddings`).
                    Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention.
            If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used.
            When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
            For more details, check out [this paper](https://arxiv.org/pdf/2305.13245.pdf).
            If it is not specified, will default to `num_attention_heads`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        dynamic_mask_ratio (`float`, *optional*, defaults to 0.0):
            The ratio to control the proportion of the dynamic mask filled with the minimum value. For more details, check out [this paper](https://arxiv.org/pdf/2412.11834).
        is_moe (`bool`, *optional*, defaults to `False`):
            Whether to use the Cross Domain Mixture of Experts; if `True`, the MoE will inherit the MLP to initialize. For more details, check out [this paper](https://arxiv.org/pdf/2412.11834).
        num_cdmoe_experts (`int`, *optional*, defaults to 16348):
            Number of Experts for the Cross Domain Mixture of Experts.
        num_cdmoe_heads (`int`, *optional*, defaults to 4):
            Number of retrieval heads, used to mix multi-head experts.
        num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
            Number of Experts per retrieval head, used to mix multi-head experts.
        expert_retrieval_size (`int`, *optional*, defaults to 64):
            Dimension of the Expert retrieval states for calculating the dot product of query and key to determine the expert index.

    ```python
    >>> from transformers import DogeConfig, DogeModel

    >>> # Initializing a Doge-320M style configuration
    >>> configuration = DogeConfig()

    >>> # Initializing a model from the Doge-320M style configuration
    >>> model = DogeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "doge"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Default tensor parallel plan for base model `DogeModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.dt_proj": "rowwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=32768,
        hidden_size=1024,
        intermediate_size=2048,
        num_hidden_layers=32,
        hidden_bias=False,
        hidden_dropout=0.0,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
        bos_token_id=0,
        eos_token_id=1,
        pad_token_id=2,
        tie_word_embeddings=False,
        max_position_embeddings=2048,
        rope_theta=10000.0,
        rope_scaling=None,
        num_attention_heads=8,
        num_key_value_heads=None,
        attention_dropout=0.0,
        dynamic_mask_ratio=0.0,
        is_moe=False,
        num_cdmoe_experts=16348,
        num_cdmoe_heads=4,
        num_cdmoe_experts_per_head=8,
        expert_retrieval_size=64,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        self.hidden_bias = hidden_bias
        self.hidden_dropout = hidden_dropout
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.attention_dropout = attention_dropout
        self.dynamic_mask_ratio = dynamic_mask_ratio
        self.is_moe = is_moe
        self.num_cdmoe_experts = num_cdmoe_experts
        self.num_cdmoe_heads = num_cdmoe_heads
        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
        self.expert_retrieval_size = expert_retrieval_size

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        # for backward compatibility
        if num_key_value_heads is None:
            self.num_key_value_heads = num_attention_heads

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["DogeConfig"]
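To see how this constructor handles the values from config.json above, including the legacy `type` key for `rope_scaling` and the `num_key_value_heads` fallback, here is a short sketch. It assumes `configuration_doge.py` has been downloaded next to the script and that `transformers` is installed:

```python
from configuration_doge import DogeConfig  # assumes the file above sits in the working directory

# Mirror the checkpoint's hyperparameters from config.json
config = DogeConfig(
    vocab_size=32768,
    hidden_size=768,
    intermediate_size=1536,
    num_hidden_layers=24,
    num_attention_heads=6,
    num_key_value_heads=3,  # 3 KV heads for 6 query heads -> grouped-query attention
    max_position_embeddings=2048,
    rope_scaling={"type": "dynamic", "factor": 4.0, "original_max_position_embeddings": 2048},
    tie_word_embeddings=True,
)

# The legacy 'type' key is copied to 'rope_type' before rope_config_validation runs
print(config.rope_scaling["rope_type"])  # dynamic

# If num_key_value_heads is omitted, it falls back to num_attention_heads (plain multi-head attention)
print(DogeConfig(num_attention_heads=8).num_key_value_heads)  # 8
```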
generation_config.json CHANGED
@@ -1,7 +1,7 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 2,
  "transformers_version": "4.48.3"
}
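generation_config.json only pins the special token ids (bos 0, eos 1, pad 2) and is picked up automatically by `generate()`. A minimal generation sketch for the instruct model, assuming the same repo id as above and that the tokenizer ships a chat template:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "SmallDoge/Doge-160M-Instruct-SFT"  # assumed repo id, as above

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# Build the prompt with the tokenizer's chat template (assumed to be present for the SFT model)
input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hi, who are you?"}],
    add_generation_prompt=True,
    return_tensors="pt",
)

output = model.generate(
    input_ids,
    max_new_tokens=64,
    eos_token_id=1,  # matches generation_config.json
    pad_token_id=2,
)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```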
modeling_doge.py CHANGED
The diff for this file is too large to render. See raw diff