from transformers.models.bert.configuration_bert import BertConfig
class MoeBertConfig(BertConfig):
    """Extension of the BERT configuration with Mixture-of-Experts (MoE) parameters.

    Adds the hyperparameters needed to build MoE feed-forward layers on top of
    a standard BERT backbone. All additional arguments are keyword arguments
    with defaults, so any config accepted by ``BertConfig`` remains valid.

    Args:
        moebert_expert_num (int): Number of experts per MoE layer. Default 16.
        moebert_route_method (str): Routing strategy used to assign tokens to
            experts (e.g. ``"gate-token"``). Default ``"gate-token"``.
        moebert_expert_dropout (float): Dropout probability applied inside each
            expert. Default 0.1.
        moebert_expert_dim (int): Hidden dimension of each expert FFN. Default 128.
        moebert_route_hash_list: Optional precomputed hash routing table; only
            used by hash-based routing methods. Default ``None``.
        moebert_share_importance (float): Fraction of importance shared across
            experts. Default 0.5.
        moebert_load_importance: Optional path/object for loading precomputed
            importance scores. Default ``None``.
        **kwargs: Forwarded unchanged to ``BertConfig``.
    """

    model_type = "bert_moe"

    def __init__(
        self,
        moebert_expert_num=16,
        # BUG FIX: the original had an unquoted `gate-token`, which Python
        # parses as the expression `gate - token` and raises NameError at
        # class-definition time. It must be a string literal.
        moebert_route_method="gate-token",
        moebert_expert_dropout=0.1,
        moebert_expert_dim=128,
        moebert_route_hash_list=None,
        moebert_share_importance=0.5,
        moebert_load_importance=None,
        **kwargs,
    ):
        # Initialize the base BertConfig first so standard BERT fields exist.
        super().__init__(**kwargs)
        self.moebert_expert_num = moebert_expert_num
        self.moebert_route_method = moebert_route_method
        self.moebert_expert_dropout = moebert_expert_dropout
        self.moebert_expert_dim = moebert_expert_dim
        self.moebert_route_hash_list = moebert_route_hash_list
        self.moebert_share_importance = moebert_share_importance
        self.moebert_load_importance = moebert_load_importance