{ "activation_type": "swiglu", "alibi": false, "alibi_bias_max": 8.0, "architectures": [ "AIGCodeXMoEForCausalLM" ], "attention_dropout": 0.0, "attention_layer_norm": false, "attention_layer_norm_with_affine": false, "batch_size": 4, "bias_for_layer_norm": false, "block_group_size": 1, "block_type": "sequential", "clip_qkv": null, "d_model": 4096, "deepnorm": false, "embedding_dropout": 0.0, "embedding_size": 65280, "encoder_decoder": false, "eos_token_id": 2, "eval_max_sequence_length": null, "exp_dim_ratio": 1, "flash_attention": false, "gate_level": "token", "gate_sample_ratio": 1, "gate_softmax_temperature": 8.0, "gshard": false, "include_bias": false, "init_cutoff_factor": null, "init_device": "meta", "init_fn": "normal", "init_std": 0.01, "intermediate_size": 16384, "latent_attention": false, "latent_attention_dim": 512, "layer_norm_eps": 1e-05, "layer_norm_type": "default", "layer_norm_with_affine": false, "layer_share": false, "layer_share_mlp_version": 1, "layer_std_check": false, "max_sequence_length": 4096, "mlp_hidden_size": null, "mlp_ratio": 4, "mobile_llm_repeat_num": 1, "model_type": "hf_aigcodexmoe", "moe_act_ckpt_ratio": 1, "moe_auxiliary_loss": false, "moe_auxiliary_loss_weight": 0.0, "moe_batch_prioritized_routing": false, "moe_eval_capacity_token_fraction": 0.25, "moe_expert_count": 4, "moe_expert_count_mluti_level": null, "moe_freq": 2, "moe_freq_pos": 0, "moe_gate_input_type": "concat", "moe_gate_loss_combine_method": "average", "moe_gate_loss_weight": 0.0, "moe_gate_no_grad": false, "moe_gating_use_fp32": true, "moe_logging": false, "moe_normalize_gate_prob_before_dropping": false, "moe_second_expert_policy": "sampling", "moe_share_expert_count": 0, "moe_top1_expert": true, "moe_topn_expert": 1, "moe_version": 1, "multi_query_attention": false, "n_heads": 32, "n_kv_heads": null, "n_layers": 22, "pad_token_id": 0, "ple_layer_num": 0, "ple_layernorm": false, "precision": "amp_bf16", "residual_dropout": 0.0, "rope": true, "rope_base": 30000, "rope_ext_ratio": 1, "rope_full_precision": true, "scale_logits": false, "sft_ans_mask": false, "share_layer_groups": 1, "share_moe_groups": 1, "torch_dtype": "float32", "transformers_version": "4.40.2", "use_cache": true, "use_mobile_llm": false, "use_moe": false, "use_ple": false, "use_xmoe": true, "vocab_size": 64000, "weight_tying": false }