---
license: mit
---

```
PhiConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_size": 8,
  "initializer_range": 0.02,
  "intermediate_size": 10,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 4,
  "num_hidden_layers": 6,
  "num_key_value_heads": 2,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 20
}
```

```
PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(20, 8)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=8, out_features=8, bias=True)
          (k_proj): Linear(in_features=8, out_features=4, bias=True)
          (v_proj): Linear(in_features=8, out_features=4, bias=True)
          (dense): Linear(in_features=8, out_features=8, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=8, out_features=10, bias=True)
          (fc2): Linear(in_features=10, out_features=8, bias=True)
        )
        (input_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=8, out_features=20, bias=True)
)
```

```
===========================================================================
Layer (type:depth-idx)                            Param #
===========================================================================
PhiForCausalLM                                    --
├─PhiModel: 1-1                                   --
│    └─Embedding: 2-1                             160
│    └─Dropout: 2-2                               --
│    └─ModuleList: 2-3                            --
│    │    └─PhiDecoderLayer: 3-1                  410
│    │    └─PhiDecoderLayer: 3-2                  410
│    │    └─PhiDecoderLayer: 3-3                  410
│    │    └─PhiDecoderLayer: 3-4                  410
│    │    └─PhiDecoderLayer: 3-5                  410
│    │    └─PhiDecoderLayer: 3-6                  410
│    └─LayerNorm: 2-4                             16
├─Linear: 1-2                                     180
===========================================================================
Total params: 2,816
Trainable params: 2,816
Non-trainable params: 0
===========================================================================
```