if001/sample_phi-2 · Hugging Face

PhiConfig {
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_size": 8,
  "initializer_range": 0.02,
  "intermediate_size": 10,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 4,
  "num_hidden_layers": 6,
  "num_key_value_heads": 2,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 51200
}

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 8)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=8, out_features=8, bias=True)
          (k_proj): Linear(in_features=8, out_features=4, bias=True)
          (v_proj): Linear(in_features=8, out_features=4, bias=True)
          (dense): Linear(in_features=8, out_features=8, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=8, out_features=10, bias=True)
          (fc2): Linear(in_features=10, out_features=8, bias=True)
        )
        (input_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=8, out_features=51200, bias=True)
)

===========================================================================
Layer (type:depth-idx)                             Param #
===========================================================================
PhiForCausalLM                                     --
├─PhiModel: 1-1                                    --
│    └─Embedding: 2-1                              409,600
│    └─Dropout: 2-2                                --
│    └─ModuleList: 2-3                             --
│    │    └─PhiDecoderLayer: 3-1                   410
│    │    └─PhiDecoderLayer: 3-2                   410
│    │    └─PhiDecoderLayer: 3-3                   410
│    │    └─PhiDecoderLayer: 3-4                   410
│    │    └─PhiDecoderLayer: 3-5                   410
│    │    └─PhiDecoderLayer: 3-6                   410
│    └─LayerNorm: 2-4                              16
├─Linear: 1-2                                      460,800
===========================================================================
Total params: 872,876
Trainable params: 872,876
Non-trainable params: 0
===========================================================================