paulilioaica committed
Commit f136148
1 Parent(s): b523290

Create configuration_phi.py

Files changed (1)
  1. configuration_phi.py +66 -0
configuration_phi.py ADDED
@@ -0,0 +1,66 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT license.
+
+ import math
+ from typing import Optional
+
+ from transformers import PretrainedConfig
+
+
+ class PhiConfig(PretrainedConfig):
+     """Phi configuration."""
+
+     model_type = "phi-msft"
+     attribute_map = {
+         "max_position_embeddings": "n_positions",
+         "hidden_size": "n_embd",
+         "num_attention_heads": "n_head",
+         "num_hidden_layers": "n_layer",
+     }
+
+     def __init__(
+         self,
+         vocab_size: int = 50304,
+         n_positions: int = 2048,
+         n_embd: int = 1024,
+         n_layer: int = 20,
+         n_inner: Optional[int] = None,
+         n_head: int = 16,
+         n_head_kv: Optional[int] = None,
+         num_experts_per_tok: int = 2,
+         num_local_experts: int = 4,
+         rotary_dim: Optional[int] = 32,
+         activation_function: Optional[str] = "gelu_new",
+         flash_attn: bool = False,
+         flash_rotary: bool = False,
+         fused_dense: bool = False,
+         attn_pdrop: float = 0.0,
+         embd_pdrop: float = 0.0,
+         resid_pdrop: float = 0.0,
+         layer_norm_epsilon: float = 1e-5,
+         initializer_range: float = 0.02,
+         tie_word_embeddings: bool = False,
+         pad_vocab_size_multiple: int = 64,
+         **kwargs
+     ) -> None:
+         self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
+         self.n_positions = n_positions
+         self.n_embd = n_embd
+         self.n_layer = n_layer
+         self.n_inner = n_inner
+         self.n_head = n_head
+         self.n_head_kv = n_head_kv
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_local_experts = num_local_experts
+         self.rotary_dim = min(rotary_dim, n_embd // n_head) if rotary_dim is not None else None
+         self.activation_function = activation_function
+         self.flash_attn = flash_attn
+         self.flash_rotary = flash_rotary
+         self.fused_dense = fused_dense
+         self.attn_pdrop = attn_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.resid_pdrop = resid_pdrop
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
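For context, here is a minimal usage sketch (not part of the commit) showing how this configuration behaves once the file is importable; the import path and the vocab_size argument below are illustrative assumptions, not values from the diff.

from configuration_phi import PhiConfig  # assumes configuration_phi.py is on the Python path

config = PhiConfig(vocab_size=50257)  # hypothetical GPT-2-sized vocabulary

# vocab_size is rounded up to the next multiple of pad_vocab_size_multiple (64).
print(config.vocab_size)  # 50304

# attribute_map lets the standard Hugging Face names alias the Phi-specific fields.
print(config.hidden_size == config.n_embd)  # True
print(config.num_attention_heads)  # 16, resolved through "n_head"

# rotary_dim is capped at the per-head dimension: min(32, 1024 // 16) == 32.
print(config.rotary_dim)  # 32

Padding the vocabulary to a multiple of 64 keeps the embedding and output-projection dimensions aligned to sizes that GPU matmul kernels handle efficiently.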