damerajee committed on
Commit
11e0196
1 Parent(s): fd40f56

Create configuration_Llamoe.py

Files changed (1)
  1. configuration_Llamoe.py +70 -0
configuration_Llamoe.py ADDED
@@ -0,0 +1,70 @@
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+GEMMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "Crystalcareai/GemMoE-Beta-1": "https://huggingface.co/Crystalcareai/GemMoE-Beta-1/resolve/main/config.json",
+}
+
+
+class GemmoeConfig(PretrainedConfig):
+    model_type = "Llamoe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=256,
+        hidden_act="gelu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=8,
+        router_aux_loss_coef=0.02,
+        output_router_logits=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.output_router_logits = output_router_logits
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
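
The added class is a standard PretrainedConfig subclass: Llama-style attention fields (hidden_size, num_attention_heads, num_key_value_heads, rope_theta) plus mixture-of-experts routing fields (num_local_experts, num_experts_per_tok, router_aux_loss_coef). A minimal usage sketch follows; the local import path and the AutoConfig registration are assumptions about how the file would typically be consumed, not part of this commit, and the override values are illustrative only.

# Minimal usage sketch (assumption: configuration_Llamoe.py is importable
# from the working directory; override values below are illustrative).
from transformers import AutoConfig

from configuration_Llamoe import GemmoeConfig

# Instantiate with the defaults declared above, overriding two MoE settings.
config = GemmoeConfig(num_local_experts=8, num_experts_per_tok=2)
print(config.model_type)    # "Llamoe"
print(config.hidden_size)   # 3072

# Register the custom model_type so AutoConfig.from_pretrained() can map a
# config.json carrying model_type "Llamoe" back to GemmoeConfig.
AutoConfig.register("Llamoe", GemmoeConfig)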