BossRui committed
Commit bd4c3cc
1 Parent(s): eea4d0a

Upload configuration_upcycling_qwen2_moe.py with huggingface_hub

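The commit message indicates the file was pushed with the huggingface_hub client. A minimal sketch of such an upload, assuming the standard upload_file workflow; the repo id below is a placeholder, not taken from this page:

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the cached token from `huggingface-cli login`, if any
    api.upload_file(
        path_or_fileobj="configuration_upcycling_qwen2_moe.py",
        path_in_repo="configuration_upcycling_qwen2_moe.py",
        repo_id="BossRui/your-repo-name",  # placeholder: the actual repo id is not shown here
    )
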
Files changed (1)
  1. configuration_upcycling_qwen2_moe.py +180 -0
configuration_upcycling_qwen2_moe.py ADDED
@@ -0,0 +1,180 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2MoE model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
import torch


logger = logging.get_logger(__name__)


class Qwen2Config(PretrainedConfig):
    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class Qwen2MoeConfig(PretrainedConfig):

    model_type = "qwen2_moe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5632,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,

        decoder_sparse_step=1,
        moe_intermediate_size=1408,
        shared_expert_intermediate_size=5632,
        num_experts_per_tok=4,
        num_experts=60,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        mlp_only_layers=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = moe_intermediate_size
        self.shared_expert_intermediate_size = shared_expert_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class UpcyclingQwen2MoeConfig(Qwen2Config):
    model_type = "upcycling-qwen2-moe"
    # Upcycled from Qwen2-1_5B

    def __init__(
        self,
        decoder_sparse_step=1,
        num_experts_per_tok=2,
        num_experts=7,
        norm_topk_prob=False,
        output_router_logits=False,
        router_aux_loss_coef=0.000,
        mlp_only_layers=None,  # MoE only in the last 2 layers
        share_flag=False,
        attn_init_change=False,
        language_gate=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # MoE arguments
        self.decoder_sparse_step = decoder_sparse_step
        self.moe_intermediate_size = self.intermediate_size
        self.shared_expert_intermediate_size = self.intermediate_size
        self.norm_topk_prob = norm_topk_prob
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        # All layers except the last two keep a dense MLP; only the last two are sparse.
        # self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
        self.mlp_only_layers = torch.arange(self.num_hidden_layers).tolist()[:-2]
        self.share_flag = share_flag
        self.num_experts_per_tok = num_experts_per_tok
        self.num_experts = num_experts
        self.attn_init_change = attn_init_change
        self.language_gate = language_gate
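
For reference, a minimal usage sketch of the classes above, assuming the module is importable as configuration_upcycling_qwen2_moe; the Qwen2-1.5B-style overrides are illustrative and not read from this commit (the class defaults inherited from Qwen2Config are the larger 7B-style values):

    from transformers import AutoConfig

    from configuration_upcycling_qwen2_moe import UpcyclingQwen2MoeConfig

    # Optionally expose the custom model_type through the Auto* machinery.
    AutoConfig.register("upcycling-qwen2-moe", UpcyclingQwen2MoeConfig)

    # Illustrative Qwen2-1.5B-style overrides (assumed values, not part of this file).
    config = UpcyclingQwen2MoeConfig(
        hidden_size=1536,
        intermediate_size=8960,
        num_hidden_layers=28,
        num_attention_heads=12,
        num_key_value_heads=2,
        num_experts=7,
        num_experts_per_tok=2,
    )

    # Each expert reuses the dense MLP width of the base model.
    assert config.moe_intermediate_size == config.intermediate_size

    # Only the last two decoder layers are sparse; the first 26 keep dense MLPs.
    print(config.mlp_only_layers)  # [0, 1, ..., 25]

On the Hub, a custom config like this is typically wired up through an auto_map entry in config.json and loaded with trust_remote_code=True; the explicit register call above is simply the self-contained alternative for local use.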