Update configuration_quasar.py
Browse files- configuration_quasar.py +4 -47
configuration_quasar.py
CHANGED
@@ -23,10 +23,10 @@ from transformers.utils import logging
|
|
23 |
logger = logging.get_logger(__name__)
|
24 |
|
25 |
QUASAR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
26 |
-
"AstraMindAI/AstraQuasar-
|
27 |
}
|
28 |
|
29 |
-
#
|
30 |
class QuasarConfig(PretrainedConfig):
|
31 |
r"""
|
32 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
@@ -138,11 +138,7 @@ class QuasarConfig(PretrainedConfig):
|
|
138 |
qk_layernorm=False,
|
139 |
bos_token_id=1,
|
140 |
eos_token_id=2,
|
141 |
-
|
142 |
-
#duplicate_trick_v2=True,
|
143 |
-
#duplicate_rank=8,
|
144 |
-
#duplicate_dropout=0.0,
|
145 |
-
sliding_window=4096,
|
146 |
simple_norm=False,
|
147 |
remove_ff_bias=True,
|
148 |
gated_activation=False,
|
@@ -151,7 +147,7 @@ class QuasarConfig(PretrainedConfig):
|
|
151 |
layer_ranges=[[0, 16],[8, 21],[12, 25],[16, 29],[25, 32]],
|
152 |
**kwargs,
|
153 |
):
|
154 |
-
|
155 |
self.sliding_window = sliding_window
|
156 |
self.simple_norm = simple_norm
|
157 |
self.remove_ff_bias = remove_ff_bias
|
@@ -160,15 +156,6 @@ class QuasarConfig(PretrainedConfig):
|
|
160 |
self.duplicate_grad = duplicate_grad
|
161 |
self.layer_ranges = layer_ranges if layer_ranges is not None else []
|
162 |
|
163 |
-
##V2###
|
164 |
-
#self.duplicate_trick_v2 = duplicate_trick_v2
|
165 |
-
#self.layer_ranges_duplicate_v2 = []
|
166 |
-
#self._assing_layer_ranges_duplicate_v2()
|
167 |
-
#self.duplicate_rank = duplicate_rank
|
168 |
-
#self.duplicate_dropout = duplicate_dropout
|
169 |
-
#self._duplicate_trick_v2_validation()
|
170 |
-
####
|
171 |
-
|
172 |
self.vocab_size = vocab_size
|
173 |
self.hidden_size = hidden_size
|
174 |
self.intermediate_size = intermediate_size
|
@@ -200,36 +187,6 @@ class QuasarConfig(PretrainedConfig):
|
|
200 |
**kwargs,
|
201 |
)
|
202 |
|
203 |
-
def _assing_layer_ranges_duplicate_v2(self):
|
204 |
-
# Compute the starting offset of each range within the single combined list
|
205 |
-
offsets = [0]
|
206 |
-
for i in range(1, len(self.layer_ranges)):
|
207 |
-
offset = offsets[-1] + self.layer_ranges[i - 1][1] - self.layer_ranges[i - 1][0]
|
208 |
-
offsets.append(offset)
|
209 |
-
|
210 |
-
# Select only the odd-indexed ranges and compute their absolute positions
|
211 |
-
odd_intervals_positions = []
|
212 |
-
for i in range(1, len(self.layer_ranges), 2):
|
213 |
-
start, end = self.layer_ranges[i]
|
214 |
-
for n in range(start, end):
|
215 |
-
position = offsets[i] + (n - start)
|
216 |
-
odd_intervals_positions.append(position)
|
217 |
-
|
218 |
-
self.layer_ranges_duplicate_v2 = list(set(odd_intervals_positions))
|
219 |
-
|
220 |
-
|
221 |
-
def _duplicate_trick_v2_validation(self):
|
222 |
-
if self.duplicate_trick_v2 and self.duplicate_trick:
|
223 |
-
# Warn once that only one of the two flags will be used
|
224 |
-
logger.warning(
|
225 |
-
"Both `duplicate_trick` and `duplicate_trick_v2` are set to True. Only `duplicate_trick_v2` will be used."
|
226 |
-
)
|
227 |
-
if self.duplicate_trick_v2 and self.duplicate_rank < 1:
|
228 |
-
raise ValueError("`duplicate_rank` must be a positive integer")
|
229 |
-
if self.duplicate_trick_v2 and not self.layer_ranges:
|
230 |
-
raise ValueError("`layer_ranges` must be set when `duplicate_trick_v2` is True")
|
231 |
-
|
232 |
-
|
233 |
# Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
|
234 |
def _rope_scaling_validation(self):
|
235 |
"""
|
|
|
23 |
logger = logging.get_logger(__name__)
|
24 |
|
25 |
QUASAR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
26 |
+
"AstraMindAI/AstraQuasar-4B": "https://huggingface.co/AstraMindAI/AstraQuasar-4B/resolve/main/config.json",
|
27 |
}
|
28 |
|
29 |
+
# Adapted from microsoft/phi-2 (Phi -> Quasar)
|
30 |
class QuasarConfig(PretrainedConfig):
|
31 |
r"""
|
32 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
|
|
138 |
qk_layernorm=False,
|
139 |
bos_token_id=1,
|
140 |
eos_token_id=2,
|
141 |
+
sliding_window=2048,
|
|
|
|
|
|
|
|
|
142 |
simple_norm=False,
|
143 |
remove_ff_bias=True,
|
144 |
gated_activation=False,
|
|
|
147 |
layer_ranges=[[0, 16],[8, 21],[12, 25],[16, 29],[25, 32]],
|
148 |
**kwargs,
|
149 |
):
|
150 |
+
|
151 |
self.sliding_window = sliding_window
|
152 |
self.simple_norm = simple_norm
|
153 |
self.remove_ff_bias = remove_ff_bias
|
|
|
156 |
self.duplicate_grad = duplicate_grad
|
157 |
self.layer_ranges = layer_ranges if layer_ranges is not None else []
|
158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
self.vocab_size = vocab_size
|
160 |
self.hidden_size = hidden_size
|
161 |
self.intermediate_size = intermediate_size
|
|
|
187 |
**kwargs,
|
188 |
)
|
189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
# Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
|
191 |
def _rope_scaling_validation(self):
|
192 |
"""
|