Update configuration_quasar.py
Browse files- configuration_quasar.py +16 -7
configuration_quasar.py
CHANGED
@@ -18,9 +18,7 @@ QUASAR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
18 |
class QuasarConfig(PretrainedConfig):
|
19 |
r"""
|
20 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
21 |
-
model according to the specified arguments, defining the model architecture.
|
22 |
-
defaults will yield a similar configuration to that of the Quasar
|
23 |
-
[microsoft/quasar-1](https://huggingface.co/microsoft/quasar-1).
|
24 |
|
25 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
26 |
documentation from [`PretrainedConfig`] for more information.
|
@@ -83,15 +81,26 @@ class QuasarConfig(PretrainedConfig):
|
|
83 |
Denotes beginning of sequences token id.
|
84 |
eos_token_id (`int`, *optional*, defaults to 2):
|
85 |
Denotes end of sequences token id.
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
Example:
|
88 |
|
89 |
```python
|
90 |
-
>>> from transformers import AutoModel,
|
91 |
|
92 |
|
93 |
-
>>> # Initializing a Quasar
|
94 |
-
>>> configuration =
|
95 |
|
96 |
>>> # Initializing a model from the configuration
|
97 |
>>> model = QuasarModel(configuration, trust_remote_code=True)
|
|
|
18 |
class QuasarConfig(PretrainedConfig):
|
19 |
r"""
|
20 |
This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
|
21 |
+
model according to the specified arguments, defining the model architecture.
|
|
|
|
|
22 |
|
23 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
24 |
documentation from [`PretrainedConfig`] for more information.
|
|
|
81 |
Denotes beginning of sequences token id.
|
82 |
eos_token_id (`int`, *optional*, defaults to 2):
|
83 |
Denotes end of sequences token id.
|
84 |
+
duplicate_trick (`bool`, *optional*, defaults to `True`):
|
85 |
+
Whether to use the trick of self layers calling
|
86 |
+
duplicate_grad (`bool`, *optional*, defaults to `True`):
|
87 |
+
Whether or not to do a double grad step during training. This is not compatible with Gradient Checkpointing
|
88 |
+
remove_ff_bias (`bool`, *optional*, defaults to `True`):
|
89 |
+
Whether or not to remove feed forward bias
|
90 |
+
gated_activation (`bool`, *optional*, defaults to `False`):
|
91 |
+
Whether or not to use a GeluGLU Activation
|
92 |
+
simple_norm (`bool`, *optional*, defaults to `False`):
|
93 |
+
Whether or not to use a simpler version of RMS Layer Norm
|
94 |
+
sliding_window (`int`, *optional*, defaults to 2048):
|
95 |
+
If specified, it enables a sliding context window to extend the model context from 2048 to 32K
|
96 |
Example:
|
97 |
|
98 |
```python
|
99 |
+
>>> from transformers import AutoModel, AutoConfig
|
100 |
|
101 |
|
102 |
+
>>> # Initializing a Quasar style configuration
|
103 |
+
>>> configuration = AutoConfig.from_pretrained("AstraMindAI/AstraQuasar-4B")
|
104 |
|
105 |
>>> # Initializing a model from the configuration
|
106 |
>>> model = QuasarModel(configuration, trust_remote_code=True)
|