mlinmg commited on
Commit
4f37118
1 Parent(s): a99a049

Update configuration_quasar.py

Browse files
Files changed (1) hide show
  1. configuration_quasar.py +16 -7
configuration_quasar.py CHANGED
@@ -18,9 +18,7 @@ QUASAR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
18
  class QuasarConfig(PretrainedConfig):
19
  r"""
20
  This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
21
- model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
22
- defaults will yield a similar configuration to that of the Quasar
23
- [microsoft/quasar-1](https://huggingface.co/microsoft/quasar-1).
24
 
25
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
26
  documentation from [`PretrainedConfig`] for more information.
@@ -83,15 +81,26 @@ class QuasarConfig(PretrainedConfig):
83
  Denotes beginning of sequences token id.
84
  eos_token_id (`int`, *optional*, defaults to 2):
85
  Denotes end of sequences token id.
86
-
 
 
 
 
 
 
 
 
 
 
 
87
  Example:
88
 
89
  ```python
90
- >>> from transformers import AutoModel, QuasarConfig
91
 
92
 
93
- >>> # Initializing a Quasar-1 style configuration
94
- >>> configuration = QuasarConfig.from_pretrained("AstraMindAI/AstraQuasar-4.5B")
95
 
96
  >>> # Initializing a model from the configuration
97
  >>> model = QuasarModel(configuration, trust_remote_code=True)
 
18
  class QuasarConfig(PretrainedConfig):
19
  r"""
20
  This is the configuration class to store the configuration of a [`QuasarModel`]. It is used to instantiate a Quasar
21
+ model according to the specified arguments, defining the model architecture.
 
 
22
 
23
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
24
  documentation from [`PretrainedConfig`] for more information.
 
81
  Denotes beginning of sequences token id.
82
  eos_token_id (`int`, *optional*, defaults to 2):
83
  Denotes end of sequences token id.
84
+ duplicate_trick (`bool`, *optional*, defaults to `True`):
85
+ Whether or not to use the layer self-duplication trick
86
+ duplicate_grad (`bool`, *optional*, defaults to `True`):
87
+ Whether or not to do a double grad step during training. This is not compatible with Gradient Checkpointing.
88
+ remove_ff_bias (`bool`, *optional*, defaults to `True`):
89
+ Whether or not to remove feed forward bias
90
+ gated_activation (`bool`, *optional*, defaults to `False`):
91
+ Whether or not to use a GeluGLU Activation
92
+ simple_norm (`bool`, *optional*, defaults to `False`):
93
+ Whether or not to use a simpler version of RMS Layer Norm
94
+ sliding_window (`int`, *optional*, defaults to 2048):
95
+ If specified it enables a sliding context window to extend the model context from 2048 to 32K
96
  Example:
97
 
98
  ```python
99
+ >>> from transformers import AutoModel, AutoConfig
100
 
101
 
102
+ >>> # Initializing a Quasar style configuration
103
+ >>> configuration = AutoConfig.from_pretrained("AstraMindAI/AstraQuasar-4B")
104
 
105
  >>> # Initializing a model from the configuration
106
  >>> model = QuasarModel(configuration, trust_remote_code=True)