Xenova (HF staff) committed on
Commit d1526e2
1 Parent(s): ae73fc0

Delete configuration_jais.py

Files changed (1)
  1. configuration_jais.py +0 -143
configuration_jais.py DELETED
@@ -1,143 +0,0 @@
- # coding=utf-8
- # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- # Copyright 2023 Cerebras Systems.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """ JAIS configuration"""
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
-
- logger = logging.get_logger(__name__)
-
- class JAISConfig(PretrainedConfig):
-     """
-     This is the configuration class to store the configuration of a [`JAISModel`]. It is used to
-     instantiate a JAIS model according to the specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 50257):
-             Vocabulary size of the JAIS model. Defines the number of different tokens that can be represented by the
-             `input_ids` passed when calling [`JAISModel`].
-         n_positions (`int`, *optional*, defaults to 1024):
-             The maximum sequence length that this model might ever be used with. Typically set this to something large
-             just in case (e.g., 512 or 1024 or 2048).
-         n_embd (`int`, *optional*, defaults to 768):
-             Dimensionality of the embeddings and hidden states.
-         n_layer (`int`, *optional*, defaults to 12):
-             Number of hidden layers in the Transformer encoder.
-         n_head (`int`, *optional*, defaults to 12):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         n_inner (`int`, *optional*, defaults to `None`):
-             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
-         activation_function (`str`, *optional*, defaults to `"gelu_new"`):
-             Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new", "swiglu"]`.
-         resid_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-         embd_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout ratio for the embeddings.
-         attn_pdrop (`float`, *optional*, defaults to 0.1):
-             The dropout ratio for the attention.
-         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-             The epsilon to use in the layer normalization layers.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         scale_attn_weights (`bool`, *optional*, defaults to `True`):
-             Scale attention weights by dividing by sqrt(hidden_size).
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models).
-         scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
-             Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
-         reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
-             Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
-             dot-product/softmax to float() when training with mixed precision.
-         position_embedding_type (`str`, *optional*, defaults to `"learned"`):
-             Positional embedding can be either `"alibi"` or `"learned"`.
-         width_scale (`float`, *optional*, defaults to 1.0):
-             muP parameter to scale output logits and initializers. Calculated as (`d_model,0 / d_model`),
-             where `d_model` is the model's width and `d_model,0` is the proxy model's width.
-         embeddings_scale (`float`, *optional*, defaults to 1.0):
-             muP parameter to scale token and position embeddings.
-         scale_qk_dot_by_d (`bool`, *optional*, defaults to `False`):
-             Scale attention weights by dividing by hidden_size instead of sqrt(hidden_size).
-             Requires `scale_attn_weights` to be set to `True` as well.
-
-     """
-
-     model_type = "jais"
-     keys_to_ignore_at_inference = ["past_key_values"]
-     attribute_map = {
-         "hidden_size": "n_embd",
-         "max_position_embeddings": "n_positions",
-         "num_attention_heads": "n_head",
-         "num_hidden_layers": "n_layer",
-     }
-
-     def __init__(
-         self,
-         vocab_size=50257,
-         n_positions=1024,
-         n_embd=768,
-         n_layer=12,
-         n_head=12,
-         n_inner=None,
-         activation_function="gelu_new",
-         resid_pdrop=0.1,
-         embd_pdrop=0.1,
-         attn_pdrop=0.1,
-         layer_norm_epsilon=1e-5,
-         initializer_range=0.02,
-         scale_attn_weights=True,
-         use_cache=True,
-         bos_token_id=50256,
-         eos_token_id=50256,
-         scale_attn_by_inverse_layer_idx=False,
-         reorder_and_upcast_attn=False,
-         position_embedding_type="learned",
-         width_scale=1.0,
-         embeddings_scale=1.0,
-         scale_qk_dot_by_d=False,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.n_positions = n_positions
-         self.n_embd = n_embd
-         self.n_layer = n_layer
-         self.n_head = n_head
-         self.n_inner = n_inner
-         self.activation_function = activation_function
-         self.resid_pdrop = resid_pdrop
-         self.embd_pdrop = embd_pdrop
-         self.attn_pdrop = attn_pdrop
-         self.layer_norm_epsilon = layer_norm_epsilon
-         self.initializer_range = initializer_range
-         self.scale_attn_weights = scale_attn_weights
-         self.use_cache = use_cache
-         self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
-         self.reorder_and_upcast_attn = reorder_and_upcast_attn
-
-         self.bos_token_id = bos_token_id
-         self.eos_token_id = eos_token_id
-
-         self.position_embedding_type = position_embedding_type
-         self.width_scale = width_scale
-         self.embeddings_scale = embeddings_scale
-         self.scale_qk_dot_by_d = scale_qk_dot_by_d
-
-         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
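
For reference, a minimal usage sketch of the class removed above. It assumes configuration_jais.py is still importable (e.g. from a repository revision prior to this deletion), and the hyperparameter values are illustrative only, not those of any released JAIS checkpoint.

# Minimal sketch: instantiate the JAIS configuration directly.
# Assumption: configuration_jais.py is on the import path (e.g. a checkout
# of a revision before this commit). Values below are illustrative only.
from configuration_jais import JAISConfig

config = JAISConfig(
    n_embd=1024,                      # hidden size
    n_layer=24,                       # number of transformer layers
    n_head=16,                        # attention heads per layer
    position_embedding_type="alibi",  # "alibi" or "learned"
)

# attribute_map exposes the standard transformers names as aliases.
print(config.hidden_size)              # 1024 (alias of n_embd)
print(config.max_position_embeddings)  # 1024 (alias of n_positions, default)
print(config.model_type)               # "jais"

Because JAISConfig inherits from PretrainedConfig, the usual save_pretrained / from_pretrained round trip also applies to it.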