prince-canuma commited on
Commit
738ec8a
1 Parent(s): 43e99fa

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - fr
5
+ - de
6
+ - es
7
+ - it
8
+ - pt
9
+ - ja
10
+ - ko
11
+ - zh
12
+ - ar
13
+ library_name: transformers
14
+ tags:
15
+ - mlx
16
+ ---
17
+
18
+ # mlx-community/c4ai-command-r-v01-4bit
19
+ This model was converted to MLX format from [`CohereForAI/c4ai-command-r-v01`](https://huggingface.co/CohereForAI/c4ai-command-r-v01).
20
+ Refer to the [original model card](https://huggingface.co/CohereForAI/c4ai-command-r-v01) for more details on the model.
21
+ ## Use with mlx
22
+
23
+ ```bash
24
+ pip install mlx-lm
25
+ ```
26
+
27
+ ```python
28
+ from mlx_lm import load, generate
29
+
30
+ model, tokenizer = load("mlx-community/c4ai-command-r-v01-4bit")
31
+ response = generate(model, tokenizer, prompt="hello", verbose=True)
32
+ ```
config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 256000,
3
+ "max_position_embeddings": 8192,
4
+ "hidden_size": 8192,
5
+ "intermediate_size": 22528,
6
+ "num_hidden_layers": 40,
7
+ "num_attention_heads": 64,
8
+ "num_key_value_heads": 64,
9
+ "hidden_act": "silu",
10
+ "initializer_range": 0.02,
11
+ "layer_norm_eps": 1e-05,
12
+ "pretraining_tp": 1,
13
+ "use_cache": true,
14
+ "rope_theta": 8000000.0,
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "return_dict": true,
18
+ "output_hidden_states": false,
19
+ "output_attentions": false,
20
+ "torchscript": false,
21
+ "torch_dtype": "float16",
22
+ "use_bfloat16": false,
23
+ "tf_legacy_loss": false,
24
+ "pruned_heads": {},
25
+ "tie_word_embeddings": true,
26
+ "chunk_size_feed_forward": 0,
27
+ "is_encoder_decoder": false,
28
+ "is_decoder": false,
29
+ "cross_attention_hidden_size": null,
30
+ "add_cross_attention": false,
31
+ "tie_encoder_decoder": false,
32
+ "max_length": 20,
33
+ "min_length": 0,
34
+ "do_sample": false,
35
+ "early_stopping": false,
36
+ "num_beams": 1,
37
+ "num_beam_groups": 1,
38
+ "diversity_penalty": 0.0,
39
+ "temperature": 1.0,
40
+ "top_k": 50,
41
+ "top_p": 1.0,
42
+ "typical_p": 1.0,
43
+ "repetition_penalty": 1.0,
44
+ "length_penalty": 1.0,
45
+ "no_repeat_ngram_size": 0,
46
+ "encoder_no_repeat_ngram_size": 0,
47
+ "bad_words_ids": null,
48
+ "num_return_sequences": 1,
49
+ "output_scores": false,
50
+ "return_dict_in_generate": false,
51
+ "forced_bos_token_id": null,
52
+ "forced_eos_token_id": null,
53
+ "remove_invalid_values": false,
54
+ "exponential_decay_length_penalty": null,
55
+ "suppress_tokens": null,
56
+ "begin_suppress_tokens": null,
57
+ "architectures": [
58
+ "CohereForCausalLM"
59
+ ],
60
+ "finetuning_task": null,
61
+ "id2label": {
62
+ "0": "LABEL_0",
63
+ "1": "LABEL_1"
64
+ },
65
+ "label2id": {
66
+ "LABEL_0": 0,
67
+ "LABEL_1": 1
68
+ },
69
+ "tokenizer_class": null,
70
+ "prefix": null,
71
+ "bos_token_id": 5,
72
+ "pad_token_id": 0,
73
+ "eos_token_id": 255001,
74
+ "sep_token_id": null,
75
+ "decoder_start_token_id": null,
76
+ "task_specific_params": null,
77
+ "problem_type": null,
78
+ "_name_or_path": "/home/.cache/huggingface/hub/models--CohereForAI--c4ai-command-r-v01/snapshots/f69b9efadcc5f2f0ed664388fa27c50f4825df9f",
79
+ "transformers_version": "4.38.2",
80
+ "auto_map": {
81
+ "AutoConfig": "configuration_cohere.CohereConfig",
82
+ "AutoModel": "modeling_cohere.CohereModel",
83
+ "AutoModelForCausalLM": "modeling_cohere.CohereForCausalLM"
84
+ },
85
+ "logit_scale": 0.0625,
86
+ "model_type": "cohere",
87
+ "quantization": {
88
+ "group_size": 64,
89
+ "bits": 4
90
+ }
91
+ }
configuration_cohere.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Cohere team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ Cohere model configuration"""
21
+
22
+ from transformers import PretrainedConfig, AutoConfig
23
+ from transformers.utils import logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
class CohereConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
    model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`CohereModel`]
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import CohereModel, CohereConfig

    >>> # Initializing a Cohere model configuration
    >>> configuration = CohereConfig()

    >>> # Initializing a model from the Cohere configuration
    >>> model = CohereModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "cohere"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=8192,
        intermediate_size=22528,
        num_hidden_layers=40,
        num_attention_heads=64,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=5,
        eos_token_id=255001,
        pretraining_tp=1,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: an unspecified key/value head count
        # falls back to the attention head count
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # The token ids, the embedding-tying flag and any extra keyword
        # arguments are forwarded to the PretrainedConfig initializer rather
        # than being set on the instance here.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
156
+
157
+
158
+ # register the model config to AutoConfig
159
+ AutoConfig.register("cohere", CohereConfig)
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4947f3c923dca18a94d54e95c8eb02a998b0cfa78c27728601e1fbd27587a28
3
+ size 5368224270
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1975291234fdbe5b5493a3452c626e413e8e1d9a44f5c4c767a3224c593d17e
3
+ size 5338001558
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8cdbbc6c9a8ad8732b66ad210d2aa7975c01993f2267a5604a86ab7f44d53b
3
+ size 5303906755
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bcea1209edca25c33acdc673de0f391094893d85ef539700d80654e29394556
3
+ size 5367886263
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6e8993a8219a9a8dc8665d34f014f07669b24b48cd5f8b69cd68a7ec3a67dc3
3
+ size 1313938283
model.safetensors.index.json ADDED
@@ -0,0 +1,889 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 22691856384
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
8
+ "model.layers.0.mlp.down_proj.biases": "model-00001-of-00005.safetensors",
9
+ "model.layers.0.mlp.down_proj.scales": "model-00001-of-00005.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
11
+ "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00005.safetensors",
12
+ "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00005.safetensors",
13
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
14
+ "model.layers.0.mlp.up_proj.biases": "model-00001-of-00005.safetensors",
15
+ "model.layers.0.mlp.up_proj.scales": "model-00001-of-00005.safetensors",
16
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
17
+ "model.layers.0.self_attn.k_proj.biases": "model-00001-of-00005.safetensors",
18
+ "model.layers.0.self_attn.k_proj.scales": "model-00001-of-00005.safetensors",
19
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
20
+ "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00005.safetensors",
21
+ "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00005.safetensors",
22
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
23
+ "model.layers.0.self_attn.q_proj.biases": "model-00001-of-00005.safetensors",
24
+ "model.layers.0.self_attn.q_proj.scales": "model-00001-of-00005.safetensors",
25
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
26
+ "model.layers.0.self_attn.v_proj.biases": "model-00001-of-00005.safetensors",
27
+ "model.layers.0.self_attn.v_proj.scales": "model-00001-of-00005.safetensors",
28
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
29
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
30
+ "model.layers.1.mlp.down_proj.biases": "model-00001-of-00005.safetensors",
31
+ "model.layers.1.mlp.down_proj.scales": "model-00001-of-00005.safetensors",
32
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
33
+ "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00005.safetensors",
34
+ "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00005.safetensors",
35
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
36
+ "model.layers.1.mlp.up_proj.biases": "model-00001-of-00005.safetensors",
37
+ "model.layers.1.mlp.up_proj.scales": "model-00001-of-00005.safetensors",
38
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
39
+ "model.layers.1.self_attn.k_proj.biases": "model-00001-of-00005.safetensors",
40
+ "model.layers.1.self_attn.k_proj.scales": "model-00001-of-00005.safetensors",
41
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
42
+ "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00005.safetensors",
43
+ "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00005.safetensors",
44
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
45
+ "model.layers.1.self_attn.q_proj.biases": "model-00001-of-00005.safetensors",
46
+ "model.layers.1.self_attn.q_proj.scales": "model-00001-of-00005.safetensors",
47
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
48
+ "model.layers.1.self_attn.v_proj.biases": "model-00001-of-00005.safetensors",
49
+ "model.layers.1.self_attn.v_proj.scales": "model-00001-of-00005.safetensors",
50
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
51
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
52
+ "model.layers.10.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
53
+ "model.layers.10.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
54
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
55
+ "model.layers.10.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
56
+ "model.layers.10.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
57
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
58
+ "model.layers.10.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
59
+ "model.layers.10.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
60
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
61
+ "model.layers.10.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
62
+ "model.layers.10.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
63
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
64
+ "model.layers.10.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
65
+ "model.layers.10.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
66
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
67
+ "model.layers.10.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
68
+ "model.layers.10.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
69
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
70
+ "model.layers.10.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
71
+ "model.layers.10.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
72
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
73
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
74
+ "model.layers.11.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
75
+ "model.layers.11.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
76
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
77
+ "model.layers.11.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
78
+ "model.layers.11.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
79
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
80
+ "model.layers.11.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
81
+ "model.layers.11.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
82
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
83
+ "model.layers.11.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
84
+ "model.layers.11.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
85
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
86
+ "model.layers.11.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
87
+ "model.layers.11.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
88
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
89
+ "model.layers.11.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
90
+ "model.layers.11.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
91
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
92
+ "model.layers.11.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
93
+ "model.layers.11.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
94
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
95
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
96
+ "model.layers.12.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
97
+ "model.layers.12.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
98
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
99
+ "model.layers.12.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
100
+ "model.layers.12.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
101
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
102
+ "model.layers.12.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
103
+ "model.layers.12.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
104
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
105
+ "model.layers.12.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
106
+ "model.layers.12.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
107
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
108
+ "model.layers.12.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
109
+ "model.layers.12.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
110
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
111
+ "model.layers.12.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
112
+ "model.layers.12.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
113
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
114
+ "model.layers.12.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
115
+ "model.layers.12.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
116
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
117
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
118
+ "model.layers.13.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
119
+ "model.layers.13.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
120
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
121
+ "model.layers.13.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
122
+ "model.layers.13.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
123
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
124
+ "model.layers.13.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
125
+ "model.layers.13.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
126
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
127
+ "model.layers.13.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
128
+ "model.layers.13.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
129
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
130
+ "model.layers.13.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
131
+ "model.layers.13.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
132
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
133
+ "model.layers.13.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
134
+ "model.layers.13.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
135
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
136
+ "model.layers.13.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
137
+ "model.layers.13.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
138
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
139
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00005.safetensors",
140
+ "model.layers.14.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
141
+ "model.layers.14.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
142
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
143
+ "model.layers.14.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
144
+ "model.layers.14.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
145
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
146
+ "model.layers.14.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
147
+ "model.layers.14.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
148
+ "model.layers.14.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
149
+ "model.layers.14.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
150
+ "model.layers.14.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
151
+ "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
152
+ "model.layers.14.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
153
+ "model.layers.14.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
154
+ "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
155
+ "model.layers.14.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
156
+ "model.layers.14.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
157
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
158
+ "model.layers.14.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
159
+ "model.layers.14.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
160
+ "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
161
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00005.safetensors",
162
+ "model.layers.15.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
163
+ "model.layers.15.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
164
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
165
+ "model.layers.15.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
166
+ "model.layers.15.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
167
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
168
+ "model.layers.15.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
169
+ "model.layers.15.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
170
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
171
+ "model.layers.15.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
172
+ "model.layers.15.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
173
+ "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
174
+ "model.layers.15.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
175
+ "model.layers.15.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
176
+ "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
177
+ "model.layers.15.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
178
+ "model.layers.15.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
179
+ "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
180
+ "model.layers.15.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
181
+ "model.layers.15.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
182
+ "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
183
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00005.safetensors",
184
+ "model.layers.16.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
185
+ "model.layers.16.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
186
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
187
+ "model.layers.16.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
188
+ "model.layers.16.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
189
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
190
+ "model.layers.16.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
191
+ "model.layers.16.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
192
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
193
+ "model.layers.16.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
194
+ "model.layers.16.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
195
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
196
+ "model.layers.16.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
197
+ "model.layers.16.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
198
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
199
+ "model.layers.16.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
200
+ "model.layers.16.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
201
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
202
+ "model.layers.16.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
203
+ "model.layers.16.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
204
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
205
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00005.safetensors",
206
+ "model.layers.17.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
207
+ "model.layers.17.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
208
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
209
+ "model.layers.17.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
210
+ "model.layers.17.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
211
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
212
+ "model.layers.17.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
213
+ "model.layers.17.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
214
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
215
+ "model.layers.17.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
216
+ "model.layers.17.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
217
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
218
+ "model.layers.17.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
219
+ "model.layers.17.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
220
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
221
+ "model.layers.17.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
222
+ "model.layers.17.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
223
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
224
+ "model.layers.17.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
225
+ "model.layers.17.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
226
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
227
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
228
+ "model.layers.18.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
229
+ "model.layers.18.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
230
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
231
+ "model.layers.18.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
232
+ "model.layers.18.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
233
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
234
+ "model.layers.18.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
235
+ "model.layers.18.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
236
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
237
+ "model.layers.18.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
238
+ "model.layers.18.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
239
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
240
+ "model.layers.18.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
241
+ "model.layers.18.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
242
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
243
+ "model.layers.18.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
244
+ "model.layers.18.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
245
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
246
+ "model.layers.18.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
247
+ "model.layers.18.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
248
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
249
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
250
+ "model.layers.19.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
251
+ "model.layers.19.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
252
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
253
+ "model.layers.19.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
254
+ "model.layers.19.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
255
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
256
+ "model.layers.19.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
257
+ "model.layers.19.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
258
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
259
+ "model.layers.19.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
260
+ "model.layers.19.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
261
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
262
+ "model.layers.19.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
263
+ "model.layers.19.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
264
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
265
+ "model.layers.19.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
266
+ "model.layers.19.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
267
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
268
+ "model.layers.19.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
269
+ "model.layers.19.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
270
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
271
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00005.safetensors",
272
+ "model.layers.2.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
273
+ "model.layers.2.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
274
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
275
+ "model.layers.2.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
276
+ "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00005.safetensors",
277
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
278
+ "model.layers.2.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
279
+ "model.layers.2.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
280
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
281
+ "model.layers.2.self_attn.k_proj.biases": "model-00001-of-00005.safetensors",
282
+ "model.layers.2.self_attn.k_proj.scales": "model-00001-of-00005.safetensors",
283
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
284
+ "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00005.safetensors",
285
+ "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00005.safetensors",
286
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
287
+ "model.layers.2.self_attn.q_proj.biases": "model-00001-of-00005.safetensors",
288
+ "model.layers.2.self_attn.q_proj.scales": "model-00001-of-00005.safetensors",
289
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
290
+ "model.layers.2.self_attn.v_proj.biases": "model-00001-of-00005.safetensors",
291
+ "model.layers.2.self_attn.v_proj.scales": "model-00001-of-00005.safetensors",
292
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
293
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
294
+ "model.layers.20.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
295
+ "model.layers.20.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
296
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
297
+ "model.layers.20.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
298
+ "model.layers.20.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
299
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
300
+ "model.layers.20.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
301
+ "model.layers.20.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
302
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
303
+ "model.layers.20.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
304
+ "model.layers.20.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
305
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
306
+ "model.layers.20.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
307
+ "model.layers.20.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
308
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
309
+ "model.layers.20.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
310
+ "model.layers.20.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
311
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
312
+ "model.layers.20.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
313
+ "model.layers.20.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
314
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
315
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
316
+ "model.layers.21.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
317
+ "model.layers.21.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
318
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
319
+ "model.layers.21.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
320
+ "model.layers.21.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
321
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
322
+ "model.layers.21.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
323
+ "model.layers.21.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
324
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
325
+ "model.layers.21.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
326
+ "model.layers.21.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
327
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
328
+ "model.layers.21.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
329
+ "model.layers.21.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
330
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
331
+ "model.layers.21.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
332
+ "model.layers.21.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
333
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
334
+ "model.layers.21.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
335
+ "model.layers.21.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
336
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
337
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
338
+ "model.layers.22.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
339
+ "model.layers.22.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
340
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
341
+ "model.layers.22.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
342
+ "model.layers.22.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
343
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
344
+ "model.layers.22.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
345
+ "model.layers.22.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
346
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
347
+ "model.layers.22.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
348
+ "model.layers.22.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
349
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
350
+ "model.layers.22.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
351
+ "model.layers.22.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
352
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
353
+ "model.layers.22.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
354
+ "model.layers.22.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
355
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
356
+ "model.layers.22.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
357
+ "model.layers.22.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
358
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
359
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
360
+ "model.layers.23.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
361
+ "model.layers.23.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
362
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
363
+ "model.layers.23.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
364
+ "model.layers.23.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
365
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
366
+ "model.layers.23.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
367
+ "model.layers.23.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
368
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
369
+ "model.layers.23.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
370
+ "model.layers.23.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
371
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
372
+ "model.layers.23.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
373
+ "model.layers.23.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
374
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
375
+ "model.layers.23.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
376
+ "model.layers.23.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
377
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
378
+ "model.layers.23.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
379
+ "model.layers.23.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
380
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
381
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
382
+ "model.layers.24.mlp.down_proj.biases": "model-00003-of-00005.safetensors",
383
+ "model.layers.24.mlp.down_proj.scales": "model-00003-of-00005.safetensors",
384
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
385
+ "model.layers.24.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
386
+ "model.layers.24.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
387
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
388
+ "model.layers.24.mlp.up_proj.biases": "model-00003-of-00005.safetensors",
389
+ "model.layers.24.mlp.up_proj.scales": "model-00003-of-00005.safetensors",
390
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
391
+ "model.layers.24.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
392
+ "model.layers.24.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
393
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
394
+ "model.layers.24.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
395
+ "model.layers.24.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
396
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
397
+ "model.layers.24.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
398
+ "model.layers.24.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
399
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
400
+ "model.layers.24.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
401
+ "model.layers.24.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
402
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
403
+ "model.layers.25.input_layernorm.weight": "model-00004-of-00005.safetensors",
404
+ "model.layers.25.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
405
+ "model.layers.25.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
406
+ "model.layers.25.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
407
+ "model.layers.25.mlp.gate_proj.biases": "model-00003-of-00005.safetensors",
408
+ "model.layers.25.mlp.gate_proj.scales": "model-00003-of-00005.safetensors",
409
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
410
+ "model.layers.25.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
411
+ "model.layers.25.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
412
+ "model.layers.25.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
413
+ "model.layers.25.self_attn.k_proj.biases": "model-00003-of-00005.safetensors",
414
+ "model.layers.25.self_attn.k_proj.scales": "model-00003-of-00005.safetensors",
415
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
416
+ "model.layers.25.self_attn.o_proj.biases": "model-00003-of-00005.safetensors",
417
+ "model.layers.25.self_attn.o_proj.scales": "model-00003-of-00005.safetensors",
418
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
419
+ "model.layers.25.self_attn.q_proj.biases": "model-00003-of-00005.safetensors",
420
+ "model.layers.25.self_attn.q_proj.scales": "model-00003-of-00005.safetensors",
421
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
422
+ "model.layers.25.self_attn.v_proj.biases": "model-00003-of-00005.safetensors",
423
+ "model.layers.25.self_attn.v_proj.scales": "model-00003-of-00005.safetensors",
424
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
425
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00005.safetensors",
426
+ "model.layers.26.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
427
+ "model.layers.26.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
428
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
429
+ "model.layers.26.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
430
+ "model.layers.26.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
431
+ "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
432
+ "model.layers.26.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
433
+ "model.layers.26.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
434
+ "model.layers.26.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
435
+ "model.layers.26.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
436
+ "model.layers.26.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
437
+ "model.layers.26.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
438
+ "model.layers.26.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
439
+ "model.layers.26.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
440
+ "model.layers.26.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
441
+ "model.layers.26.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
442
+ "model.layers.26.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
443
+ "model.layers.26.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
444
+ "model.layers.26.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
445
+ "model.layers.26.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
446
+ "model.layers.26.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
447
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00005.safetensors",
448
+ "model.layers.27.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
449
+ "model.layers.27.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
450
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
451
+ "model.layers.27.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
452
+ "model.layers.27.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
453
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
454
+ "model.layers.27.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
455
+ "model.layers.27.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
456
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
457
+ "model.layers.27.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
458
+ "model.layers.27.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
459
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
460
+ "model.layers.27.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
461
+ "model.layers.27.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
462
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
463
+ "model.layers.27.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
464
+ "model.layers.27.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
465
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
466
+ "model.layers.27.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
467
+ "model.layers.27.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
468
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
469
+ "model.layers.28.input_layernorm.weight": "model-00004-of-00005.safetensors",
470
+ "model.layers.28.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
471
+ "model.layers.28.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
472
+ "model.layers.28.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
473
+ "model.layers.28.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
474
+ "model.layers.28.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
475
+ "model.layers.28.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
476
+ "model.layers.28.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
477
+ "model.layers.28.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
478
+ "model.layers.28.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
479
+ "model.layers.28.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
480
+ "model.layers.28.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
481
+ "model.layers.28.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
482
+ "model.layers.28.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
483
+ "model.layers.28.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
484
+ "model.layers.28.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
485
+ "model.layers.28.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
486
+ "model.layers.28.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
487
+ "model.layers.28.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
488
+ "model.layers.28.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
489
+ "model.layers.28.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
490
+ "model.layers.28.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
491
+ "model.layers.29.input_layernorm.weight": "model-00004-of-00005.safetensors",
492
+ "model.layers.29.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
493
+ "model.layers.29.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
494
+ "model.layers.29.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
495
+ "model.layers.29.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
496
+ "model.layers.29.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
497
+ "model.layers.29.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
498
+ "model.layers.29.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
499
+ "model.layers.29.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
500
+ "model.layers.29.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
501
+ "model.layers.29.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
502
+ "model.layers.29.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
503
+ "model.layers.29.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
504
+ "model.layers.29.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
505
+ "model.layers.29.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
506
+ "model.layers.29.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
507
+ "model.layers.29.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
508
+ "model.layers.29.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
509
+ "model.layers.29.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
510
+ "model.layers.29.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
511
+ "model.layers.29.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
512
+ "model.layers.29.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
513
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00005.safetensors",
514
+ "model.layers.3.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
515
+ "model.layers.3.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
516
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
517
+ "model.layers.3.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
518
+ "model.layers.3.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
519
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
520
+ "model.layers.3.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
521
+ "model.layers.3.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
522
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
523
+ "model.layers.3.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
524
+ "model.layers.3.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
525
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
526
+ "model.layers.3.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
527
+ "model.layers.3.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
528
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
529
+ "model.layers.3.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
530
+ "model.layers.3.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
531
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
532
+ "model.layers.3.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
533
+ "model.layers.3.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
534
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
535
+ "model.layers.30.input_layernorm.weight": "model-00004-of-00005.safetensors",
536
+ "model.layers.30.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
537
+ "model.layers.30.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
538
+ "model.layers.30.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
539
+ "model.layers.30.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
540
+ "model.layers.30.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
541
+ "model.layers.30.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
542
+ "model.layers.30.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
543
+ "model.layers.30.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
544
+ "model.layers.30.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
545
+ "model.layers.30.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
546
+ "model.layers.30.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
547
+ "model.layers.30.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
548
+ "model.layers.30.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
549
+ "model.layers.30.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
550
+ "model.layers.30.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
551
+ "model.layers.30.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
552
+ "model.layers.30.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
553
+ "model.layers.30.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
554
+ "model.layers.30.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
555
+ "model.layers.30.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
556
+ "model.layers.30.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
557
+ "model.layers.31.input_layernorm.weight": "model-00004-of-00005.safetensors",
558
+ "model.layers.31.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
559
+ "model.layers.31.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
560
+ "model.layers.31.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
561
+ "model.layers.31.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
562
+ "model.layers.31.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
563
+ "model.layers.31.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
564
+ "model.layers.31.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
565
+ "model.layers.31.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
566
+ "model.layers.31.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
567
+ "model.layers.31.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
568
+ "model.layers.31.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
569
+ "model.layers.31.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
570
+ "model.layers.31.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
571
+ "model.layers.31.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
572
+ "model.layers.31.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
573
+ "model.layers.31.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
574
+ "model.layers.31.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
575
+ "model.layers.31.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
576
+ "model.layers.31.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
577
+ "model.layers.31.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
578
+ "model.layers.31.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
579
+ "model.layers.32.input_layernorm.weight": "model-00004-of-00005.safetensors",
580
+ "model.layers.32.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
581
+ "model.layers.32.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
582
+ "model.layers.32.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
583
+ "model.layers.32.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
584
+ "model.layers.32.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
585
+ "model.layers.32.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
586
+ "model.layers.32.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
587
+ "model.layers.32.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
588
+ "model.layers.32.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
589
+ "model.layers.32.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
590
+ "model.layers.32.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
591
+ "model.layers.32.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
592
+ "model.layers.32.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
593
+ "model.layers.32.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
594
+ "model.layers.32.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
595
+ "model.layers.32.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
596
+ "model.layers.32.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
597
+ "model.layers.32.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
598
+ "model.layers.32.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
599
+ "model.layers.32.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
600
+ "model.layers.32.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
601
+ "model.layers.33.input_layernorm.weight": "model-00004-of-00005.safetensors",
602
+ "model.layers.33.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
603
+ "model.layers.33.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
604
+ "model.layers.33.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
605
+ "model.layers.33.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
606
+ "model.layers.33.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
607
+ "model.layers.33.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
608
+ "model.layers.33.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
609
+ "model.layers.33.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
610
+ "model.layers.33.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
611
+ "model.layers.33.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
612
+ "model.layers.33.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
613
+ "model.layers.33.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
614
+ "model.layers.33.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
615
+ "model.layers.33.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
616
+ "model.layers.33.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
617
+ "model.layers.33.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
618
+ "model.layers.33.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
619
+ "model.layers.33.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
620
+ "model.layers.33.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
621
+ "model.layers.33.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
622
+ "model.layers.33.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
623
+ "model.layers.34.input_layernorm.weight": "model-00004-of-00005.safetensors",
624
+ "model.layers.34.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
625
+ "model.layers.34.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
626
+ "model.layers.34.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
627
+ "model.layers.34.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
628
+ "model.layers.34.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
629
+ "model.layers.34.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
630
+ "model.layers.34.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
631
+ "model.layers.34.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
632
+ "model.layers.34.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
633
+ "model.layers.34.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
634
+ "model.layers.34.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
635
+ "model.layers.34.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
636
+ "model.layers.34.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
637
+ "model.layers.34.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
638
+ "model.layers.34.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
639
+ "model.layers.34.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
640
+ "model.layers.34.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
641
+ "model.layers.34.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
642
+ "model.layers.34.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
643
+ "model.layers.34.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
644
+ "model.layers.34.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
645
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00005.safetensors",
646
+ "model.layers.35.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
647
+ "model.layers.35.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
648
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
649
+ "model.layers.35.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
650
+ "model.layers.35.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
651
+ "model.layers.35.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
652
+ "model.layers.35.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
653
+ "model.layers.35.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
654
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
655
+ "model.layers.35.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
656
+ "model.layers.35.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
657
+ "model.layers.35.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
658
+ "model.layers.35.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
659
+ "model.layers.35.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
660
+ "model.layers.35.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
661
+ "model.layers.35.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
662
+ "model.layers.35.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
663
+ "model.layers.35.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
664
+ "model.layers.35.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
665
+ "model.layers.35.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
666
+ "model.layers.35.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
667
+ "model.layers.36.input_layernorm.weight": "model-00004-of-00005.safetensors",
668
+ "model.layers.36.mlp.down_proj.biases": "model-00004-of-00005.safetensors",
669
+ "model.layers.36.mlp.down_proj.scales": "model-00004-of-00005.safetensors",
670
+ "model.layers.36.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
671
+ "model.layers.36.mlp.gate_proj.biases": "model-00004-of-00005.safetensors",
672
+ "model.layers.36.mlp.gate_proj.scales": "model-00004-of-00005.safetensors",
673
+ "model.layers.36.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
674
+ "model.layers.36.mlp.up_proj.biases": "model-00004-of-00005.safetensors",
675
+ "model.layers.36.mlp.up_proj.scales": "model-00004-of-00005.safetensors",
676
+ "model.layers.36.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
677
+ "model.layers.36.self_attn.k_proj.biases": "model-00004-of-00005.safetensors",
678
+ "model.layers.36.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
679
+ "model.layers.36.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
680
+ "model.layers.36.self_attn.o_proj.biases": "model-00004-of-00005.safetensors",
681
+ "model.layers.36.self_attn.o_proj.scales": "model-00004-of-00005.safetensors",
682
+ "model.layers.36.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
683
+ "model.layers.36.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
684
+ "model.layers.36.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
685
+ "model.layers.36.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
686
+ "model.layers.36.self_attn.v_proj.biases": "model-00004-of-00005.safetensors",
687
+ "model.layers.36.self_attn.v_proj.scales": "model-00004-of-00005.safetensors",
688
+ "model.layers.36.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
689
+ "model.layers.37.input_layernorm.weight": "model-00005-of-00005.safetensors",
690
+ "model.layers.37.mlp.down_proj.biases": "model-00005-of-00005.safetensors",
691
+ "model.layers.37.mlp.down_proj.scales": "model-00005-of-00005.safetensors",
692
+ "model.layers.37.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
693
+ "model.layers.37.mlp.gate_proj.biases": "model-00005-of-00005.safetensors",
694
+ "model.layers.37.mlp.gate_proj.scales": "model-00005-of-00005.safetensors",
695
+ "model.layers.37.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
696
+ "model.layers.37.mlp.up_proj.biases": "model-00005-of-00005.safetensors",
697
+ "model.layers.37.mlp.up_proj.scales": "model-00005-of-00005.safetensors",
698
+ "model.layers.37.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
699
+ "model.layers.37.self_attn.k_proj.biases": "model-00005-of-00005.safetensors",
700
+ "model.layers.37.self_attn.k_proj.scales": "model-00004-of-00005.safetensors",
701
+ "model.layers.37.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
702
+ "model.layers.37.self_attn.o_proj.biases": "model-00005-of-00005.safetensors",
703
+ "model.layers.37.self_attn.o_proj.scales": "model-00005-of-00005.safetensors",
704
+ "model.layers.37.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
705
+ "model.layers.37.self_attn.q_proj.biases": "model-00004-of-00005.safetensors",
706
+ "model.layers.37.self_attn.q_proj.scales": "model-00004-of-00005.safetensors",
707
+ "model.layers.37.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
708
+ "model.layers.37.self_attn.v_proj.biases": "model-00005-of-00005.safetensors",
709
+ "model.layers.37.self_attn.v_proj.scales": "model-00005-of-00005.safetensors",
710
+ "model.layers.37.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
711
+ "model.layers.38.input_layernorm.weight": "model-00005-of-00005.safetensors",
712
+ "model.layers.38.mlp.down_proj.biases": "model-00005-of-00005.safetensors",
713
+ "model.layers.38.mlp.down_proj.scales": "model-00005-of-00005.safetensors",
714
+ "model.layers.38.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
715
+ "model.layers.38.mlp.gate_proj.biases": "model-00005-of-00005.safetensors",
716
+ "model.layers.38.mlp.gate_proj.scales": "model-00005-of-00005.safetensors",
717
+ "model.layers.38.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
718
+ "model.layers.38.mlp.up_proj.biases": "model-00005-of-00005.safetensors",
719
+ "model.layers.38.mlp.up_proj.scales": "model-00005-of-00005.safetensors",
720
+ "model.layers.38.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
721
+ "model.layers.38.self_attn.k_proj.biases": "model-00005-of-00005.safetensors",
722
+ "model.layers.38.self_attn.k_proj.scales": "model-00005-of-00005.safetensors",
723
+ "model.layers.38.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
724
+ "model.layers.38.self_attn.o_proj.biases": "model-00005-of-00005.safetensors",
725
+ "model.layers.38.self_attn.o_proj.scales": "model-00005-of-00005.safetensors",
726
+ "model.layers.38.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
727
+ "model.layers.38.self_attn.q_proj.biases": "model-00005-of-00005.safetensors",
728
+ "model.layers.38.self_attn.q_proj.scales": "model-00005-of-00005.safetensors",
729
+ "model.layers.38.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
730
+ "model.layers.38.self_attn.v_proj.biases": "model-00005-of-00005.safetensors",
731
+ "model.layers.38.self_attn.v_proj.scales": "model-00005-of-00005.safetensors",
732
+ "model.layers.38.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
733
+ "model.layers.39.input_layernorm.weight": "model-00005-of-00005.safetensors",
734
+ "model.layers.39.mlp.down_proj.biases": "model-00005-of-00005.safetensors",
735
+ "model.layers.39.mlp.down_proj.scales": "model-00005-of-00005.safetensors",
736
+ "model.layers.39.mlp.down_proj.weight": "model-00005-of-00005.safetensors",
737
+ "model.layers.39.mlp.gate_proj.biases": "model-00005-of-00005.safetensors",
738
+ "model.layers.39.mlp.gate_proj.scales": "model-00005-of-00005.safetensors",
739
+ "model.layers.39.mlp.gate_proj.weight": "model-00005-of-00005.safetensors",
740
+ "model.layers.39.mlp.up_proj.biases": "model-00005-of-00005.safetensors",
741
+ "model.layers.39.mlp.up_proj.scales": "model-00005-of-00005.safetensors",
742
+ "model.layers.39.mlp.up_proj.weight": "model-00005-of-00005.safetensors",
743
+ "model.layers.39.self_attn.k_proj.biases": "model-00005-of-00005.safetensors",
744
+ "model.layers.39.self_attn.k_proj.scales": "model-00005-of-00005.safetensors",
745
+ "model.layers.39.self_attn.k_proj.weight": "model-00005-of-00005.safetensors",
746
+ "model.layers.39.self_attn.o_proj.biases": "model-00005-of-00005.safetensors",
747
+ "model.layers.39.self_attn.o_proj.scales": "model-00005-of-00005.safetensors",
748
+ "model.layers.39.self_attn.o_proj.weight": "model-00005-of-00005.safetensors",
749
+ "model.layers.39.self_attn.q_proj.biases": "model-00005-of-00005.safetensors",
750
+ "model.layers.39.self_attn.q_proj.scales": "model-00005-of-00005.safetensors",
751
+ "model.layers.39.self_attn.q_proj.weight": "model-00005-of-00005.safetensors",
752
+ "model.layers.39.self_attn.v_proj.biases": "model-00005-of-00005.safetensors",
753
+ "model.layers.39.self_attn.v_proj.scales": "model-00005-of-00005.safetensors",
754
+ "model.layers.39.self_attn.v_proj.weight": "model-00005-of-00005.safetensors",
755
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00005.safetensors",
756
+ "model.layers.4.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
757
+ "model.layers.4.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
758
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
759
+ "model.layers.4.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
760
+ "model.layers.4.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
761
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
762
+ "model.layers.4.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
763
+ "model.layers.4.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
764
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
765
+ "model.layers.4.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
766
+ "model.layers.4.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
767
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
768
+ "model.layers.4.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
769
+ "model.layers.4.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
770
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
771
+ "model.layers.4.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
772
+ "model.layers.4.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
773
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
774
+ "model.layers.4.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
775
+ "model.layers.4.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
776
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
777
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00005.safetensors",
778
+ "model.layers.5.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
779
+ "model.layers.5.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
780
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
781
+ "model.layers.5.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
782
+ "model.layers.5.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
783
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
784
+ "model.layers.5.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
785
+ "model.layers.5.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
786
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
787
+ "model.layers.5.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
788
+ "model.layers.5.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
789
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
790
+ "model.layers.5.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
791
+ "model.layers.5.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
792
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
793
+ "model.layers.5.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
794
+ "model.layers.5.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
795
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
796
+ "model.layers.5.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
797
+ "model.layers.5.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
798
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
799
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00005.safetensors",
800
+ "model.layers.6.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
801
+ "model.layers.6.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
802
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
803
+ "model.layers.6.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
804
+ "model.layers.6.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
805
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
806
+ "model.layers.6.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
807
+ "model.layers.6.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
808
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
809
+ "model.layers.6.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
810
+ "model.layers.6.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
811
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
812
+ "model.layers.6.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
813
+ "model.layers.6.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
814
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
815
+ "model.layers.6.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
816
+ "model.layers.6.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
817
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
818
+ "model.layers.6.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
819
+ "model.layers.6.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
820
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
821
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00005.safetensors",
822
+ "model.layers.7.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
823
+ "model.layers.7.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
824
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
825
+ "model.layers.7.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
826
+ "model.layers.7.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
827
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
828
+ "model.layers.7.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
829
+ "model.layers.7.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
830
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
831
+ "model.layers.7.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
832
+ "model.layers.7.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
833
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
834
+ "model.layers.7.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
835
+ "model.layers.7.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
836
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
837
+ "model.layers.7.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
838
+ "model.layers.7.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
839
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
840
+ "model.layers.7.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
841
+ "model.layers.7.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
842
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
843
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
844
+ "model.layers.8.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
845
+ "model.layers.8.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
846
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
847
+ "model.layers.8.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
848
+ "model.layers.8.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
849
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
850
+ "model.layers.8.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
851
+ "model.layers.8.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
852
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
853
+ "model.layers.8.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
854
+ "model.layers.8.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
855
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
856
+ "model.layers.8.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
857
+ "model.layers.8.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
858
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
859
+ "model.layers.8.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
860
+ "model.layers.8.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
861
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
862
+ "model.layers.8.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
863
+ "model.layers.8.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
864
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
865
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
866
+ "model.layers.9.mlp.down_proj.biases": "model-00002-of-00005.safetensors",
867
+ "model.layers.9.mlp.down_proj.scales": "model-00002-of-00005.safetensors",
868
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
869
+ "model.layers.9.mlp.gate_proj.biases": "model-00002-of-00005.safetensors",
870
+ "model.layers.9.mlp.gate_proj.scales": "model-00002-of-00005.safetensors",
871
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
872
+ "model.layers.9.mlp.up_proj.biases": "model-00002-of-00005.safetensors",
873
+ "model.layers.9.mlp.up_proj.scales": "model-00002-of-00005.safetensors",
874
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
875
+ "model.layers.9.self_attn.k_proj.biases": "model-00002-of-00005.safetensors",
876
+ "model.layers.9.self_attn.k_proj.scales": "model-00002-of-00005.safetensors",
877
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
878
+ "model.layers.9.self_attn.o_proj.biases": "model-00002-of-00005.safetensors",
879
+ "model.layers.9.self_attn.o_proj.scales": "model-00002-of-00005.safetensors",
880
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
881
+ "model.layers.9.self_attn.q_proj.biases": "model-00002-of-00005.safetensors",
882
+ "model.layers.9.self_attn.q_proj.scales": "model-00002-of-00005.safetensors",
883
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
884
+ "model.layers.9.self_attn.v_proj.biases": "model-00002-of-00005.safetensors",
885
+ "model.layers.9.self_attn.v_proj.scales": "model-00002-of-00005.safetensors",
886
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
887
+ "model.norm.weight": "model-00005-of-00005.safetensors"
888
+ }
889
+ }
modeling_cohere.py ADDED
@@ -0,0 +1,1281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ # This file is based on the LLama model definition file in transformers
22
+
23
+ """PyTorch Cohere model."""
24
+
25
+ import math
26
+ import warnings
27
+ from typing import List, Optional, Tuple, Union
28
+
29
+ import torch
30
+ import torch.nn.functional as F
31
+ import torch.utils.checkpoint
32
+ from torch import nn
33
+ from torch.nn import CrossEntropyLoss
34
+
35
+ from transformers import AutoModel, AutoModelForCausalLM
36
+ from transformers.activations import ACT2FN
37
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
38
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
39
+ from transformers.modeling_outputs import (
40
+ BaseModelOutputWithPast,
41
+ CausalLMOutputWithPast,
42
+ )
43
+ from transformers.modeling_utils import PreTrainedModel
44
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
45
+ from transformers.utils import (
46
+ add_start_docstrings,
47
+ add_start_docstrings_to_model_forward,
48
+ is_flash_attn_2_available,
49
+ is_flash_attn_greater_or_equal_2_10,
50
+ logging,
51
+ replace_return_docstrings,
52
+ )
53
+ from .configuration_cohere import CohereConfig
54
+
55
+
56
+ logger = logging.get_logger(__name__)
57
+
58
+ _CONFIG_FOR_DOC = "CohereConfig"
59
+
60
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
61
def _get_unpad_data(attention_mask):
    """Derive un-padding metadata from an attention mask.

    The mask is summed over its last dimension to obtain one count per row,
    and flattened to locate its non-zero entries.
    NOTE(review): presumably a (batch, seq_len) 0/1 padding mask -- confirm
    against the caller.

    Returns:
        A 3-tuple ``(indices, cu_seqlens, max_seqlen_in_batch)``:
        * ``indices`` -- 1-D tensor with the positions of the non-zero
          entries of the flattened mask.
        * ``cu_seqlens`` -- int32 cumulative sum of the per-row counts with a
          single leading 0 prepended.
        * ``max_seqlen_in_batch`` -- the largest per-row count, as a Python
          number.
    """
    flat_mask = attention_mask.flatten()
    indices = torch.nonzero(flat_mask, as_tuple=False).flatten()

    tokens_per_row = attention_mask.sum(dim=-1, dtype=torch.int32)
    running_total = tokens_per_row.cumsum(dim=0, dtype=torch.int32)
    # Left-pad with one zero so the result has one more entry than rows.
    cu_seqlens = F.pad(running_total, (1, 0))

    return indices, cu_seqlens, tokens_per_row.max().item()
71
+
72
+
73
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension, evaluated in float32.

    Args:
        hidden_size: Size passed to ``torch.ones`` / ``torch.zeros`` when
            creating the learnable ``weight`` (and optional ``bias``).
        eps: Value added to the variance before taking the reciprocal
            square root.
        bias: When truthy, a zero-initialised learnable ``bias`` is created;
            otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, hidden_size, eps=1e-5, bias=False):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        if bias:
            self.bias = nn.Parameter(torch.zeros(hidden_size))
        else:
            self.bias = None
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalize ``hidden_states`` and return it in its original dtype."""
        orig_dtype = hidden_states.dtype

        # All statistics are computed in float32 regardless of input dtype.
        values = hidden_states.to(torch.float32)
        centered = values - values.mean(-1, keepdim=True)
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered * torch.rsqrt(variance + self.variance_epsilon)

        output = self.weight.to(torch.float32) * normalized
        if self.bias is not None:
            output = output + self.bias.to(torch.float32)
        return output.to(orig_dtype)
90
+
91
+
92
+ ALL_LAYERNORM_LAYERS.append(LayerNorm)
93
+
94
+
95
+ # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Cohere
96
class CohereRotaryEmbedding(nn.Module):
    """Rotary position embedding that yields interleaved cos/sin tables.

    Each frequency is repeated twice in a row along the last dimension
    (``repeat_interleave(..., 2)``), so consecutive pairs of channels share
    the same angle.

    Args:
        dim: Number of channels the embedding covers; ``dim / 2`` inverse
            frequencies are generated (one per even index in ``range(dim)``).
        max_position_embeddings: Length of the legacy cos/sin tables built
            at construction time.
        base: Base of the geometric progression of inverse frequencies.
        device: Device on which the buffers are created.
        scaling_factor: Divisor applied to the positions of the legacy
            tables.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.scaling_factor = scaling_factor
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
        inv_freq = 1.0 / (self.base**exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # For backward compatibility the cos/sin tables are still registered,
        # as non-persistent buffers.
        self.max_seq_len_cached = max_position_embeddings
        positions = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64)
        positions = positions.type_as(self.inv_freq) / self.scaling_factor
        angles = torch.repeat_interleave(torch.outer(positions, self.inv_freq), 2, dim=-1)
        default_dtype = torch.get_default_dtype()
        self.register_buffer("_cos_cached", angles.cos().to(default_dtype), persistent=False)
        self.register_buffer("_sin_cached", angles.sin().to(default_dtype), persistent=False)

    @property
    def sin_cached(self):
        """Deprecated accessor for the legacy sine table; warns once."""
        logger.warning_once(
            "The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use "
            "the forward method of RoPE from now on instead. It is not used in the `CohereAttention` class"
        )
        return self._sin_cached

    @property
    def cos_cached(self):
        """Deprecated accessor for the legacy cosine table; warns once."""
        logger.warning_once(
            "The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use "
            "the forward method of RoPE from now on instead. It is not used in the `CohereAttention` class"
        )
        return self._cos_cached

    @torch.no_grad()
    def forward(self, x, position_ids, seq_len=None):
        """Return ``(cos, sin)`` for ``position_ids``, cast to ``x.dtype``.

        ``x`` is only consulted for its device type and dtype. ``seq_len``
        is deprecated and ignored apart from triggering a one-time warning.
        """
        if seq_len is not None:
            logger.warning_once("The `seq_len` argument is deprecated and unused. It will be removed in v4.39.")

        # x: [bs, num_attention_heads, seq_len, head_size]
        batch = position_ids.shape[0]
        inv_freq = self.inv_freq[None, :, None].float().expand(batch, -1, 1)
        positions = position_ids[:, None, :].float()

        # Force float32 since bfloat16 loses precision on long contexts
        # See https://github.com/huggingface/transformers/pull/29285
        autocast_device = x.device.type
        if not isinstance(autocast_device, str) or autocast_device == "mps":
            autocast_device = "cpu"
        with torch.autocast(device_type=autocast_device, enabled=False):
            freqs = torch.matmul(inv_freq.float(), positions.float()).transpose(1, 2)
            emb = torch.repeat_interleave(freqs, 2, dim=-1)
            cos, sin = emb.cos(), emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
148
+
149
+
150
def rotate_half(x):
    """Rotate adjacent channel pairs of the last dimension.

    Every pair `(a, b)` taken from even/odd positions of the last dimension is
    replaced by `(-b, a)`; the output has the same shape as `x`.
    """
    even_channels = x[..., 0::2]
    odd_channels = x[..., 1::2]
    # Stack as (-odd, even) pairs, then merge the pair axis back into the last dim.
    paired = torch.stack((-odd_channels, even_channels), dim=-1)
    return paired.flatten(-2)
156
+
157
+
158
+ # copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
159
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given rotary cos/sin tables.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension at which a singleton axis is inserted into `cos` and `sin` so they broadcast against `q` and
            `k`. For example, with `cos`/`sin` of shape [batch_size, seq_len, head_dim] and `q`/`k` of shape
            [batch_size, heads, seq_len, head_dim], use `unsqueeze_dim=1`; if `q`/`k` have the shape
            [batch_size, seq_len, heads, head_dim], use `unsqueeze_dim=2`.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    q_embed, k_embed = ((t * cos_b) + (rotate_half(t) * sin_b) for t in (q, k))
    return q_embed, k_embed
184
+
185
+
186
+ # Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere
187
class CohereMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.

    When `config.pretraining_tp > 1` the same computation is carried out on
    weight shards (split along the intermediate dimension) and the partial
    results are summed.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated MLP to `x` (features in the last dimension)."""
        tp = self.config.pretraining_tp
        if not tp > 1:
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        # Sharded path: split each projection along the intermediate dimension.
        shard = self.intermediate_size // tp
        gate_shards = self.gate_proj.weight.split(shard, dim=0)
        up_shards = self.up_proj.weight.split(shard, dim=0)
        down_shards = self.down_proj.weight.split(shard, dim=1)

        gate_out = torch.cat([F.linear(x, gate_shards[i]) for i in range(tp)], dim=-1)
        up_out = torch.cat([F.linear(x, up_shards[i]) for i in range(tp)], dim=-1)

        hidden_shards = (self.act_fn(gate_out) * up_out).split(shard, dim=2)
        partials = [F.linear(hidden_shards[i], down_shards[i]) for i in range(tp)]
        return sum(partials)
219
+
220
+
221
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
222
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along dim 1, matching
    torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads,
    seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `n_rep == 1` the input
    tensor itself is returned.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
        expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
        hidden_states = expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
    return hidden_states
232
+
233
+
234
class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Eager (explicit matmul + softmax) implementation with rotary position
    embeddings and support for fewer key/value heads than query heads
    (key/value heads are repeated through `repeat_kv`).
    """

    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
        """Build the projections and the rotary embedding from `config`.

        Args:
            config: Model configuration; the attributes read here are
                `attention_dropout`, `hidden_size`, `num_attention_heads`,
                `num_key_value_heads`, `max_position_embeddings`, `rope_theta`
                and `attention_bias`.
            layer_idx: Index handed to the cache's `update` call in `forward`.

        Raises:
            ValueError: If `hidden_size` is not divisible by the number of heads.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
        self.rotary_emb = CohereRotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run eager attention over `hidden_states`.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden_size)`.
            attention_mask: Optional additive mask; it is indexed with four
                dimensions when `cache_position` is given.
            position_ids: Positions passed to the rotary embedding.
            past_key_value: Cache whose `update` method receives the new
                key/value states. A non-`None` cache stored on the module as
                `self.past_key_value` takes precedence over this argument.
            output_attentions: Whether to return the attention probabilities.
            use_cache: Unused here; accepted for interface compatibility.
            cache_position: Optional indices used to slice `attention_mask`
                and forwarded to the cache.

        Returns:
            `(attn_output, attn_weights or None, past_key_value)`.

        Raises:
            ValueError: If the attention output has an unexpected shape.
        """
        bsz, q_len, _ = hidden_states.size()
        tp = self.config.pretraining_tp

        if tp > 1:
            # Sharded projections: split each weight along its output dim and concatenate the results.
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // tp
            query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // tp, dim=0)
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = torch.cat([F.linear(hidden_states, query_slices[i]) for i in range(tp)], dim=-1)
            key_states = torch.cat([F.linear(hidden_states, key_slices[i]) for i in range(tp)], dim=-1)
            value_states = torch.cat([F.linear(hidden_states, value_slices[i]) for i in range(tp)], dim=-1)
        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # A cache stored on the module overrides the argument, but only when it is an actual cache:
        # once the attribute has been reset to None, the caller-supplied cache must still be honoured.
        module_cache = getattr(self, "past_key_value", None)
        if module_cache is not None:
            past_key_value = module_cache

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; position_ids needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask
            if cache_position is not None:
                causal_mask = attention_mask[:, :, cache_position, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # (batch, heads, seq_len, head_dim) -> (batch, seq_len, hidden_size)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if tp > 1:
            attn_output = attn_output.split(self.hidden_size // tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
359
+
360
+
361
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 Llama->Cohere
362
class CohereFlashAttention2(Attention):
    """
    Cohere flash attention module. This module inherits from `Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run attention through the flash-attention kernels.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden_size)`.
            attention_mask: Optional padding mask handed to `_flash_attention_forward`.
            position_ids: Positions passed to the rotary embedding.
            past_key_value: Cache whose `update` method receives the new
                key/value states. A non-`None` cache stored on the module as
                `self.past_key_value` takes precedence over this argument.
            output_attentions: Ignored; attention weights are never returned.
            use_cache: Unused here; accepted for interface compatibility.
            cache_position: Forwarded to the cache's `update` call.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Rotary embedding and the cache work on (batch, heads, seq_len, head_dim); the tensors are
        # transposed back to the flash-attention layout further down.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        # A cache stored on the module overrides the argument, but only when it is an actual cache:
        # once the attribute has been reset to None, the caller-supplied cache must still be honoured.
        module_cache = getattr(self, "past_key_value", None)
        if module_cache is not None:
            past_key_value = module_cache

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; position_ids needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.attention_dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        # Attention weights are never produced on this path, whatever `output_attentions` was.
        attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence; used to choose the unpadding strategy and to re-pad the output.
            dropout (`float`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in CohereFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1

        if attention_mask is None:
            return flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        # A mask was supplied: unpad, run the variable-length kernel, then pad the result back.
        batch_size = query_states.shape[0]
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )

        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
        )

        return pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Strip padded positions from the query/key/value tensors.

        Returns:
            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, the inputs expected by
            `_flash_attention_forward` for the variable-length kernel and for re-padding.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries cover the same positions as the keys: reuse the key indices.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Single-token queries: every batch row contributes exactly one query.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
555
+
556
+
557
class SdpaAttention(Attention):
    """
    Attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # Adapted from Attention.forward
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run attention through `scaled_dot_product_attention`.

        Args:
            hidden_states: Input of shape `(batch, seq_len, hidden_size)`.
            attention_mask: Optional mask passed to SDPA as `attn_mask`; it is
                indexed with four dimensions when `cache_position` is given.
            position_ids: Positions passed to the rotary embedding.
            past_key_value: Cache whose `update` method receives the new
                key/value states. A non-`None` cache stored on the module as
                `self.past_key_value` takes precedence over this argument.
            output_attentions: When true, the call is delegated to the eager
                `Attention.forward`, since SDPA cannot return the weights.
            use_cache: Unused here; accepted for interface compatibility.
            cache_position: Optional indices used to slice `attention_mask`
                and forwarded to the cache.
            **kwargs: Extra keyword arguments. They are accepted so that this
                class has the same call signature as the other attention
                implementations (the decoder layer forwards its own `**kwargs`
                here); they are only passed on in the eager fallback.

        Returns:
            `(attn_output, None, past_key_value)` on the SDPA path.
        """
        if output_attentions:
            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
            logger.warning_once(
                "CohereModel is using SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, seq_len, heads * head_dim) -> (batch, heads, seq_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        # In case static cache is used, it is an instance attribute. It overrides the argument only when it
        # is an actual cache: once the attribute has been reset to None, the caller-supplied cache must
        # still be honoured.
        module_cache = getattr(self, "past_key_value", None)
        if module_cache is not None:
            past_key_value = module_cache

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; position_ids needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None and cache_position is not None:
            causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if query_states.device.type == "cuda" and causal_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
        )

        # (batch, heads, seq_len, head_dim) -> (batch, seq_len, hidden_size)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
640
+
641
+
642
# Maps an attention-implementation name to the attention class implementing it.
COHERE_ATTENTION_CLASSES = dict(
    eager=Attention,
    flash_attention_2=CohereFlashAttention2,
    sdpa=SdpaAttention,
)
647
+
648
+
649
class CohereDecoderLayer(nn.Module):
    """One decoder layer with parallel attention and MLP branches.

    Both branches consume the same layer-normalised input, and their outputs
    are added to the residual in a single step.
    """

    def __init__(self, config: CohereConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # The attention class is selected by the configured implementation name.
        attention_cls = COHERE_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)

        self.mlp = CohereMLP(config)
        self.input_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            position_ids (`torch.LongTensor`, *optional*): positions forwarded to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to append the attention weights to the returned tuple.
            use_cache (`bool`, *optional*):
                If set to `True`, the key/value state returned by the attention module is appended to the returned
                tuple.
            cache_position (`torch.LongTensor`, *optional*): forwarded to the attention module.

        Returns:
            A tuple starting with the layer output, followed by the attention weights (if `output_attentions`) and
            the present key/value state (if `use_cache`).
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        residual = hidden_states
        normed_states = self.input_layernorm(hidden_states)

        # Self Attention
        attention_branch, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=normed_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        # Fully Connected (fed with the same normalised input as the attention branch)
        mlp_branch = self.mlp(normed_states)

        # Add everything together
        layer_output = residual + attention_branch + mlp_branch

        results = [layer_output]
        if output_attentions:
            results.append(self_attn_weights)
        if use_cache:
            results.append(present_key_value)
        return tuple(results)
720
+
721
+
722
+ COHERE_START_DOCSTRING = r"""
723
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
724
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
725
+ etc.)
726
+
727
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
728
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
729
+ and behavior.
730
+
731
+ Parameters:
732
+ config ([`CohereConfig`]):
733
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
734
+ load the weights associated with the model, only the configuration. Check out the
735
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
736
+ """
737
+
738
+
739
# NOTE: no class docstring is added on purpose: `add_start_docstrings` is applied to this class and a
# docstring written here could end up in the generated documentation.
@add_start_docstrings(
    "The bare Cohere Model outputting raw hidden-states without any specific head on top.",
    COHERE_START_DOCSTRING,
)
class CoherePreTrainedModel(PreTrainedModel):
    config_class = CohereConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CohereDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """Initialise `nn.Linear` and `nn.Embedding` weights from N(0, `config.initializer_range`).

        Linear biases are zeroed, as is the embedding row at `padding_idx` when one is set.
        Other module types are left untouched.
        """
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _setup_cache(self, cache_cls, max_batch_size, max_cache_len: Optional[int] = None):
        """Attach a fresh `cache_cls` instance to every attention layer of `self.model`.

        Args:
            cache_cls: Cache class, called as
                `cache_cls(config, max_batch_size, max_cache_len, device=..., dtype=...)`.
            max_batch_size: Forwarded to `cache_cls`.
            max_cache_len: Forwarded unchanged to `cache_cls`. When `None`, the causal mask is sized
                with `config.max_position_embeddings` instead.

        Raises:
            ValueError: If `StaticCache` is requested together with `flash_attention_2`.
        """
        if self.config._attn_implementation == "flash_attention_2" and cache_cls == StaticCache:
            raise ValueError(
                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
            )

        # `max_cache_len` may legitimately be None; comparing None with `>` would raise a TypeError.
        # NOTE(review): falling back to `config.max_position_embeddings` assumes the cache class applies
        # the same default when it receives None - confirm against `cache_cls`.
        mask_len = self.config.max_position_embeddings if max_cache_len is None else max_cache_len

        current_mask = self.model.causal_mask
        if mask_len > current_mask.shape[-1] or self.device != current_mask.device:
            causal_mask = torch.full((mask_len, mask_len), fill_value=True, device=self.device, dtype=torch.bool)
            # Register the rebuilt mask on `self.model`, i.e. on the module whose `causal_mask` was just
            # inspected; registering it on `self` would leave the inspected buffer stale.
            self.model.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False)

        for layer in self.model.layers:
            device = layer.input_layernorm.weight.device
            if hasattr(self.config, "_pre_quantization_dtype"):
                dtype = self.config._pre_quantization_dtype
            else:
                dtype = layer.self_attn.o_proj.weight.dtype
            layer.self_attn.past_key_value = cache_cls(
                self.config, max_batch_size, max_cache_len, device=device, dtype=dtype
            )

    def _reset_cache(self):
        """Set the per-layer `past_key_value` attribute of every attention module to `None`."""
        for layer in self.model.layers:
            layer.self_attn.past_key_value = None
790
+
791
+
792
+ COHERE_INPUTS_DOCSTRING = r"""
793
+ Args:
794
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
795
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
796
+ it.
797
+
798
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
799
+ [`PreTrainedTokenizer.__call__`] for details.
800
+
801
+ [What are input IDs?](../glossary#input-ids)
802
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
803
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
804
+
805
+ - 1 for tokens that are **not masked**,
806
+ - 0 for tokens that are **masked**.
807
+
808
+ [What are attention masks?](../glossary#attention-mask)
809
+
810
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
811
+ [`PreTrainedTokenizer.__call__`] for details.
812
+
813
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
814
+ `past_key_values`).
815
+
816
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
817
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
818
+ information on the default strategy.
819
+
820
+ - 1 indicates the head is **not masked**,
821
+ - 0 indicates the head is **masked**.
822
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
823
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
824
+ config.n_positions - 1]`.
825
+
826
+ [What are position IDs?](../glossary#position-ids)
827
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
828
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
829
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
830
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
831
+
832
+ Two formats are allowed:
833
+ - a [`~cache_utils.Cache`] instance;
834
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
835
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
836
+ cache format.
837
+
838
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
839
+ legacy cache format will be returned.
840
+
841
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
842
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
843
+ of shape `(batch_size, sequence_length)`.
844
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
845
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
846
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
847
+ model's internal embedding lookup matrix.
848
+ use_cache (`bool`, *optional*):
849
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
850
+ `past_key_values`).
851
+ output_attentions (`bool`, *optional*):
852
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
853
+ tensors for more detail.
854
+ output_hidden_states (`bool`, *optional*):
855
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
856
+ more detail.
857
+ return_dict (`bool`, *optional*):
858
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
859
+ """
860
+
861
+
862
@add_start_docstrings(
    "The bare Cohere Model outputting raw hidden-states without any specific head on top.",
    COHERE_START_DOCSTRING,
)
class CohereModel(CoherePreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]

    Args:
        config: CohereConfig
    """

    def __init__(self, config: CohereConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

        # Register a causal mask to separate causal and padding mask creation. Merging happens in the attention class.
        # NOTE: This is not friendly with TorchScript, ONNX, ExportedProgram serialization for very large `max_position_embeddings`.
        causal_mask = torch.full(
            (config.max_position_embeddings, config.max_position_embeddings), fill_value=True, dtype=torch.bool
        )
        # Strict upper triangle == True marks the "future" positions a query may not attend to.
        self.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Per-call flags fall back to the values stored on the config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # XOR: true when both or neither of `input_ids` / `inputs_embeds` were given.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        past_seen_tokens = 0
        if use_cache:  # kept for BC (cache positions)
            if not isinstance(past_key_values, StaticCache):
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                past_seen_tokens = past_key_values.get_seq_length()

        if cache_position is None:
            if isinstance(past_key_values, StaticCache):
                raise ValueError("cache_position is a required argument when using StaticCache.")
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds)

        # embed positions
        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights in the layer output when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
            )
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
    # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
    # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
    # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
    def _update_causal_mask(self, attention_mask, input_tensor):
        """
        Build the attention mask handed to the decoder layers.

        Args:
            attention_mask: optional padding mask; only a 2D mask is merged into the causal mask.
            input_tensor: the input embeddings; supplies batch size, sequence length, dtype and device.

        Returns:
            With `flash_attention_2`: `attention_mask` itself when it contains a `0.0`, otherwise `None`.
            Otherwise: a tensor of shape `(batch_size, 1, L, L)`, where `L` is the side of the cached
            `causal_mask` buffer, holding `0` for visible positions and `torch.finfo(dtype).min` for masked ones.
        """
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        batch_size, seq_length = input_tensor.shape[:2]
        dtype = input_tensor.dtype
        device = input_tensor.device

        # support going beyond cached `max_position_embedding`
        # The buffer must cover both the current inputs and, when a 2D padding mask is given, the full mask
        # length: the padding merge below slices the buffer with `[..., :mask_length]`.
        required_length = seq_length
        if attention_mask is not None and attention_mask.dim() == 2:
            required_length = max(required_length, attention_mask.shape[-1])
        cached_length = self.causal_mask.shape[-1]
        if required_length > cached_length:
            # Grow at least geometrically, but always far enough: a single doubling is not sufficient when the
            # request exceeds twice the cached size. Keep the buffer boolean, like the one built in `__init__`.
            new_length = max(2 * cached_length, required_length)
            causal_mask = torch.full(
                (new_length, new_length), fill_value=True, dtype=torch.bool, device=self.causal_mask.device
            )
            self.register_buffer("causal_mask", torch.triu(causal_mask, diagonal=1), persistent=False)

        # We use the current dtype to avoid any overflows
        min_dtype = torch.finfo(dtype).min
        causal_mask = self.causal_mask[None, None, :, :].to(dtype=dtype, device=device) * min_dtype
        causal_mask = causal_mask.expand(batch_size, 1, -1, -1)
        if attention_mask is not None and attention_mask.dim() == 2:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            mask_length = attention_mask.shape[-1]
            # Positions that are causally visible (== 0) but belong to padding (attention_mask == 0).
            padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
            causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
        ):
            # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
            is_tracing = (
                torch.jit.is_tracing()
                or isinstance(input_tensor, torch.fx.Proxy)
                or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
            )
            if not is_tracing and torch.any(attention_mask != 1):
                # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
                # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
                # Details: https://github.com/pytorch/pytorch/issues/110213
                causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
1063
+
1064
+
1065
class CohereForCausalLM(CoherePreTrainedModel):
    """Cohere decoder (`CohereModel`) with a bias-free linear language-modeling head and logit scaling."""

    _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = CohereModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Multiplier applied to the raw logits in `forward`.
        self.logit_scale = config.logit_scale
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    @add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CohereForCausalLM

        >>> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
        >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            # Project with `pretraining_tp` row-slices of the head weight and concatenate along the vocab axis.
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        # NOTE(review): scaling is applied to both branches here; confirm against the upstream file.
        logits = logits * self.logit_scale
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Assemble the keyword arguments for one `forward` call during generation.

        Trims `input_ids` to the tokens not yet covered by the cache, crops the attention mask when the cache
        has a maximum length, derives `position_ids` from the attention mask when they are not supplied through
        `kwargs`, and computes `cache_position`.

        Returns:
            A dict with `position_ids`, `cache_position`, `past_key_values`, `use_cache`, `attention_mask`, and
            either `inputs_embeds` (only when embeddings are given and there is no cache yet) or `input_ids`.
        """
        past_length = 0
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if self.generation_config.cache_implementation == "static":
            # generation with static cache
            cache_position = kwargs.get("cache_position", None)
            if cache_position is None:
                past_length = 0
            else:
                past_length = cache_position[-1] + 1
            input_ids = input_ids[:, past_length:]
            # `position_ids` stays `None` when neither `position_ids` nor `attention_mask` was supplied;
            # the statements below already handle that case, so only slice when there is something to slice.
            if position_ids is not None:
                position_ids = position_ids[:, past_length:]

        # TODO @gante we should only keep a `cache_position` in generate, and do +=1.
        # same goes for position ids. Could also help with continued generation.
        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        position_ids = position_ids.contiguous() if position_ids is not None else None

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
            # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
            # TODO: use `next_tokens` directly instead.
            model_inputs = {"input_ids": input_ids.contiguous()}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Return a legacy-format cache whose tensors are re-indexed along dim 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
1277
+
1278
+
1279
# Register the custom classes with the Auto* factories at import time, keyed by `CohereConfig`:
# `AutoModel` is given `CohereModel` and `AutoModelForCausalLM` is given `CohereForCausalLM`.
AutoModel.register(CohereConfig, CohereModel)
AutoModelForCausalLM.register(CohereConfig, CohereForCausalLM)
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<BOS_TOKEN>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|END_OF_TURN_TOKEN|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<PAD>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenization_cohere_fast.py ADDED
@@ -0,0 +1,754 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Cohere and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This file is based on the tokenization_llama_fast.py file in transformers
17
+
18
+
19
+ import os
20
+ from shutil import copyfile
21
+ from typing import Optional, Tuple, Dict, Union, List, Literal
22
+
23
+ from tokenizers import processors
24
+ from transformers import AutoTokenizer
25
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
26
+ from transformers.utils import logging
27
+ from transformers.utils.versions import require_version
28
+ from transformers.tokenization_utils_base import TensorType
29
+ from transformers.pipelines.conversational import Conversation
30
+
31
+ from .configuration_cohere import CohereConfig
32
+
33
# Fail early at import time when the installed `tokenizers` package is older than 0.13.3.
require_version("tokenizers>=0.13.3")

logger = logging.get_logger(__name__)
# The single vocabulary artifact of this fast tokenizer is the serialized `tokenizer.json`.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.json"}

# NOTE(review): this URL uses the Hub's `/blob/` (web page) path; raw file downloads normally go through
# `/resolve/` — confirm whether this map is still consumed anywhere before relying on it.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "cohere-tokenizer": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json",
    },
}

# Prompt texts below are runtime strings and must stay byte-for-byte stable; formatting is disabled
# so the auto-formatter does not re-wrap them.
# fmt: off
DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere."
DEFAULT_RAG_PREAMBLE = """## Task and Context
You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.

## Style Guide
Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."""
# fmt: on
52
+
53
+
54
+ class CohereTokenizerFast(PreTrainedTokenizerFast):
55
+ """
56
+ Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding.
57
+
58
+ This uses notably ByteFallback and NFC normalization.
59
+
60
+ ```python
61
+ >>> from transformers import AutoTokenizer
62
+
63
+ >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
64
+ >>> tokenizer.encode("Hello this is a test")
65
+ [1, 15043, 445, 338, 263, 1243]
66
+ ```
67
+
68
+ If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
69
+ call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
70
+ values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
71
+ [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
72
+
73
+
74
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
75
+ refer to this superclass for more information regarding those methods.
76
+
77
+ Args:
78
+ vocab_file (`str`, *optional*):
79
+ [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
80
+ contains the vocabulary necessary to instantiate a tokenizer.
81
+ tokenizer_file (`str`, *optional*):
82
+ [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
83
+ contains everything needed to load the tokenizer.
84
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
85
+ Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
86
+ extra spaces.
87
+ unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<UNK>"`):
88
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
89
+ token instead.
90
+ bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<BOS_TOKEN>"`):
91
+ The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
92
+ eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<EOS_TOKEN>"`):
93
+ The end of sequence token.
94
+ add_bos_token (`bool`, *optional*, defaults to `True`):
95
+ Whether or not to add an `bos_token` at the start of sequences.
96
+ add_eos_token (`bool`, *optional*, defaults to `False`):
97
+ Whether or not to add an `eos_token` at the end of sequences.
98
+ use_default_system_prompt (`bool`, *optional*, defaults to `False`):
99
+ Whether or not the default system prompt for Cohere tokenizer should be used.
100
+ add_prefix_space (`bool`, *optional*):
101
+ Whether or not the tokenizer should automatically add a prefix space
102
+ """
103
+
104
+ vocab_files_names = VOCAB_FILES_NAMES
105
+ padding_side = "left"
106
+ model_input_names = ["input_ids", "attention_mask"]
107
+
108
def __init__(
    self,
    vocab_file=None,
    tokenizer_file=None,
    clean_up_tokenization_spaces=False,
    unk_token="<UNK>",
    bos_token="<BOS_TOKEN>",
    eos_token="<EOS_TOKEN>",
    add_bos_token=True,
    add_eos_token=False,
    use_default_system_prompt=False,
    add_prefix_space=None,
    **kwargs,
):
    """
    Initializer of `CohereTokenizerFast`.

    Forwards the tokenizer files, special tokens and flags to the parent initializer, stores the BOS/EOS
    flags, rebuilds the post-processor, and keeps the optional `grounded_generation_template` and
    `tool_use_template` entries of `kwargs` as attributes (`None` when absent).
    """
    # Any explicit value for `add_prefix_space` forces a conversion from the slow tokenizer.
    prefix_space_was_set = add_prefix_space is not None
    if prefix_space_was_set:
        logger.warning_once(
            "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
        )
        kwargs["from_slow"] = True

    super().__init__(
        vocab_file=vocab_file,
        tokenizer_file=tokenizer_file,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        unk_token=unk_token,
        bos_token=bos_token,
        eos_token=eos_token,
        add_bos_token=add_bos_token,
        add_eos_token=add_eos_token,
        use_default_system_prompt=use_default_system_prompt,
        **kwargs,
    )

    # Write the private fields directly: the public setters would each rebuild the post-processor,
    # and the second flag would not be set yet during the first rebuild.
    self._add_bos_token, self._add_eos_token = add_bos_token, add_eos_token
    self.update_post_processor()

    self.use_default_system_prompt = use_default_system_prompt
    self.vocab_file = vocab_file
    for template_attr in ("grounded_generation_template", "tool_use_template"):
        setattr(self, template_attr, kwargs.pop(template_attr, None))
147
+
148
def update_post_processor(self):
    """
    Updates the underlying post processor with the current `bos_token` and `eos_token`.

    Raises:
        ValueError: if `add_bos_token` / `add_eos_token` is set while the matching token is `None`.
    """
    bos, bos_token_id = self.bos_token, self.bos_token_id
    use_bos = self.add_bos_token
    if use_bos and bos is None:
        raise ValueError("add_bos_token = True but bos_token = None")

    eos, eos_token_id = self.eos_token, self.eos_token_id
    use_eos = self.add_eos_token
    if use_eos and eos is None:
        raise ValueError("add_eos_token = True but eos_token = None")

    # Template pieces: `$A` / `$B` are the two sequences, the `:0` / `:1` suffix is the type id.
    first_bos = (bos + ":0 ") if use_bos else ""
    first_eos = (" " + eos + ":0") if use_eos else ""
    second_bos = (" " + bos + ":1") if use_bos else ""
    second_eos = (" " + eos + ":1") if use_eos else ""

    single = first_bos + "$A:0" + first_eos
    pair = single + second_bos + " $B:1" + second_eos

    # Only the special tokens that actually appear in the templates are declared.
    candidates = ((use_bos, (bos, bos_token_id)), (use_eos, (eos, eos_token_id)))
    special_tokens = [token_and_id for enabled, token_and_id in candidates if enabled]

    self._tokenizer.post_processor = processors.TemplateProcessing(
        single=single, pair=pair, special_tokens=special_tokens
    )
173
+
174
@property
def add_bos_token(self):
    """Current value of the BOS flag (stored in `_add_bos_token`)."""
    return self._add_bos_token


@add_bos_token.setter
def add_bos_token(self, value):
    # Store the flag, then rebuild the post-processor so it reflects the new setting.
    self._add_bos_token = value
    self.update_post_processor()


@property
def add_eos_token(self):
    """Current value of the EOS flag (stored in `_add_eos_token`)."""
    return self._add_eos_token


@add_eos_token.setter
def add_eos_token(self, value):
    # Store the flag, then rebuild the post-processor so it reflects the new setting.
    self._add_eos_token = value
    self.update_post_processor()
191
+
192
+ @property
193
+ def default_chat_template(self):
194
+ """
195
+ Cohere Tokenizer uses <|START_OF_TURN_TOKEN|> and <|END_OF_TURN_TOKEN|> to indicate each turn in a chat.
196
+ Additionally, to indicate the source of the message, <|USER_TOKEN|>, <|CHATBOT_TOKEN|> and <|SYSTEM_TOKEN|>
197
+ are used for user, assistant and system messages respectively.
198
+
199
+ The output should look something like:
200
+ <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ preamble }}<|END_OF_TURN_TOKEN|>
201
+ <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ How are you? }}<|END_OF_TURN_TOKEN|>
202
+ <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{ I am doing well! }}<|END_OF_TURN_TOKEN|>
203
+
204
+ Use add_generation_prompt to add a prompt for the model to generate a response:
205
+
206
+ >>> messages = [{"role": "user", "content": "Hello, how are you?"}]
207
+ >>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
208
+ <BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
209
+
210
+ """
211
+ logger.warning_once(
212
+ "\nNo chat template is defined for this tokenizer - using the default template "
213
+ f"for the {self.__class__.__name__} class. If the default is not appropriate for "
214
+ "your model, please set `tokenizer.chat_template` to an appropriate template. "
215
+ "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
216
+ )
217
+ template = (
218
+ "{{ bos_token }}"
219
+ "{% if messages[0]['role'] == 'system' %}"
220
+ "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
221
+ "{% set system_message = messages[0]['content'] %}"
222
+ "{% elif USE_DEFAULT_PROMPT == true %}"
223
+ "{% set loop_messages = messages %}" # Or use the default system message if the flag is set
224
+ "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
225
+ "{% else %}"
226
+ "{% set loop_messages = messages %}"
227
+ "{% set system_message = false %}"
228
+ "{% endif %}"
229
+ "{% if system_message != false %}" # Start with system message
230
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}"
231
+ "{% endif %}"
232
+ "{% for message in loop_messages %}" # Loop over all non-system messages
233
+ "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
234
+ "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
235
+ "{% endif %}"
236
+ "{% set content = message['content'] %}"
237
+ "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
238
+ "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
239
+ "{% elif message['role'] == 'assistant' %}"
240
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
241
+ "{% endif %}"
242
+ "{% endfor %}"
243
+ "{% if add_generation_prompt %}"
244
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}"
245
+ "{% endif %}"
246
+ )
247
+ template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
248
+ default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
249
+ template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
250
+
251
+ return template
252
+
253
+ @property
254
+ def default_tool_use_template(self):
255
+ template = (
256
+ "{{ bos_token }}"
257
+ "{% if messages[0]['role'] == 'system' %}"
258
+ "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
259
+ "{% set system_message = messages[0]['content'] %}"
260
+ "{% else %}"
261
+ "{% set loop_messages = messages %}"
262
+ "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
263
+ "{% endif %}"
264
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}"
265
+ "{{ '# Safety Preamble' }}"
266
+ "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}"
267
+ "{{ '\n\n# System Preamble' }}"
268
+ "{{ '\n## Basic Rules' }}"
269
+ "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}"
270
+ "{{ '\n\n# User Preamble' }}"
271
+ "{{ '\n' + system_message }}"
272
+ "{{'\n\n## Available Tools\nHere is a list of tools that you have available to you:\n\n'}}"
273
+ "{% for tool in tools %}"
274
+ "{% if loop.index0 != 0 %}"
275
+ "{{ '\n\n'}}"
276
+ "{% endif %}"
277
+ "{{'```python\ndef ' + tool.name + '('}}"
278
+ "{% for param_name, param_fields in tool.parameter_definitions.items() %}"
279
+ "{% if loop.index0 != 0 %}"
280
+ "{{ ', '}}"
281
+ "{% endif %}"
282
+ "{{param_name}}: "
283
+ "{% if not param_fields.required %}"
284
+ "{{'Optional[' + param_fields.type + '] = None'}}"
285
+ "{% else %}"
286
+ "{{ param_fields.type }}"
287
+ "{% endif %}"
288
+ "{% endfor %}"
289
+ "{{ ') -> List[Dict]:\n \"\"\"'}}"
290
+ "{{ tool.description }}"
291
+ "{% if tool.parameter_definitions|length != 0 %}"
292
+ "{{ '\n\n Args:\n '}}"
293
+ "{% for param_name, param_fields in tool.parameter_definitions.items() %}"
294
+ "{% if loop.index0 != 0 %}"
295
+ "{{ '\n ' }}"
296
+ "{% endif %}"
297
+ "{{ param_name + ' ('}}"
298
+ "{% if not param_fields.required %}"
299
+ "{{'Optional[' + param_fields.type + ']'}}"
300
+ "{% else %}"
301
+ "{{ param_fields.type }}"
302
+ "{% endif %}"
303
+ "{{ '): ' + param_fields.description }}"
304
+ "{% endfor %}"
305
+ "{% endif %}"
306
+ "{{ '\n \"\"\"\n pass\n```' }}"
307
+ "{% endfor %}"
308
+ "{{ '<|END_OF_TURN_TOKEN|>'}}"
309
+ "{% for message in loop_messages %}"
310
+ "{% set content = message['content'] %}"
311
+ "{% if message['role'] == 'user' %}"
312
+ "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
313
+ "{% elif message['role'] == 'system' %}"
314
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
315
+ "{% elif message['role'] == 'assistant' %}"
316
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
317
+ "{% endif %}"
318
+ "{% endfor %}"
319
+ "{{'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \\'Action:\\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:\n```json\n[\n {\n \"tool_name\": title of the tool in the specification,\n \"parameters\": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters\n }\n]```<|END_OF_TURN_TOKEN|>'}}"
320
+ "{% if add_generation_prompt %}"
321
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}"
322
+ "{% endif %}"
323
+ )
324
+ default_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'")
325
+ template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
326
+ return template
327
+
328
+ @property
329
+ def default_grounded_generation_template(self):
330
+ template = (
331
+ "{{ bos_token }}"
332
+ "{% if messages[0]['role'] == 'system' %}"
333
+ "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
334
+ "{% set system_message = messages[0]['content'] %}"
335
+ "{% else %}"
336
+ "{% set loop_messages = messages %}"
337
+ "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
338
+ "{% endif %}"
339
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}"
340
+ "{{ '# Safety Preamble' }}"
341
+ "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}"
342
+ "{{ '\n\n# System Preamble' }}"
343
+ "{{ '\n## Basic Rules' }}"
344
+ "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}"
345
+ "{{ '\n\n# User Preamble' }}"
346
+ "{{ '\n' + system_message }}"
347
+ "{{ '<|END_OF_TURN_TOKEN|>'}}"
348
+ "{% for message in loop_messages %}" # Loop over all non-system messages
349
+ "{% set content = message['content'] %}"
350
+ "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
351
+ "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
352
+ "{% elif message['role'] == 'system' %}"
353
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
354
+ "{% elif message['role'] == 'assistant' %}"
355
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}"
356
+ "{% endif %}"
357
+ "{% endfor %}"
358
+ "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>'}}"
359
+ "{{ '<results>' }}"
360
+ "{% for document in documents %}" # Loop over all documents
361
+ "{{ '\nDocument: ' }}"
362
+ "{{ loop.index0 }}\n"
363
+ "{% for key, value in document.items() %}"
364
+ "{{ key }}: {{value}}\n"
365
+ "{% endfor %}"
366
+ "{% endfor %}"
367
+ "{{ '</results>'}}"
368
+ "{{ '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}"
369
+ "{{ 'Carefully perform the following instructions, in order, starting each with a new line.\n' }}"
370
+ "{{ 'Firstly, Decide which of the retrieved documents are relevant to the user\\'s last input by writing \\'Relevant Documents:\\' followed by comma-separated list of document numbers. If none are relevant, you should instead write \\'None\\'.\n' }}"
371
+ "{{ 'Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user\\'s last input by writing \\'Cited Documents:\\' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write \\'None\\'.\n' }}"
372
+ "{% if citation_mode=='accurate' %}"
373
+ "{{ 'Thirdly, Write \\'Answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\n' }}"
374
+ "{% endif %}"
375
+ "{{ 'Finally, Write \\'Grounded answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0.' }}"
376
+ "{{ '<|END_OF_TURN_TOKEN|>' }}"
377
+ "{% if add_generation_prompt %}"
378
+ "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}"
379
+ "{% endif %}"
380
+ )
381
+ default_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'")
382
+ template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
383
+ return template
384
+
385
+ def _apply_template_with_arguments(
386
+ self,
387
+ conversation: Union[List[Dict[str, str]], "Conversation"],
388
+ template: Optional[str] = None,
389
+ add_generation_prompt: bool = False,
390
+ tokenize: bool = True,
391
+ padding: bool = False,
392
+ truncation: bool = False,
393
+ max_length: Optional[int] = None,
394
+ return_tensors: Optional[Union[str, TensorType]] = None,
395
+ return_dict: bool = False,
396
+ **kwargs,
397
+ ) -> Union[str, List[int]]:
398
+ """Just tokenization_utils_base.apply_chat_template, but modified so that the Jinja template can take kwargs"""
399
+ if hasattr(conversation, "messages"):
400
+ # Indicates it's a Conversation object
401
+ conversation = conversation.messages
402
+
403
+ # Compilation function uses a cache to avoid recompiling the same template
404
+ compiled_template = self._compile_jinja_template(template)
405
+
406
+ rendered = compiled_template.render(
407
+ messages=conversation,
408
+ add_generation_prompt=add_generation_prompt,
409
+ **kwargs,
410
+ **self.special_tokens_map
411
+ )
412
+
413
+ if padding is True:
414
+ padding = "max_length" # There's only one sequence here, so "longest" makes no sense
415
+ if tokenize:
416
+ if return_dict:
417
+ return self(
418
+ rendered,
419
+ padding=padding,
420
+ truncation=truncation,
421
+ max_length=max_length,
422
+ add_special_tokens=False,
423
+ return_tensors=return_tensors,
424
+ **kwargs,
425
+ )
426
+ else:
427
+ return self.encode(
428
+ rendered,
429
+ padding=padding,
430
+ truncation=truncation,
431
+ max_length=max_length,
432
+ add_special_tokens=False,
433
+ return_tensors=return_tensors,
434
+ **kwargs,
435
+ )
436
+ else:
437
+ return rendered
438
+
439
+ def apply_tool_use_template(
440
+ self,
441
+ conversation: Union[List[Dict[str, str]], "Conversation"],
442
+ tools: List[Dict],
443
+ tool_use_template: Optional[str] = None,
444
+ **kwargs
445
+ ) -> Union[str, List[int]]:
446
+ """Create a Command-R tool-use prompt.
447
+
448
+ Once rendered, the prompt instructs the model to generate a list of actions to perform on a set of user supplied tools
449
+ to help carry out the user's requests.
450
+
451
+ Conceptually, this works in the same way as `apply_chat_template`, but takes an additional `tools` parameter.
452
+
453
+ Converts a Conversation object or a list of dictionaries with `"role"` and `"content"` keys and a list of available
454
+ tools for the model to use into a prompt string, or a list of token ids.
455
+ This method will use the tokenizer's `default_tool_use_template` template specified at the class level.
456
+ You can override the default template using the `tool_use_template` kwarg but the quality of your results may decrease.
457
+
458
+ Args:
459
+ conversation (Union[List[Dict[str, str]], "Conversation"]): A Conversation object or list of dicts
460
+ with "role" and "content" keys, representing the chat history so far.
461
+ tools (List[Dict]): a list of tools to render into the prompt for the model to choose from.
462
+ See an example at the bottom of the docstring.
463
+ The format should be:
464
+ * name (str): The name of the tool to be called. Valid names contain only the characters a-z,
465
+ A-Z, 0-9, _ and must not begin with a digit.
466
+ * description (str): The description of what the tool does, the model uses the description to
467
+ choose when and how to call the function.
468
+ * parameter_definitions (Dict[str, Dict]): The input parameters of the tool. Accepts a dictionary
469
+ where the key is the name of the parameter and the value is the parameter spec.
470
+ Valid parameter names contain only the characters a-z, A-Z, 0-9, _ and must not begin with a digit.
471
+ Parameter specs are as follows:
472
+ * description (str): The description of the parameter.
473
+ * type (str): the type of the parameter - most effective for python builtin data types, such as 'str', 'bool'
474
+ * required (bool): Denotes whether the parameter is always present (required) or not. Defaults to not required.
475
+ tool_use_template (str, *optional*): A Jinja template to use for this conversion. If
476
+ this is not passed, the model's default tool use template will be used instead.
477
+ add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate
478
+ the start of an assistant message. This is useful when you want to generate a response from the model.
479
+ Note that this argument will be passed to the chat template, and so it must be supported in the
480
+ template for this argument to have any effect.
481
+ tokenize (`bool`, defaults to `True`):
482
+ Whether to tokenize the output. If `False`, the output will be a string.
483
+ padding (`bool`, defaults to `False`):
484
+ Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`.
485
+ truncation (`bool`, defaults to `False`):
486
+ Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
487
+ max_length (`int`, *optional*):
488
+ Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
489
+ not specified, the tokenizer's `max_length` attribute will be used as a default.
490
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
491
+ If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
492
+ values are:
493
+ - `'tf'`: Return TensorFlow `tf.Tensor` objects.
494
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
495
+ - `'np'`: Return NumPy `np.ndarray` objects.
496
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
497
+ return_dict (`bool`, *optional*, defaults to `False`):
498
+ Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
499
+ **tokenizer_kwargs: Additional kwargs to pass to the tokenizer.
500
+
501
+ Returns:
502
+ `str`: A rendered prompt string.
503
+ or if tokenize=True:
504
+ `List[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This
505
+ output is ready to pass to the model, either directly or via methods like `generate()`.
506
+
507
+ Examples:
508
+
509
+ ```python
510
+ >>> tokenizer = CohereTokenizerFast.from_pretrained("CohereForAI/c4ai-command-r-v01")
511
+ >>> tools = [
512
+ {
513
+ "name": "internet_search",
514
+ "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet",
515
+ "parameter_definitions": {
516
+ "query": {
517
+ "description": "Query to search the internet with",
518
+ "type": "str",
519
+ "required": True
520
+ }
521
+ }
522
+ },
523
+ {
524
+ "name": "directly_answer",
525
+ "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history",
526
+ "parameter_definitions": {}
527
+ }
528
+ ]
529
+ >>> conversation = [
530
+ {"role": "user", "content": "Whats the biggest penguin in the world?"}
531
+ ]
532
+ >>> # render the prompt, ready for user to inspect, or for input into the model:
533
+ >>> prompt = tokenizer.apply_tool_use_template(
534
+ conversation,
535
+ tools=tools,
536
+ tokenize=False,
537
+ add_generation_prompt=True,
538
+ )
539
+ >>> print(prompt)
540
+ <BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
541
+ The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
542
+
543
+ # System Preamble
544
+ ## Basic Rules
545
+ You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
546
+
547
+ # User Preamble
548
+ ## Task and Context
549
+ You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
550
+
551
+ ## Style Guide
552
+ Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
553
+
554
+ ## Available Tools
555
+ Here is a list of tools that you have available to you:
556
+
557
+ \`\`\`python
558
+ def internet_search(query: str) -> List[Dict]:
559
+ \"\"\"Returns a list of relevant document snippets for a textual query retrieved from the internet
560
+
561
+ Args:
562
+ query (str): Query to search the internet with
563
+ \"\"\"
564
+ pass
565
+ \`\`\`
566
+
567
+ \`\`\`python
568
+ def directly_answer() -> List[Dict]:
569
+ \"\"\"Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
570
+ \"\"\"
571
+ pass
572
+ \`\`\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
573
+ \`\`\`json
574
+ [
575
+ {
576
+ "tool_name": title of the tool in the specification,
577
+ "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
578
+ }
579
+ ]\`\`\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
580
+ ```
581
+ >>> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
582
+ >>> outputs = model.generate(inputs, max_new_tokens=128)
583
+ >>> print(tokenizer.decode(outputs[0]))
584
+ Action: ```json
585
+ [
586
+ {
587
+ "tool_name": "internet_search",
588
+ "parameters": {
589
+ "query": "biggest penguin in the world"
590
+ }
591
+ }
592
+ ]
593
+ ```
594
+ """
595
+ # priority: `tool_use_template` argument > `tokenizer.tool_use_template` > `tokenizer.default_tool_use_template`
596
+ if tool_use_template is None:
597
+ if self.tool_use_template is not None:
598
+ tool_use_template = self.tool_use_template
599
+ else:
600
+ tool_use_template = self.default_tool_use_template
601
+
602
+ return self._apply_template_with_arguments(
603
+ conversation,
604
+ tools=tools,
605
+ template=tool_use_template,
606
+ **kwargs,
607
+ )
608
+
609
+ def apply_grounded_generation_template(
610
+ self,
611
+ conversation: Union[List[Dict[str, str]], "Conversation"],
612
+ documents: List[Dict],
613
+ citation_mode: Literal["fast", "accurate"] = "accurate",
614
+ grounded_generation_template: Optional[str] = None,
615
+ **kwargs
616
+ ) -> Union[str, List[int]]:
617
+ """Create a Command-R grounded generation (aka RAG) prompt.
618
+
619
+ Once rendered, the prompt instructs the model to generate a response with citations in, based on supplied documents.
620
+
621
+ Conceptually, this works in the same way as `apply_chat_template`, but takes additional `documents`
622
+ and `citation_mode` parameters.
623
+
624
+ Converts a Conversation object or a list of dictionaries with `"role"` and `"content"` keys and a list of
625
+ documents for the model to ground its response on into a prompt string, or a list of token ids.
626
+ This method will use the tokenizer's `grounded_generation_template` template specified at the class level.
627
+ You can override the default template using the `grounded_generation_template` kwarg but the quality of your results may decrease.
628
+
629
+ Args:
630
+ conversation (Union[List[Dict[str, str]], "Conversation"]): A Conversation object or list of dicts
631
+ with "role" and "content" keys, representing the chat history so far.
632
+ documents (List[Dict[str, str]]): A list of dicts, representing documents or tool outputs to ground your
633
+ generation on. A document is a semistructured dict, with a string to string mapping. Common fields are
634
+ `url`, `title`, `snippet` etc., but keys should be descriptive of their content. They will get rendered into the prompt.
635
+ citation_mode: either "accurate" (prompt the model to generate an answer first, then rewrite it with citation
636
+ spans in) or "fast", where the prompt instructs the model to generate an answer with citations in directly.
637
+ The former has higher quality citations, the latter requires fewer tokens to be generated.
638
+ grounded_generation_template (str, *optional*): A Jinja template to use for this conversion. If
639
+ this is not passed, the model's default grounded_generation_template template will be used instead.
640
+ add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate
641
+ the start of an assistant message. This is useful when you want to generate a response from the model.
642
+ Note that this argument will be passed to the chat template, and so it must be supported in the
643
+ template for this argument to have any effect.
644
+ tokenize (`bool`, defaults to `True`):
645
+ Whether to tokenize the output. If `False`, the output will be a string.
646
+ padding (`bool`, defaults to `False`):
647
+ Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`.
648
+ truncation (`bool`, defaults to `False`):
649
+ Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
650
+ max_length (`int`, *optional*):
651
+ Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
652
+ not specified, the tokenizer's `max_length` attribute will be used as a default.
653
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
654
+ If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
655
+ values are:
656
+ - `'tf'`: Return TensorFlow `tf.Tensor` objects.
657
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
658
+ - `'np'`: Return NumPy `np.ndarray` objects.
659
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
660
+ return_dict (`bool`, *optional*, defaults to `False`):
661
+ Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
662
+ **tokenizer_kwargs: Additional kwargs to pass to the tokenizer.
663
+
664
+ Returns:
665
+ `str`: A rendered prompt string.
666
+ or if tokenize=True:
667
+ `List[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This
668
+ output is ready to pass to the model, either directly or via methods like `generate()`.
669
+
670
+ Examples:
671
+
672
+ ```python
673
+ >>> tokenizer = CohereTokenizerFast.from_pretrained('CohereForAI/c4ai-command-r-v01')
674
+
675
+ >>> # define documents:
676
+ >>> documents = [
677
+ { "title": "Tall penguins", "text": "Emperor penguins are the tallest." },
678
+ { "title": "Penguin habitats", "text": "Emperor penguins only live in Antarctica."}
679
+ ]
680
+ >>> # define a conversation:
681
+ >>> conversation = [
682
+ {"role": "user", "content": "Whats the biggest penguin in the world?"}
683
+ ]
684
+ >>> # render the prompt, ready for user to inspect, or for input into the model:
685
+ >>> grounded_generation_prompt = tokenizer.apply_grounded_generation_template(
686
+ conversation,
687
+ documents=documents,
688
+ tokenize=False,
689
+ add_generation_prompt=True,
690
+ )
691
+ >>> print(grounded_generation_prompt)
692
+ <BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
693
+ The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
694
+
695
+ # System Preamble
+ ## Basic Rules
696
+ You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
697
+
698
+ # User Preamble
699
+ ## Task and Context
700
+ You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
701
+
702
+ ## Style Guide
703
+ Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><results>
704
+ Document: 0
705
+ title: Tall penguins
706
+ text: Emperor penguins are the tallest.
707
+
708
+ Document: 1
709
+ title: Penguin habitats
710
+ text: Emperor penguins only live in Antarctica.
711
+ </results><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line.
712
+ Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.
713
+ Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.
714
+ Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.
715
+ Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
716
+ ```
717
+ >>> inputs = tokenizer.encode(grounded_generation_prompt, add_special_tokens=False, return_tensors='pt')
718
+ >>> outputs = model.generate(inputs, max_new_tokens=128)
719
+ >>> print(tokenizer.decode(outputs[0]))
720
+ Relevant Documents: 0,1
721
+ Cited Documents: 0,1
722
+ Answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres.
723
+ Grounded answer: The <co: 0>Emperor Penguin</co: 0> is the <co: 0>tallest</co: 0> or biggest penguin in the world. It is a bird that <co: 1>lives only in Antarctica</co: 1> and <co: 0>grows to a height of around 122 centimetres.</co: 0>
724
+ """
725
+ # priority: `grounded_generation_template` argument > `tokenizer.grounded_generation_template` > `tokenizer.default_grounded_generation_template`
726
+ if grounded_generation_template is None:
727
+ if self.grounded_generation_template is not None:
728
+ grounded_generation_template = self.grounded_generation_template
729
+ else:
730
+ grounded_generation_template = self.default_grounded_generation_template
731
+
732
+ return self._apply_template_with_arguments(
733
+ conversation,
734
+ documents=documents,
735
+ template=grounded_generation_template,
736
+ citation_mode=citation_mode,
737
+ **kwargs,
738
+ )
739
+
740
+ # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
741
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
742
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
743
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
744
+
745
+ output = bos_token_id + token_ids_0 + eos_token_id
746
+
747
+ if token_ids_1 is not None:
748
+ output = output + bos_token_id + token_ids_1 + eos_token_id
749
+
750
+ return output
751
+
752
+
753
+ # register the tokenizer to AutoTokenizer
754
+ AutoTokenizer.register(CohereConfig, fast_tokenizer_class=CohereTokenizerFast)
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cc8a79eafcf1043fbfad77df083de446a61424b222284d602c4edee497ce1e4
3
+ size 12777405
tokenizer_config.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<PAD>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<UNK>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<CLS>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<SEP>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<MASK_TOKEN>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "5": {
46
+ "content": "<BOS_TOKEN>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "6": {
54
+ "content": "<EOS_TOKEN>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "7": {
62
+ "content": "<EOP_TOKEN>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "255000": {
70
+ "content": "<|START_OF_TURN_TOKEN|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "255001": {
78
+ "content": "<|END_OF_TURN_TOKEN|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "255002": {
86
+ "content": "<|YES_TOKEN|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "255003": {
94
+ "content": "<|NO_TOKEN|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "255004": {
102
+ "content": "<|GOOD_TOKEN|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "255005": {
110
+ "content": "<|BAD_TOKEN|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "255006": {
118
+ "content": "<|USER_TOKEN|>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "255007": {
126
+ "content": "<|CHATBOT_TOKEN|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "255008": {
134
+ "content": "<|SYSTEM_TOKEN|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "255009": {
142
+ "content": "<|USER_0_TOKEN|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "255010": {
150
+ "content": "<|USER_1_TOKEN|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "255011": {
158
+ "content": "<|USER_2_TOKEN|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "255012": {
166
+ "content": "<|USER_3_TOKEN|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "255013": {
174
+ "content": "<|USER_4_TOKEN|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "255014": {
182
+ "content": "<|USER_5_TOKEN|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "255015": {
190
+ "content": "<|USER_6_TOKEN|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "255016": {
198
+ "content": "<|USER_7_TOKEN|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "255017": {
206
+ "content": "<|USER_8_TOKEN|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "255018": {
214
+ "content": "<|USER_9_TOKEN|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "255019": {
222
+ "content": "<|EXTRA_0_TOKEN|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "255020": {
230
+ "content": "<|EXTRA_1_TOKEN|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "255021": {
238
+ "content": "<|EXTRA_2_TOKEN|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "255022": {
246
+ "content": "<|EXTRA_3_TOKEN|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "255023": {
254
+ "content": "<|EXTRA_4_TOKEN|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "255024": {
262
+ "content": "<|EXTRA_5_TOKEN|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "255025": {
270
+ "content": "<|EXTRA_6_TOKEN|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "255026": {
278
+ "content": "<|EXTRA_7_TOKEN|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "255027": {
286
+ "content": "<|EXTRA_8_TOKEN|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "255028": {
294
+ "content": "<|EXTRA_9_TOKEN|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ }
301
+ },
302
+ "auto_map": {
303
+ "AutoTokenizer": [
304
+ null,
305
+ "tokenization_cohere_fast.CohereTokenizerFast"
306
+ ]
307
+ },
308
+ "bos_token": "<BOS_TOKEN>",
309
+ "clean_up_tokenization_spaces": false,
310
+ "eos_token": "<|END_OF_TURN_TOKEN|>",
311
+ "legacy": true,
312
+ "model_max_length": 1000000000000000019884624838656,
313
+ "pad_token": "<PAD>",
314
+ "sp_model_kwargs": {},
315
+ "spaces_between_special_tokens": false,
316
+ "tokenizer_class": "CohereTokenizer",
317
+ "unk_token": null,
318
+ "use_default_system_prompt": false
319
+ }