shunxing1234 committed on
Commit
8bdc10d
1 Parent(s): 5e72216

Upload 11 files

Browse files
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_residual_connection_post_layernorm": false,
3
+ "architectures": [
4
+ "TelechatForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_telechat.TelechatConfig",
8
+ "AutoModelForCausalLM": "modeling_telechat.TelechatForCausalLM"
9
+ },
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "bias_dropout_fusion": true,
13
+ "bos_token_id": 1,
14
+ "eos_token_id": 2,
15
+ "hidden_dropout": 0.0,
16
+ "hidden_size": 8192,
17
+ "initializer_range": 0.02,
18
+ "layer_norm_epsilon": 1e-08,
19
+ "masked_softmax_fusion": true,
20
+ "model_type": "telechat",
21
+ "n_head": 64,
22
+ "n_inner": null,
23
+ "num_key_value_heads": 8,
24
+ "n_layer": 96,
25
+ "pad_token_id": 3,
26
+ "pretraining_tp": 2,
27
+ "skip_bias_add": false,
28
+ "skip_bias_add_qkv": false,
29
+ "slow_but_exact": false,
30
+ "unk_token_id": 0,
31
+ "use_cache": true,
32
+ "vocab_size": 131072,
33
+ "ffn_hidden_size": 40960,
34
+ "flash_attn":true,
35
+ "tie_word_embeddings":false,
36
+ "training_seqlen":8192,
37
+ "base_seqlen":8192,
38
+ "seq_length": 8192
39
+ }
40
+
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"task":"text-generation"}
configuration_telechat.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ Telechat configuration"""
17
+
18
+ from packaging import version
19
+ from collections import OrderedDict
20
+ from transformers.utils import is_torch_available, logging
21
+ from transformers.configuration_utils import PretrainedConfig
22
+ from typing import TYPE_CHECKING, Any, List, Mapping, Optional
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
class TelechatConfig(PretrainedConfig):
    """Configuration class holding the hyper-parameters of a Telechat model.

    Args:
        vocab_size (`int`, *optional*, defaults to 160256): Vocabulary size of the Telechat model.
        hidden_size (`int`, *optional*, defaults to 4096): Dimensionality of the embeddings and hidden states.
        ffn_hidden_size (`int`, *optional*, defaults to 12288): Dimensionality of the feed-forward hidden states.
        n_layer (`int`, *optional*, defaults to 30): Number of hidden layers in the Transformer.
        n_head (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): If enabled, use the layer norm of the hidden states as the residual in the transformer blocks.
        hidden_dropout (`float`, *optional*, defaults to 0.0): Dropout rate of the dropout function on the bias dropout.
        attention_dropout (`float`, *optional*, defaults to 0.0): Dropout rate applied to the attention probs.
        use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions.
        bos_token_id (`int`, *optional*, defaults to 1): Id of the beginning-of-sequence token.
        eos_token_id (`int`, *optional*, defaults to 2): Id of the end-of-sequence token.
        training_seqlen (`int`, *optional*, defaults to 8192): Sequence length during last finetuning.
        logn (`bool`, *optional*, defaults to `True`): Whether or not to use logN during extrapolation.
        embed_layernorm (`bool`, *optional*, defaults to `False`): Whether or not to use embedding layernorm.
        n_embed (`int`, *optional*): Keyword-only alias read from `kwargs`; when given it overrides `hidden_size`.
        num_key_value_heads (`int`, *optional*): Read from `kwargs` and stored as-is; `None` when absent.
            Presumably the number of key/value heads used by the attention layers -- confirm against the modeling code.
        kwargs: Remaining keyword arguments are forwarded to `PretrainedConfig.__init__`.
    """

    model_type = "telechat"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets the generic `num_hidden_layers` / `num_attention_heads` names
    # resolve to this config's own attribute names.
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }

    def __init__(
        self,
        vocab_size=160256,
        hidden_size=4096,
        n_layer=30,
        n_head=32,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        apply_residual_connection_post_layernorm=False,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        ffn_hidden_size=12288,
        training_seqlen=8192,
        logn=True,
        embed_layernorm=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        # `n_embed`, when supplied, takes precedence over `hidden_size`.
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.logn = logn
        self.ffn_hidden_size = ffn_hidden_size
        self.training_seqlen = training_seqlen
        self.embed_layernorm = embed_layernorm
        # Popped so it is not forwarded to the base class a second time.
        self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
94
+
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "max_length": 8192,
3
+ "do_sample": false,
4
+ "use_cache": true,
5
+ "temperature": 0.3,
6
+ "top_k": 5,
7
+ "top_p": 0.85,
8
+ "repetition_penalty": 1.03,
9
+ "pad_token_id": 3,
10
+ "bos_token_id": 1,
11
+ "eos_token_id": 2,
12
+ "user_token_id": 4,
13
+ "bot_token_id": 5
14
+ }
generation_utils.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from collections import deque
3
+ from queue import Queue
4
+ import copy
5
+
6
+
7
class History:
    """Conversation history kept as a deque of tokenized message dicts.

    Each message is a dict that carries at least a ``"content"`` entry.  The
    output of ``tokenizer(content)`` is merged into the message dict itself,
    so the dict objects passed in by the caller are mutated in place.
    """

    def __init__(self, tokenizer, history):
        """Build the history from an optional list of message dicts.

        Args:
            tokenizer: Callable invoked as ``tokenizer(content)``; its result
                is merged into the message with ``dict.update``.
            history: Iterable of message dicts, or a falsy value for an empty
                history.
        """
        # A deque gives cheap insertion and removal at both ends.
        self.input_history = deque()
        self.tokenizer = tokenizer
        if history:
            self._transfer_from_list(history)

    def _transfer_from_list(self, history):
        """Tokenize every message of ``history`` and append it in order.

        Messages are always re-tokenized here, even when they already carry
        token fields; the result may differ from the tokens the model
        originally generated.
        """
        for message in history:
            message.update(self.tokenizer(message.get("content")))
            self.input_history.append(message)

    def _ensure_tokenized(self, message):
        """Tokenize ``message`` in place unless both token fields are present."""
        if "input_ids" not in message or "attention_mask" not in message:
            message.update(self.tokenizer(message.get("content")))
        return message

    def append(self, message):
        """Add ``message`` at the right end, tokenizing it if needed."""
        self.input_history.append(self._ensure_tokenized(message))

    def append_left(self, message):
        """Add ``message`` at the left end, tokenizing it if needed."""
        self.input_history.appendleft(self._ensure_tokenized(message))

    def pop(self):
        """Remove and return the right-most message.

        Raises:
            IndexError: If the history is empty.
        """
        return self.input_history.pop()

    def pop_left(self):
        """Remove and return the left-most message.

        Raises:
            IndexError: If the history is empty.
        """
        # deque spells this `popleft`; `pop_left` does not exist on deque.
        return self.input_history.popleft()

    def update(self, message):
        """Replace the right-most message with ``message``.

        Raises:
            IndexError: If the history is empty.
        """
        self.input_history.pop()
        self.append(message)

    def __len__(self):
        return len(self.input_history)

    def __str__(self):
        return str(self.input_history)

    def __copy__(self):
        """Return a shallow copy: a new deque holding the same message dicts."""
        new_instance = type(self)(self.tokenizer, [])
        new_instance.input_history = copy.copy(self.input_history)
        return new_instance

    def __deepcopy__(self, memodict=None):
        """Return a copy with deep-copied messages; the tokenizer is shared."""
        new_instance = type(self)(self.tokenizer, [])
        if memodict is not None:
            # Register early so reference cycles resolve to the new instance.
            memodict[id(self)] = new_instance
        new_instance.input_history = copy.deepcopy(self.input_history, memodict)
        return new_instance
65
+
66
+
67
class TelechatIterTextStreamer:
    """Iterator-style streamer that yields ``(text_chunk, history)`` tuples.

    Written with reference to the TextIterStreamer in transformers.  Token ids
    handed to :meth:`put` are accumulated, the whole accumulated sequence is
    decoded each time, and the newly decoded suffix is pushed onto a queue
    that ``__next__`` reads from.  The last message of ``history`` is kept in
    sync with the text decoded so far.
    """

    # Number of consecutive-or-not undecodable steps tolerated before text is
    # emitted regardless.  The counter is only reset by `clear_cache`, so once
    # the limit is reached every later chunk is emitted immediately.
    MAX_CACHE_STEPS = 6

    def __init__(
            self, tokenizer, history: Optional["History"] = None, skip_prompt: bool = False,
            timeout: Optional[float] = None, **decode_kwargs
    ):
        """Create the streamer and append an empty "bot" message to ``history``.

        Args:
            tokenizer: Object providing ``decode(ids, **decode_kwargs)`` and
                ``eos_token_id``.
            history: History object whose last message is updated while
                streaming.  When ``None``, an empty ``History`` is created.
            skip_prompt: If true, the first value passed to :meth:`put` is
                discarded.
            timeout: Timeout in seconds for queue operations; ``None`` blocks.
            **decode_kwargs: Extra keyword arguments forwarded to
                ``tokenizer.decode``.
        """
        if history is None:
            # The original default crashed on `None.append`; start empty instead.
            history = History(tokenizer, [])

        self.tokenizer = tokenizer
        self.history = history
        self.skip_prompt = skip_prompt
        self.timeout = timeout
        self.decode_kwargs = decode_kwargs

        self.text_queue = Queue()
        self.cache_time = 0
        self.text_until = ""
        self.token_until = []
        self.stop_signal = None
        self.next_tokens_are_prompt = True

        # Placeholder message that `on_finalized_text` replaces on every update.
        self.history.append({"role": "bot", "content": self.text_until})

    def put(self, value):
        """Accumulate new token ids and queue any newly decodable text.

        Args:
            value: Array-like with ``shape`` and ``tolist()``; either 1-D or
                2-D with a leading dimension of 1.

        Raises:
            ValueError: If ``value`` is 2-D with more than one row.
        """
        if len(value.shape) > 1:
            if value.shape[0] > 1:
                raise ValueError("TextStreamer only supports batch size 1")
            value = value[0]

        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return

        # A chunk ending in EOS is dropped entirely.
        if value[-1] == self.tokenizer.eos_token_id:
            return

        # Decode the full sequence every time; partial multi-byte characters
        # only become valid once their remaining tokens have arrived.
        self.token_until.extend(value.tolist())
        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)

        if not self._is_printable(text) and self.cache_time < self.MAX_CACHE_STEPS:
            # Hold the text back and wait for more tokens.
            self.cache_time += 1
            return

        output_text = text[len(self.text_until):]
        self.text_until = text
        self.on_finalized_text(output_text)

    def end(self):
        """Flush the remaining text, queue the stop signal and reset state."""
        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)
        output_text = text[len(self.text_until):]
        self.text_until = text
        self.on_finalized_text(output_text, stream_end=True)
        self.clear_cache()

    def clear_cache(self):
        """Reset the streaming state.

        The reference to ``history`` is dropped as well, so ``self.history``
        must be assigned again before this streamer is fed new tokens.
        """
        self.cache_time = 0
        self.token_until = []
        self.text_until = ""
        self.history = None
        self.next_tokens_are_prompt = True

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Update the last history message and queue ``(text, history)``.

        When ``stream_end`` is true, a second tuple carrying the stop signal
        is queued so that iteration terminates.
        """
        self.history.update({"role": "bot", "content": self.text_until, "input_ids": self.token_until,
                             "attention_mask": [1] * len(self.token_until)})
        self.text_queue.put((text, self.history), timeout=self.timeout)
        if stream_end:
            self.text_queue.put((self.stop_signal, self.history), timeout=self.timeout)

    @staticmethod
    def _is_printable(cp):
        """Return False when ``cp`` contains the U+FFFD replacement character."""
        return "�" not in cp

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(text, history)`` tuple; stop at the stop signal."""
        value_now, history_until = self.text_queue.get(timeout=self.timeout)
        # Identity check: the sentinel is `None`, and "" must not match it.
        if value_now is self.stop_signal:
            raise StopIteration()
        return value_now, history_until
model-00001-of-00050.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2921b909cc707a1721cafc00c2e1561be50c38167e23dbca60b5ea572d5002d1
3
+ size 335676680
model-00002-of-00050.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ed55aa2b919c1ba154295d40dfde172a24082d6ddd620e0dae839f937a17c2
3
+ size 671353120
model-00003-of-00050.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b246d4f08d7596bc12fabe3b1e1549c65e3ea4adfa899d5e8c9c2025cdddb41
3
+ size 671353120
model-00004-of-00050.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1dc197e9f73d62796c0178ca6dbcc8dddbd14d12099cbadb4ace4fc721fbf5b
3
+ size 671353120
model-00005-of-00050.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a0961a9ee82913de040c575e72d87596ed30fe197a56815c4a7fdb33874307
3
+ size 671353120
model.safetensors.index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata": {"total_size": 453131665408}, "weight_map": {"transformer.word_embeddings.weight": "model-00001-of-00050.safetensors", "transformer.h.0.input_layernorm.weight": "model-00001-of-00050.safetensors", "transformer.h.0.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.dense.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.dense.bias": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.query.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.key_value.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.gate_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.down_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.down_proj.bias": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.up_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.1.input_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.1.post_attention_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.dense.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.dense.bias": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.query.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.key_value.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.gate_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.down_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.down_proj.bias": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.up_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.input_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.2.post_attention_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.dense.weight": "model-00002-of-00050.safetensors", 
"transformer.h.2.self_attention.dense.bias": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.query.weight": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.key_value.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.gate_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.down_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.down_proj.bias": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.up_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.3.input_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.3.post_attention_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.dense.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.dense.bias": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.query.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.key_value.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.gate_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.down_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.down_proj.bias": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.up_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.4.input_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.4.post_attention_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.dense.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.dense.bias": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.query.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.key_value.weight": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.gate_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.down_proj.weight": 
"model-00003-of-00050.safetensors", "transformer.h.4.mlp.down_proj.bias": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.up_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.5.input_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.5.post_attention_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.dense.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.dense.bias": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.query.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.key_value.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.gate_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.down_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.down_proj.bias": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.up_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.input_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.6.post_attention_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.dense.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.dense.bias": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.query.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.key_value.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.gate_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.down_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.down_proj.bias": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.up_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.7.input_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.7.post_attention_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.dense.weight": 
"model-00005-of-00050.safetensors", "transformer.h.7.self_attention.dense.bias": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.query.weight": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.key_value.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.gate_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.down_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.down_proj.bias": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.up_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.input_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.8.post_attention_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.dense.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.dense.bias": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.query.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.key_value.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.gate_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.down_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.down_proj.bias": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.up_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.9.input_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.9.post_attention_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.dense.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.dense.bias": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.query.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.key_value.weight": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.gate_proj.weight": "model-00006-of-00050.safetensors", 
"transformer.h.9.mlp.down_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.down_proj.bias": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.up_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.input_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.10.post_attention_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.dense.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.dense.bias": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.query.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.key_value.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.gate_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.down_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.down_proj.bias": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.up_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.11.input_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.11.post_attention_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.dense.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.dense.bias": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.query.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.key_value.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.gate_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.down_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.down_proj.bias": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.up_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.input_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.12.post_attention_layernorm.weight": 
"model-00007-of-00050.safetensors", "transformer.h.12.self_attention.dense.weight": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.dense.bias": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.query.weight": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.key_value.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.gate_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.down_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.down_proj.bias": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.up_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.13.input_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.13.post_attention_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.dense.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.dense.bias": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.query.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.key_value.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.gate_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.down_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.down_proj.bias": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.up_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.input_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.14.post_attention_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.dense.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.dense.bias": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.query.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.key_value.weight": "model-00008-of-00050.safetensors", 
"transformer.h.14.mlp.gate_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.down_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.down_proj.bias": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.up_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.15.input_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.15.post_attention_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.dense.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.dense.bias": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.query.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.key_value.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.gate_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.down_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.down_proj.bias": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.up_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.input_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.16.post_attention_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.dense.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.dense.bias": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.query.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.key_value.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.gate_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.down_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.down_proj.bias": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.up_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.17.input_layernorm.weight": 
"model-00010-of-00050.safetensors", "transformer.h.17.post_attention_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.dense.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.dense.bias": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.query.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.key_value.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.gate_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.down_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.down_proj.bias": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.up_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.input_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.18.post_attention_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.dense.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.dense.bias": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.query.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.key_value.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.gate_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.down_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.down_proj.bias": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.up_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.19.input_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.19.post_attention_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.dense.weight": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.dense.bias": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.query.weight": "model-00011-of-00050.safetensors", 
"transformer.h.19.self_attention.key_value.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.gate_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.down_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.down_proj.bias": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.up_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.input_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.20.post_attention_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.dense.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.dense.bias": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.query.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.key_value.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.gate_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.down_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.down_proj.bias": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.up_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.21.input_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.21.post_attention_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.dense.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.dense.bias": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.query.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.key_value.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.gate_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.down_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.down_proj.bias": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.up_proj.weight": 
"model-00012-of-00050.safetensors", "transformer.h.22.input_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.22.post_attention_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.dense.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.dense.bias": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.query.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.key_value.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.gate_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.down_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.down_proj.bias": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.up_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.23.input_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.23.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.dense.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.dense.bias": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.query.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.key_value.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.gate_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.down_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.down_proj.bias": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.up_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.input_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.24.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.dense.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.dense.bias": "model-00013-of-00050.safetensors", 
"transformer.h.24.self_attention.query.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.key_value.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.gate_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.down_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.down_proj.bias": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.up_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.25.input_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.25.post_attention_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.dense.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.dense.bias": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.query.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.key_value.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.gate_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.down_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.down_proj.bias": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.up_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.input_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.26.post_attention_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.dense.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.dense.bias": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.query.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.key_value.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.gate_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.down_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.down_proj.bias": 
"model-00014-of-00050.safetensors", "transformer.h.26.mlp.up_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.27.input_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.27.post_attention_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.dense.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.dense.bias": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.query.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.key_value.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.gate_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.down_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.down_proj.bias": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.up_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.input_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.28.post_attention_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.dense.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.dense.bias": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.query.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.key_value.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.gate_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.down_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.down_proj.bias": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.up_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.29.input_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.29.post_attention_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.dense.weight": "model-00016-of-00050.safetensors", 
"transformer.h.29.self_attention.dense.bias": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.query.weight": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.key_value.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.gate_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.down_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.down_proj.bias": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.up_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.input_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.30.post_attention_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.dense.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.dense.bias": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.query.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.key_value.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.gate_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.down_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.down_proj.bias": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.up_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.31.input_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.31.post_attention_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.dense.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.dense.bias": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.query.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.key_value.weight": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.gate_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.down_proj.weight": 
"model-00017-of-00050.safetensors", "transformer.h.31.mlp.down_proj.bias": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.up_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.input_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.32.post_attention_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.dense.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.dense.bias": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.query.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.key_value.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.gate_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.down_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.down_proj.bias": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.up_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.33.input_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.33.post_attention_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.dense.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.dense.bias": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.query.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.key_value.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.gate_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.down_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.down_proj.bias": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.up_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.input_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.34.post_attention_layernorm.weight": "model-00018-of-00050.safetensors", 
"transformer.h.34.self_attention.dense.weight": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.dense.bias": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.query.weight": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.key_value.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.gate_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.down_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.down_proj.bias": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.up_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.35.input_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.35.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.dense.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.dense.bias": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.query.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.key_value.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.gate_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.down_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.down_proj.bias": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.up_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.36.input_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.36.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.dense.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.dense.bias": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.query.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.key_value.weight": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.gate_proj.weight": 
"model-00019-of-00050.safetensors", "transformer.h.36.mlp.down_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.down_proj.bias": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.up_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.37.input_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.37.post_attention_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.dense.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.dense.bias": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.query.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.key_value.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.gate_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.down_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.down_proj.bias": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.up_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.input_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.38.post_attention_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.dense.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.dense.bias": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.query.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.key_value.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.gate_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.down_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.down_proj.bias": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.up_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.39.input_layernorm.weight": "model-00021-of-00050.safetensors", 
"transformer.h.39.post_attention_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.dense.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.dense.bias": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.query.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.key_value.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.gate_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.down_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.down_proj.bias": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.up_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.input_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.40.post_attention_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.dense.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.dense.bias": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.query.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.key_value.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.gate_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.down_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.down_proj.bias": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.up_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.41.input_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.41.post_attention_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.dense.weight": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.dense.bias": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.query.weight": "model-00022-of-00050.safetensors", 
"transformer.h.41.self_attention.key_value.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.gate_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.down_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.down_proj.bias": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.up_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.input_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.42.post_attention_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.dense.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.dense.bias": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.query.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.key_value.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.gate_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.down_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.down_proj.bias": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.up_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.43.input_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.43.post_attention_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.dense.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.dense.bias": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.query.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.key_value.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.gate_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.down_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.down_proj.bias": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.up_proj.weight": 
"model-00023-of-00050.safetensors", "transformer.h.44.input_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.44.post_attention_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.dense.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.dense.bias": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.query.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.key_value.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.gate_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.down_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.down_proj.bias": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.up_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.45.input_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.45.post_attention_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.dense.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.dense.bias": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.query.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.key_value.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.gate_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.down_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.down_proj.bias": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.up_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.input_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.46.post_attention_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.dense.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.dense.bias": "model-00024-of-00050.safetensors", 
"transformer.h.46.self_attention.query.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.key_value.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.gate_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.down_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.down_proj.bias": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.up_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.47.input_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.47.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.dense.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.dense.bias": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.query.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.key_value.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.gate_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.down_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.down_proj.bias": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.up_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.input_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.48.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.dense.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.dense.bias": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.query.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.key_value.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.gate_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.down_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.down_proj.bias": 
"model-00025-of-00050.safetensors", "transformer.h.48.mlp.up_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.49.input_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.49.post_attention_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.dense.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.dense.bias": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.query.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.key_value.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.gate_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.down_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.down_proj.bias": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.up_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.input_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.50.post_attention_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.dense.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.dense.bias": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.query.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.key_value.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.gate_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.down_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.down_proj.bias": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.up_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.51.input_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.51.post_attention_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.dense.weight": "model-00027-of-00050.safetensors", 
"transformer.h.51.self_attention.dense.bias": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.query.weight": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.key_value.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.gate_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.down_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.down_proj.bias": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.up_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.input_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.52.post_attention_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.dense.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.dense.bias": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.query.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.key_value.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.gate_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.down_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.down_proj.bias": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.up_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.53.input_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.53.post_attention_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.dense.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.dense.bias": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.query.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.key_value.weight": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.gate_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.down_proj.weight": 
"model-00028-of-00050.safetensors", "transformer.h.53.mlp.down_proj.bias": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.up_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.input_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.54.post_attention_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.dense.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.dense.bias": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.query.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.key_value.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.gate_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.down_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.down_proj.bias": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.up_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.55.input_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.55.post_attention_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.dense.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.dense.bias": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.query.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.key_value.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.gate_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.down_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.down_proj.bias": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.up_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.input_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.56.post_attention_layernorm.weight": "model-00029-of-00050.safetensors", 
"transformer.h.56.self_attention.dense.weight": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.dense.bias": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.query.weight": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.key_value.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.gate_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.down_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.down_proj.bias": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.up_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.57.input_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.57.post_attention_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.dense.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.dense.bias": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.query.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.key_value.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.gate_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.down_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.down_proj.bias": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.up_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.58.input_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.58.post_attention_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.dense.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.dense.bias": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.query.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.key_value.weight": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.gate_proj.weight": 
"model-00030-of-00050.safetensors", "transformer.h.58.mlp.down_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.down_proj.bias": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.up_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.59.input_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.59.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.dense.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.dense.bias": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.query.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.key_value.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.gate_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.down_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.down_proj.bias": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.up_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.input_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.60.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.dense.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.dense.bias": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.query.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.key_value.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.gate_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.down_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.down_proj.bias": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.up_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.61.input_layernorm.weight": "model-00032-of-00050.safetensors", 
"transformer.h.61.post_attention_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.dense.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.dense.bias": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.query.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.key_value.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.gate_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.down_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.down_proj.bias": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.up_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.input_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.62.post_attention_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.dense.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.dense.bias": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.query.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.key_value.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.gate_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.down_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.down_proj.bias": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.up_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.63.input_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.63.post_attention_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.dense.weight": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.dense.bias": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.query.weight": "model-00033-of-00050.safetensors", 
"transformer.h.63.self_attention.key_value.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.gate_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.down_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.down_proj.bias": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.up_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.input_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.64.post_attention_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.dense.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.dense.bias": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.query.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.key_value.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.gate_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.down_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.down_proj.bias": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.up_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.65.input_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.65.post_attention_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.dense.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.dense.bias": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.query.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.key_value.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.gate_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.down_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.down_proj.bias": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.up_proj.weight": 
"model-00034-of-00050.safetensors", "transformer.h.66.input_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.66.post_attention_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.dense.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.dense.bias": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.query.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.key_value.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.gate_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.down_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.down_proj.bias": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.up_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.67.input_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.67.post_attention_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.dense.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.dense.bias": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.query.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.key_value.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.gate_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.down_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.down_proj.bias": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.up_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.input_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.68.post_attention_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.dense.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.dense.bias": "model-00035-of-00050.safetensors", 
"transformer.h.68.self_attention.query.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.key_value.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.gate_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.down_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.down_proj.bias": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.up_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.69.input_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.69.post_attention_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.dense.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.dense.bias": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.query.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.key_value.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.gate_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.down_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.down_proj.bias": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.up_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.input_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.70.post_attention_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.dense.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.dense.bias": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.query.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.key_value.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.gate_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.down_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.down_proj.bias": 
"model-00036-of-00050.safetensors", "transformer.h.70.mlp.up_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.71.input_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.71.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.dense.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.dense.bias": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.query.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.key_value.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.gate_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.down_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.down_proj.bias": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.up_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.input_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.72.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.dense.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.dense.bias": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.query.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.key_value.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.gate_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.down_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.down_proj.bias": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.up_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.73.input_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.73.post_attention_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.dense.weight": "model-00038-of-00050.safetensors", 
"transformer.h.73.self_attention.dense.bias": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.query.weight": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.key_value.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.gate_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.down_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.down_proj.bias": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.up_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.input_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.74.post_attention_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.dense.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.dense.bias": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.query.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.key_value.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.gate_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.down_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.down_proj.bias": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.up_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.75.input_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.75.post_attention_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.dense.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.dense.bias": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.query.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.key_value.weight": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.gate_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.down_proj.weight": 
"model-00039-of-00050.safetensors", "transformer.h.75.mlp.down_proj.bias": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.up_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.input_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.76.post_attention_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.dense.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.dense.bias": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.query.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.key_value.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.gate_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.down_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.down_proj.bias": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.up_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.77.input_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.77.post_attention_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.dense.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.dense.bias": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.query.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.key_value.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.gate_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.down_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.down_proj.bias": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.up_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.input_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.78.post_attention_layernorm.weight": "model-00040-of-00050.safetensors", 
"transformer.h.78.self_attention.dense.weight": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.dense.bias": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.query.weight": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.key_value.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.gate_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.down_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.down_proj.bias": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.up_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.79.input_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.79.post_attention_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.dense.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.dense.bias": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.query.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.key_value.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.gate_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.down_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.down_proj.bias": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.up_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.80.input_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.80.post_attention_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.dense.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.dense.bias": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.query.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.key_value.weight": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.gate_proj.weight": 
"model-00041-of-00050.safetensors", "transformer.h.80.mlp.down_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.down_proj.bias": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.up_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.81.input_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.81.post_attention_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.dense.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.dense.bias": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.query.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.key_value.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.gate_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.down_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.down_proj.bias": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.up_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.input_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.82.post_attention_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.dense.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.dense.bias": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.query.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.key_value.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.gate_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.down_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.down_proj.bias": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.up_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.83.input_layernorm.weight": "model-00043-of-00050.safetensors", 
"transformer.h.83.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.dense.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.dense.bias": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.query.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.key_value.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.gate_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.down_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.down_proj.bias": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.up_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.input_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.84.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.dense.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.dense.bias": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.query.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.key_value.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.gate_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.down_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.down_proj.bias": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.up_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.85.input_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.85.post_attention_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.dense.weight": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.dense.bias": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.query.weight": "model-00044-of-00050.safetensors", 
"transformer.h.85.self_attention.key_value.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.gate_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.down_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.down_proj.bias": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.up_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.input_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.86.post_attention_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.dense.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.dense.bias": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.query.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.key_value.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.gate_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.down_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.down_proj.bias": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.up_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.87.input_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.87.post_attention_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.dense.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.dense.bias": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.query.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.key_value.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.gate_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.down_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.down_proj.bias": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.up_proj.weight": 
"model-00045-of-00050.safetensors", "transformer.h.88.input_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.88.post_attention_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.dense.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.dense.bias": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.query.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.key_value.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.gate_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.down_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.down_proj.bias": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.up_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.89.input_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.89.post_attention_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.dense.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.dense.bias": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.query.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.key_value.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.gate_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.down_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.down_proj.bias": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.up_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.input_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.90.post_attention_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.dense.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.dense.bias": "model-00046-of-00050.safetensors", 
"transformer.h.90.self_attention.query.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.key_value.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.gate_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.down_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.down_proj.bias": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.up_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.91.input_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.91.post_attention_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.dense.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.dense.bias": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.query.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.key_value.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.gate_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.down_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.down_proj.bias": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.up_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.input_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.92.post_attention_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.dense.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.dense.bias": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.query.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.key_value.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.gate_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.down_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.down_proj.bias": 
"model-00047-of-00050.safetensors", "transformer.h.92.mlp.up_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.93.input_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.93.post_attention_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.dense.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.dense.bias": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.query.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.key_value.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.gate_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.down_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.down_proj.bias": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.up_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.input_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.94.post_attention_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.dense.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.dense.bias": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.query.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.key_value.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.gate_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.down_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.down_proj.bias": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.up_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.95.input_layernorm.weight": "model-00049-of-00050.safetensors", "transformer.h.95.post_attention_layernorm.weight": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.dense.weight": "model-00049-of-00050.safetensors", 
"transformer.h.95.self_attention.dense.bias": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.query.weight": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.key_value.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.gate_proj.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.down_proj.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.down_proj.bias": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.up_proj.weight": "model-00049-of-00050.safetensors", "transformer.ln_f.weight": "model-00050-of-00050.safetensors", "lm_head.weight": "model-00050-of-00050.safetensors"}}