shunxing1234
committed on
Commit
•
8bdc10d
1
Parent(s):
5e72216
Upload 11 files
Browse files- config.json +40 -0
- configuration.json +1 -0
- configuration_telechat.py +94 -0
- generation_config.json +14 -0
- generation_utils.py +162 -0
- model-00001-of-00050.safetensors +3 -0
- model-00002-of-00050.safetensors +3 -0
- model-00003-of-00050.safetensors +3 -0
- model-00004-of-00050.safetensors +3 -0
- model-00005-of-00050.safetensors +3 -0
- model.safetensors.index.json +1 -0
config.json
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"apply_residual_connection_post_layernorm": false,
|
3 |
+
"architectures": [
|
4 |
+
"TelechatForCausalLM"
|
5 |
+
],
|
6 |
+
"auto_map": {
|
7 |
+
"AutoConfig": "configuration_telechat.TelechatConfig",
|
8 |
+
"AutoModelForCausalLM": "modeling_telechat.TelechatForCausalLM"
|
9 |
+
},
|
10 |
+
"attention_dropout": 0.0,
|
11 |
+
"attention_softmax_in_fp32": true,
|
12 |
+
"bias_dropout_fusion": true,
|
13 |
+
"bos_token_id": 1,
|
14 |
+
"eos_token_id": 2,
|
15 |
+
"hidden_dropout": 0.0,
|
16 |
+
"hidden_size": 8192,
|
17 |
+
"initializer_range": 0.02,
|
18 |
+
"layer_norm_epsilon": 1e-08,
|
19 |
+
"masked_softmax_fusion": true,
|
20 |
+
"model_type": "telechat",
|
21 |
+
"n_head": 64,
|
22 |
+
"n_inner": null,
|
23 |
+
"num_key_value_heads": 8,
|
24 |
+
"n_layer": 96,
|
25 |
+
"pad_token_id": 3,
|
26 |
+
"pretraining_tp": 2,
|
27 |
+
"skip_bias_add": false,
|
28 |
+
"skip_bias_add_qkv": false,
|
29 |
+
"slow_but_exact": false,
|
30 |
+
"unk_token_id": 0,
|
31 |
+
"use_cache": true,
|
32 |
+
"vocab_size": 131072,
|
33 |
+
"ffn_hidden_size": 40960,
|
34 |
+
"flash_attn":true,
|
35 |
+
"tie_word_embeddings":false,
|
36 |
+
"training_seqlen":8192,
|
37 |
+
"base_seqlen":8192,
|
38 |
+
"seq_length": 8192
|
39 |
+
}
|
40 |
+
|
configuration.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"task":"text-generation"}
|
configuration_telechat.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
""" Telechat configuration"""
|
17 |
+
|
18 |
+
from packaging import version
|
19 |
+
from collections import OrderedDict
|
20 |
+
from transformers.utils import is_torch_available, logging
|
21 |
+
from transformers.configuration_utils import PretrainedConfig
|
22 |
+
from typing import TYPE_CHECKING, Any, List, Mapping, Optional
|
23 |
+
|
24 |
+
logger = logging.get_logger(__name__)
|
25 |
+
|
26 |
+
class TelechatConfig(PretrainedConfig):
    """
    Configuration class for the Telechat model.

    Args:
        vocab_size (`int`, *optional*, defaults to 160256): Vocabulary size of the Telechat model.
        hidden_size (`int`, *optional*, defaults to 4096): Dimensionality of the embeddings and hidden states.
        ffn_hidden_size (`int`, *optional*, defaults to 12288): Dimensionality of the feed-forward hidden states.
        n_layer (`int`, *optional*, defaults to 30): Number of hidden layers in the Transformer.
        n_head (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): If enabled, use the layer norm of the hidden states as the residual in the transformer blocks.
        hidden_dropout (`float`, *optional*, defaults to 0.0): Dropout rate of the dropout function on the bias dropout.
        attention_dropout (`float`, *optional*, defaults to 0.0): Dropout rate applied to the attention probs.
        use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions.
        training_seqlen (`int`, *optional*, defaults to 8192): Sequence length during last finetuning.
        logn (`bool`, *optional*, defaults to `True`): Whether or not to use logN during extrapolation.
        embed_layernorm (`bool`, *optional*, defaults to `False`): Whether or not to use embedding layernorm.
    """

    model_type = "telechat"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }

    def __init__(
        self,
        vocab_size=160256,
        hidden_size=4096,
        n_layer=30,
        n_head=32,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        apply_residual_connection_post_layernorm=False,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        ffn_hidden_size=12288,
        training_seqlen=8192,
        logn=True,
        embed_layernorm=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        # "n_embed" is a legacy alias for hidden_size; it takes precedence when given.
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.logn = logn
        self.ffn_hidden_size = ffn_hidden_size
        self.training_seqlen = training_seqlen
        self.embed_layernorm = embed_layernorm
        # Grouped-query attention head count; None means "same as n_head"
        # (left unset for configs that predate GQA support).
        self.num_key_value_heads = kwargs.pop("num_key_value_heads", None)

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
generation_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_length": 8192,
|
3 |
+
"do_sample": false,
|
4 |
+
"use_cache": true,
|
5 |
+
"temperature": 0.3,
|
6 |
+
"top_k": 5,
|
7 |
+
"top_p": 0.85,
|
8 |
+
"repetition_penalty": 1.03,
|
9 |
+
"pad_token_id": 3,
|
10 |
+
"bos_token_id": 1,
|
11 |
+
"eos_token_id": 2,
|
12 |
+
"user_token_id": 4,
|
13 |
+
"bot_token_id": 5
|
14 |
+
}
|
generation_utils.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional
|
2 |
+
from collections import deque
|
3 |
+
from queue import Queue
|
4 |
+
import copy
|
5 |
+
|
6 |
+
|
7 |
+
class History:
    """Conversation history container.

    Wraps a deque of message dicts (each with at least a "content" key) and
    lazily attaches the tokenizer's encoding ("input_ids"/"attention_mask")
    to every message.
    """

    def __init__(self, tokenizer, history):
        """Initialize from a list of message dicts (may be empty or falsy)."""
        # deque gives O(1) push/pop at both ends (rolling context windows).
        self.input_history = deque()
        self.tokenizer = tokenizer
        if history:
            self._transfer_from_list(history)

    def _transfer_from_list(self, history):
        # NOTE: re-tokenizing "content" may not reproduce the exact ids the
        # model originally generated.
        for message in history:
            content = message.get("content")
            message.update(self.tokenizer(content))
            self.input_history.append(message)

    def append(self, message):
        """Append a message on the right, tokenizing it if not yet encoded."""
        content = message.get("content")
        if "input_ids" not in message or "attention_mask" not in message:
            message.update(self.tokenizer(content))
        self.input_history.append(message)

    def append_left(self, message):
        """Append a message on the left, tokenizing it if not yet encoded."""
        content = message.get("content")
        if "input_ids" not in message or "attention_mask" not in message:
            message.update(self.tokenizer(content))
        self.input_history.appendleft(message)

    def pop(self):
        """Remove and return the rightmost (most recent) message."""
        return self.input_history.pop()

    def pop_left(self):
        """Remove and return the leftmost (oldest) message."""
        # BUG FIX: deque's method is popleft(), not pop_left(); the original
        # call raised AttributeError every time this method ran.
        return self.input_history.popleft()

    def update(self, message):
        """Replace the most recent message with *message*."""
        self.input_history.pop()
        self.append(message)

    def __len__(self):
        return len(self.input_history)

    def __str__(self):
        return str(self.input_history)

    def __copy__(self):
        # Shallow copy: the deque is copied, the message dicts are shared.
        new_instance = type(self)(self.tokenizer, [])
        new_instance.input_history = copy.copy(self.input_history)
        return new_instance

    def __deepcopy__(self, memodict=None):
        # Use None instead of the original's shared mutable default {}.
        new_instance = type(self)(self.tokenizer, [])
        new_instance.input_history = copy.deepcopy(self.input_history, memodict)
        return new_instance
65 |
+
|
66 |
+
|
67 |
+
class TelechatIterTextStreamer:
    """
    Iterator-style text streamer for Telechat generation.

    With reference to the TextIterStreamer in transformers, we have rewritten
    this class: generated token ids are accumulated via put(), decoded, and
    the printable text deltas are pushed to a queue that callers consume by
    iterating over this object.
    """

    def __init__(
            self, tokenizer, history: "History" = None, skip_prompt: bool = False, timeout: Optional[float] = None,
            **decode_kwargs
    ):

        self.tokenizer = tokenizer
        self.history = history
        self.skip_prompt = skip_prompt
        self.timeout = timeout
        self.decode_kwargs = decode_kwargs

        self.text_queue = Queue()
        self.cache_time = 0           # consecutive put() calls held back as undecodable
        self.text_until = ""          # full decoded text emitted so far
        self.token_until = []         # all generated token ids so far
        self.stop_signal = None       # sentinel placed in the queue at stream end
        self.next_tokens_are_prompt = True

        # BUG FIX: the original unconditionally called self.history.append(...)
        # and crashed with AttributeError for the documented default history=None.
        if self.history is not None:
            self.history.append({"role": "bot", "content": self.text_until})

    def put(self, value):
        """
        Accumulate generated token ids from *value* and put printable text into
        the queue. *value* is a batch-size-1 tensor of token ids.
        """
        if len(value.shape) > 1 and value.shape[0] > 1:
            raise ValueError("TextStreamer only supports batch size 1")
        elif len(value.shape) > 1:
            value = value[0]

        if self.skip_prompt and self.next_tokens_are_prompt:
            self.next_tokens_are_prompt = False
            return

        if value[-1] == self.tokenizer.eos_token_id:
            return

        # there may be some smarter way to decode incrementally.
        self.token_until.extend(value.tolist())
        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)

        # Emit only when the text decodes cleanly, or after 6 buffered attempts
        # (avoids surfacing replacement chars from partial multi-byte tokens).
        if self._is_printable(text) or self.cache_time >= 6:
            output_text = text[len(self.text_until):]
            self.text_until = text
        else:
            self.cache_time += 1
            return

        self.on_finalized_text(output_text)

    def end(self):
        """Flushes any remaining cache and signals end of stream."""
        # Flush the cache, if it exists
        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)
        output_text = text[len(self.text_until):]
        self.text_until = text
        self.on_finalized_text(output_text, stream_end=True)
        self.clear_cache()

    def clear_cache(self):
        """Reset all streaming state."""
        self.cache_time = 0
        self.token_until = []
        self.text_until = ""
        self.history = None
        self.next_tokens_are_prompt = True

    def on_finalized_text(self, text: str, stream_end: bool = False):
        """Put the (text, history) tuple in the queue."""
        # Guarded: history may legitimately be None (constructor default).
        if self.history is not None:
            self.history.update({"role": "bot", "content": self.text_until, "input_ids": self.token_until,
                                 "attention_mask": [1] * len(self.token_until)})
        self.text_queue.put((text, self.history), timeout=self.timeout)
        if stream_end:
            self.text_queue.put((self.stop_signal, self.history), timeout=self.timeout)

    @staticmethod
    def _is_printable(cp):
        """Checks whether tokens decoded cleanly (no U+FFFD replacement char)."""
        return "�" not in cp

    def __iter__(self):
        return self

    def __next__(self):
        value_now, history_until = self.text_queue.get(timeout=self.timeout)
        if value_now == self.stop_signal:
            raise StopIteration()
        return value_now, history_until
model-00001-of-00050.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2921b909cc707a1721cafc00c2e1561be50c38167e23dbca60b5ea572d5002d1
|
3 |
+
size 335676680
|
model-00002-of-00050.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5ed55aa2b919c1ba154295d40dfde172a24082d6ddd620e0dae839f937a17c2
|
3 |
+
size 671353120
|
model-00003-of-00050.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b246d4f08d7596bc12fabe3b1e1549c65e3ea4adfa899d5e8c9c2025cdddb41
|
3 |
+
size 671353120
|
model-00004-of-00050.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1dc197e9f73d62796c0178ca6dbcc8dddbd14d12099cbadb4ace4fc721fbf5b
|
3 |
+
size 671353120
|
model-00005-of-00050.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c3a0961a9ee82913de040c575e72d87596ed30fe197a56815c4a7fdb33874307
|
3 |
+
size 671353120
|
model.safetensors.index.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata": {"total_size": 453131665408}, "weight_map": {"transformer.word_embeddings.weight": "model-00001-of-00050.safetensors", "transformer.h.0.input_layernorm.weight": "model-00001-of-00050.safetensors", "transformer.h.0.post_attention_layernorm.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.dense.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.dense.bias": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.query.weight": "model-00001-of-00050.safetensors", "transformer.h.0.self_attention.key_value.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.gate_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.down_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.down_proj.bias": "model-00001-of-00050.safetensors", "transformer.h.0.mlp.up_proj.weight": "model-00001-of-00050.safetensors", "transformer.h.1.input_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.1.post_attention_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.dense.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.dense.bias": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.query.weight": "model-00002-of-00050.safetensors", "transformer.h.1.self_attention.key_value.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.gate_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.down_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.down_proj.bias": "model-00002-of-00050.safetensors", "transformer.h.1.mlp.up_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.input_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.2.post_attention_layernorm.weight": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.dense.weight": "model-00002-of-00050.safetensors", 
"transformer.h.2.self_attention.dense.bias": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.query.weight": "model-00002-of-00050.safetensors", "transformer.h.2.self_attention.key_value.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.gate_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.down_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.down_proj.bias": "model-00002-of-00050.safetensors", "transformer.h.2.mlp.up_proj.weight": "model-00002-of-00050.safetensors", "transformer.h.3.input_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.3.post_attention_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.dense.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.dense.bias": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.query.weight": "model-00003-of-00050.safetensors", "transformer.h.3.self_attention.key_value.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.gate_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.down_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.down_proj.bias": "model-00003-of-00050.safetensors", "transformer.h.3.mlp.up_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.4.input_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.4.post_attention_layernorm.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.dense.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.dense.bias": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.query.weight": "model-00003-of-00050.safetensors", "transformer.h.4.self_attention.key_value.weight": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.gate_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.down_proj.weight": 
"model-00003-of-00050.safetensors", "transformer.h.4.mlp.down_proj.bias": "model-00003-of-00050.safetensors", "transformer.h.4.mlp.up_proj.weight": "model-00003-of-00050.safetensors", "transformer.h.5.input_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.5.post_attention_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.dense.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.dense.bias": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.query.weight": "model-00004-of-00050.safetensors", "transformer.h.5.self_attention.key_value.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.gate_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.down_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.down_proj.bias": "model-00004-of-00050.safetensors", "transformer.h.5.mlp.up_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.input_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.6.post_attention_layernorm.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.dense.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.dense.bias": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.query.weight": "model-00004-of-00050.safetensors", "transformer.h.6.self_attention.key_value.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.gate_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.down_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.down_proj.bias": "model-00004-of-00050.safetensors", "transformer.h.6.mlp.up_proj.weight": "model-00004-of-00050.safetensors", "transformer.h.7.input_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.7.post_attention_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.dense.weight": 
"model-00005-of-00050.safetensors", "transformer.h.7.self_attention.dense.bias": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.query.weight": "model-00005-of-00050.safetensors", "transformer.h.7.self_attention.key_value.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.gate_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.down_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.down_proj.bias": "model-00005-of-00050.safetensors", "transformer.h.7.mlp.up_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.input_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.8.post_attention_layernorm.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.dense.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.dense.bias": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.query.weight": "model-00005-of-00050.safetensors", "transformer.h.8.self_attention.key_value.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.gate_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.down_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.down_proj.bias": "model-00005-of-00050.safetensors", "transformer.h.8.mlp.up_proj.weight": "model-00005-of-00050.safetensors", "transformer.h.9.input_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.9.post_attention_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.dense.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.dense.bias": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.query.weight": "model-00006-of-00050.safetensors", "transformer.h.9.self_attention.key_value.weight": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.gate_proj.weight": "model-00006-of-00050.safetensors", 
"transformer.h.9.mlp.down_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.down_proj.bias": "model-00006-of-00050.safetensors", "transformer.h.9.mlp.up_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.input_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.10.post_attention_layernorm.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.dense.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.dense.bias": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.query.weight": "model-00006-of-00050.safetensors", "transformer.h.10.self_attention.key_value.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.gate_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.down_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.down_proj.bias": "model-00006-of-00050.safetensors", "transformer.h.10.mlp.up_proj.weight": "model-00006-of-00050.safetensors", "transformer.h.11.input_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.11.post_attention_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.dense.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.dense.bias": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.query.weight": "model-00007-of-00050.safetensors", "transformer.h.11.self_attention.key_value.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.gate_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.down_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.down_proj.bias": "model-00007-of-00050.safetensors", "transformer.h.11.mlp.up_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.input_layernorm.weight": "model-00007-of-00050.safetensors", "transformer.h.12.post_attention_layernorm.weight": 
"model-00007-of-00050.safetensors", "transformer.h.12.self_attention.dense.weight": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.dense.bias": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.query.weight": "model-00007-of-00050.safetensors", "transformer.h.12.self_attention.key_value.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.gate_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.down_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.down_proj.bias": "model-00007-of-00050.safetensors", "transformer.h.12.mlp.up_proj.weight": "model-00007-of-00050.safetensors", "transformer.h.13.input_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.13.post_attention_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.dense.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.dense.bias": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.query.weight": "model-00008-of-00050.safetensors", "transformer.h.13.self_attention.key_value.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.gate_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.down_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.down_proj.bias": "model-00008-of-00050.safetensors", "transformer.h.13.mlp.up_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.input_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.14.post_attention_layernorm.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.dense.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.dense.bias": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.query.weight": "model-00008-of-00050.safetensors", "transformer.h.14.self_attention.key_value.weight": "model-00008-of-00050.safetensors", 
"transformer.h.14.mlp.gate_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.down_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.down_proj.bias": "model-00008-of-00050.safetensors", "transformer.h.14.mlp.up_proj.weight": "model-00008-of-00050.safetensors", "transformer.h.15.input_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.15.post_attention_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.dense.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.dense.bias": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.query.weight": "model-00009-of-00050.safetensors", "transformer.h.15.self_attention.key_value.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.gate_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.down_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.down_proj.bias": "model-00009-of-00050.safetensors", "transformer.h.15.mlp.up_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.input_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.16.post_attention_layernorm.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.dense.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.dense.bias": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.query.weight": "model-00009-of-00050.safetensors", "transformer.h.16.self_attention.key_value.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.gate_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.down_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.down_proj.bias": "model-00009-of-00050.safetensors", "transformer.h.16.mlp.up_proj.weight": "model-00009-of-00050.safetensors", "transformer.h.17.input_layernorm.weight": 
"model-00010-of-00050.safetensors", "transformer.h.17.post_attention_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.dense.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.dense.bias": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.query.weight": "model-00010-of-00050.safetensors", "transformer.h.17.self_attention.key_value.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.gate_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.down_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.down_proj.bias": "model-00010-of-00050.safetensors", "transformer.h.17.mlp.up_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.input_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.18.post_attention_layernorm.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.dense.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.dense.bias": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.query.weight": "model-00010-of-00050.safetensors", "transformer.h.18.self_attention.key_value.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.gate_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.down_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.down_proj.bias": "model-00010-of-00050.safetensors", "transformer.h.18.mlp.up_proj.weight": "model-00010-of-00050.safetensors", "transformer.h.19.input_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.19.post_attention_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.dense.weight": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.dense.bias": "model-00011-of-00050.safetensors", "transformer.h.19.self_attention.query.weight": "model-00011-of-00050.safetensors", 
"transformer.h.19.self_attention.key_value.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.gate_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.down_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.down_proj.bias": "model-00011-of-00050.safetensors", "transformer.h.19.mlp.up_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.input_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.20.post_attention_layernorm.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.dense.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.dense.bias": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.query.weight": "model-00011-of-00050.safetensors", "transformer.h.20.self_attention.key_value.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.gate_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.down_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.down_proj.bias": "model-00011-of-00050.safetensors", "transformer.h.20.mlp.up_proj.weight": "model-00011-of-00050.safetensors", "transformer.h.21.input_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.21.post_attention_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.dense.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.dense.bias": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.query.weight": "model-00012-of-00050.safetensors", "transformer.h.21.self_attention.key_value.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.gate_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.down_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.down_proj.bias": "model-00012-of-00050.safetensors", "transformer.h.21.mlp.up_proj.weight": 
"model-00012-of-00050.safetensors", "transformer.h.22.input_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.22.post_attention_layernorm.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.dense.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.dense.bias": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.query.weight": "model-00012-of-00050.safetensors", "transformer.h.22.self_attention.key_value.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.gate_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.down_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.down_proj.bias": "model-00012-of-00050.safetensors", "transformer.h.22.mlp.up_proj.weight": "model-00012-of-00050.safetensors", "transformer.h.23.input_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.23.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.dense.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.dense.bias": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.query.weight": "model-00013-of-00050.safetensors", "transformer.h.23.self_attention.key_value.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.gate_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.down_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.down_proj.bias": "model-00013-of-00050.safetensors", "transformer.h.23.mlp.up_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.input_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.24.post_attention_layernorm.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.dense.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.dense.bias": "model-00013-of-00050.safetensors", 
"transformer.h.24.self_attention.query.weight": "model-00013-of-00050.safetensors", "transformer.h.24.self_attention.key_value.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.gate_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.down_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.down_proj.bias": "model-00013-of-00050.safetensors", "transformer.h.24.mlp.up_proj.weight": "model-00013-of-00050.safetensors", "transformer.h.25.input_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.25.post_attention_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.dense.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.dense.bias": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.query.weight": "model-00014-of-00050.safetensors", "transformer.h.25.self_attention.key_value.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.gate_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.down_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.down_proj.bias": "model-00014-of-00050.safetensors", "transformer.h.25.mlp.up_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.input_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.26.post_attention_layernorm.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.dense.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.dense.bias": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.query.weight": "model-00014-of-00050.safetensors", "transformer.h.26.self_attention.key_value.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.gate_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.down_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.26.mlp.down_proj.bias": 
"model-00014-of-00050.safetensors", "transformer.h.26.mlp.up_proj.weight": "model-00014-of-00050.safetensors", "transformer.h.27.input_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.27.post_attention_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.dense.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.dense.bias": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.query.weight": "model-00015-of-00050.safetensors", "transformer.h.27.self_attention.key_value.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.gate_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.down_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.down_proj.bias": "model-00015-of-00050.safetensors", "transformer.h.27.mlp.up_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.input_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.28.post_attention_layernorm.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.dense.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.dense.bias": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.query.weight": "model-00015-of-00050.safetensors", "transformer.h.28.self_attention.key_value.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.gate_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.down_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.down_proj.bias": "model-00015-of-00050.safetensors", "transformer.h.28.mlp.up_proj.weight": "model-00015-of-00050.safetensors", "transformer.h.29.input_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.29.post_attention_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.dense.weight": "model-00016-of-00050.safetensors", 
"transformer.h.29.self_attention.dense.bias": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.query.weight": "model-00016-of-00050.safetensors", "transformer.h.29.self_attention.key_value.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.gate_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.down_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.down_proj.bias": "model-00016-of-00050.safetensors", "transformer.h.29.mlp.up_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.input_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.30.post_attention_layernorm.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.dense.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.dense.bias": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.query.weight": "model-00016-of-00050.safetensors", "transformer.h.30.self_attention.key_value.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.gate_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.down_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.down_proj.bias": "model-00016-of-00050.safetensors", "transformer.h.30.mlp.up_proj.weight": "model-00016-of-00050.safetensors", "transformer.h.31.input_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.31.post_attention_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.dense.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.dense.bias": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.query.weight": "model-00017-of-00050.safetensors", "transformer.h.31.self_attention.key_value.weight": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.gate_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.down_proj.weight": 
"model-00017-of-00050.safetensors", "transformer.h.31.mlp.down_proj.bias": "model-00017-of-00050.safetensors", "transformer.h.31.mlp.up_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.input_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.32.post_attention_layernorm.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.dense.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.dense.bias": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.query.weight": "model-00017-of-00050.safetensors", "transformer.h.32.self_attention.key_value.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.gate_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.down_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.down_proj.bias": "model-00017-of-00050.safetensors", "transformer.h.32.mlp.up_proj.weight": "model-00017-of-00050.safetensors", "transformer.h.33.input_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.33.post_attention_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.dense.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.dense.bias": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.query.weight": "model-00018-of-00050.safetensors", "transformer.h.33.self_attention.key_value.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.gate_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.down_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.down_proj.bias": "model-00018-of-00050.safetensors", "transformer.h.33.mlp.up_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.input_layernorm.weight": "model-00018-of-00050.safetensors", "transformer.h.34.post_attention_layernorm.weight": "model-00018-of-00050.safetensors", 
"transformer.h.34.self_attention.dense.weight": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.dense.bias": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.query.weight": "model-00018-of-00050.safetensors", "transformer.h.34.self_attention.key_value.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.gate_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.down_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.down_proj.bias": "model-00018-of-00050.safetensors", "transformer.h.34.mlp.up_proj.weight": "model-00018-of-00050.safetensors", "transformer.h.35.input_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.35.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.dense.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.dense.bias": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.query.weight": "model-00019-of-00050.safetensors", "transformer.h.35.self_attention.key_value.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.gate_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.down_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.down_proj.bias": "model-00019-of-00050.safetensors", "transformer.h.35.mlp.up_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.36.input_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.36.post_attention_layernorm.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.dense.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.dense.bias": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.query.weight": "model-00019-of-00050.safetensors", "transformer.h.36.self_attention.key_value.weight": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.gate_proj.weight": 
"model-00019-of-00050.safetensors", "transformer.h.36.mlp.down_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.down_proj.bias": "model-00019-of-00050.safetensors", "transformer.h.36.mlp.up_proj.weight": "model-00019-of-00050.safetensors", "transformer.h.37.input_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.37.post_attention_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.dense.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.dense.bias": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.query.weight": "model-00020-of-00050.safetensors", "transformer.h.37.self_attention.key_value.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.gate_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.down_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.down_proj.bias": "model-00020-of-00050.safetensors", "transformer.h.37.mlp.up_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.input_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.38.post_attention_layernorm.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.dense.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.dense.bias": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.query.weight": "model-00020-of-00050.safetensors", "transformer.h.38.self_attention.key_value.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.gate_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.down_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.down_proj.bias": "model-00020-of-00050.safetensors", "transformer.h.38.mlp.up_proj.weight": "model-00020-of-00050.safetensors", "transformer.h.39.input_layernorm.weight": "model-00021-of-00050.safetensors", 
"transformer.h.39.post_attention_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.dense.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.dense.bias": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.query.weight": "model-00021-of-00050.safetensors", "transformer.h.39.self_attention.key_value.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.gate_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.down_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.down_proj.bias": "model-00021-of-00050.safetensors", "transformer.h.39.mlp.up_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.input_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.40.post_attention_layernorm.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.dense.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.dense.bias": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.query.weight": "model-00021-of-00050.safetensors", "transformer.h.40.self_attention.key_value.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.gate_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.down_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.down_proj.bias": "model-00021-of-00050.safetensors", "transformer.h.40.mlp.up_proj.weight": "model-00021-of-00050.safetensors", "transformer.h.41.input_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.41.post_attention_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.dense.weight": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.dense.bias": "model-00022-of-00050.safetensors", "transformer.h.41.self_attention.query.weight": "model-00022-of-00050.safetensors", 
"transformer.h.41.self_attention.key_value.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.gate_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.down_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.down_proj.bias": "model-00022-of-00050.safetensors", "transformer.h.41.mlp.up_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.input_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.42.post_attention_layernorm.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.dense.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.dense.bias": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.query.weight": "model-00022-of-00050.safetensors", "transformer.h.42.self_attention.key_value.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.gate_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.down_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.down_proj.bias": "model-00022-of-00050.safetensors", "transformer.h.42.mlp.up_proj.weight": "model-00022-of-00050.safetensors", "transformer.h.43.input_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.43.post_attention_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.dense.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.dense.bias": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.query.weight": "model-00023-of-00050.safetensors", "transformer.h.43.self_attention.key_value.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.gate_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.down_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.down_proj.bias": "model-00023-of-00050.safetensors", "transformer.h.43.mlp.up_proj.weight": 
"model-00023-of-00050.safetensors", "transformer.h.44.input_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.44.post_attention_layernorm.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.dense.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.dense.bias": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.query.weight": "model-00023-of-00050.safetensors", "transformer.h.44.self_attention.key_value.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.gate_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.down_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.down_proj.bias": "model-00023-of-00050.safetensors", "transformer.h.44.mlp.up_proj.weight": "model-00023-of-00050.safetensors", "transformer.h.45.input_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.45.post_attention_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.dense.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.dense.bias": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.query.weight": "model-00024-of-00050.safetensors", "transformer.h.45.self_attention.key_value.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.gate_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.down_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.down_proj.bias": "model-00024-of-00050.safetensors", "transformer.h.45.mlp.up_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.input_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.46.post_attention_layernorm.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.dense.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.dense.bias": "model-00024-of-00050.safetensors", 
"transformer.h.46.self_attention.query.weight": "model-00024-of-00050.safetensors", "transformer.h.46.self_attention.key_value.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.gate_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.down_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.down_proj.bias": "model-00024-of-00050.safetensors", "transformer.h.46.mlp.up_proj.weight": "model-00024-of-00050.safetensors", "transformer.h.47.input_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.47.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.dense.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.dense.bias": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.query.weight": "model-00025-of-00050.safetensors", "transformer.h.47.self_attention.key_value.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.gate_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.down_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.down_proj.bias": "model-00025-of-00050.safetensors", "transformer.h.47.mlp.up_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.input_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.48.post_attention_layernorm.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.dense.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.dense.bias": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.query.weight": "model-00025-of-00050.safetensors", "transformer.h.48.self_attention.key_value.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.gate_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.down_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.48.mlp.down_proj.bias": 
"model-00025-of-00050.safetensors", "transformer.h.48.mlp.up_proj.weight": "model-00025-of-00050.safetensors", "transformer.h.49.input_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.49.post_attention_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.dense.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.dense.bias": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.query.weight": "model-00026-of-00050.safetensors", "transformer.h.49.self_attention.key_value.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.gate_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.down_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.down_proj.bias": "model-00026-of-00050.safetensors", "transformer.h.49.mlp.up_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.input_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.50.post_attention_layernorm.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.dense.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.dense.bias": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.query.weight": "model-00026-of-00050.safetensors", "transformer.h.50.self_attention.key_value.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.gate_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.down_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.down_proj.bias": "model-00026-of-00050.safetensors", "transformer.h.50.mlp.up_proj.weight": "model-00026-of-00050.safetensors", "transformer.h.51.input_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.51.post_attention_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.dense.weight": "model-00027-of-00050.safetensors", 
"transformer.h.51.self_attention.dense.bias": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.query.weight": "model-00027-of-00050.safetensors", "transformer.h.51.self_attention.key_value.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.gate_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.down_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.down_proj.bias": "model-00027-of-00050.safetensors", "transformer.h.51.mlp.up_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.input_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.52.post_attention_layernorm.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.dense.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.dense.bias": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.query.weight": "model-00027-of-00050.safetensors", "transformer.h.52.self_attention.key_value.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.gate_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.down_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.down_proj.bias": "model-00027-of-00050.safetensors", "transformer.h.52.mlp.up_proj.weight": "model-00027-of-00050.safetensors", "transformer.h.53.input_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.53.post_attention_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.dense.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.dense.bias": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.query.weight": "model-00028-of-00050.safetensors", "transformer.h.53.self_attention.key_value.weight": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.gate_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.down_proj.weight": 
"model-00028-of-00050.safetensors", "transformer.h.53.mlp.down_proj.bias": "model-00028-of-00050.safetensors", "transformer.h.53.mlp.up_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.input_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.54.post_attention_layernorm.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.dense.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.dense.bias": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.query.weight": "model-00028-of-00050.safetensors", "transformer.h.54.self_attention.key_value.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.gate_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.down_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.down_proj.bias": "model-00028-of-00050.safetensors", "transformer.h.54.mlp.up_proj.weight": "model-00028-of-00050.safetensors", "transformer.h.55.input_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.55.post_attention_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.dense.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.dense.bias": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.query.weight": "model-00029-of-00050.safetensors", "transformer.h.55.self_attention.key_value.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.gate_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.down_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.down_proj.bias": "model-00029-of-00050.safetensors", "transformer.h.55.mlp.up_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.input_layernorm.weight": "model-00029-of-00050.safetensors", "transformer.h.56.post_attention_layernorm.weight": "model-00029-of-00050.safetensors", 
"transformer.h.56.self_attention.dense.weight": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.dense.bias": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.query.weight": "model-00029-of-00050.safetensors", "transformer.h.56.self_attention.key_value.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.gate_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.down_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.down_proj.bias": "model-00029-of-00050.safetensors", "transformer.h.56.mlp.up_proj.weight": "model-00029-of-00050.safetensors", "transformer.h.57.input_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.57.post_attention_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.dense.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.dense.bias": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.query.weight": "model-00030-of-00050.safetensors", "transformer.h.57.self_attention.key_value.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.gate_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.down_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.down_proj.bias": "model-00030-of-00050.safetensors", "transformer.h.57.mlp.up_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.58.input_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.58.post_attention_layernorm.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.dense.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.dense.bias": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.query.weight": "model-00030-of-00050.safetensors", "transformer.h.58.self_attention.key_value.weight": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.gate_proj.weight": 
"model-00030-of-00050.safetensors", "transformer.h.58.mlp.down_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.down_proj.bias": "model-00030-of-00050.safetensors", "transformer.h.58.mlp.up_proj.weight": "model-00030-of-00050.safetensors", "transformer.h.59.input_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.59.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.dense.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.dense.bias": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.query.weight": "model-00031-of-00050.safetensors", "transformer.h.59.self_attention.key_value.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.gate_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.down_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.down_proj.bias": "model-00031-of-00050.safetensors", "transformer.h.59.mlp.up_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.input_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.60.post_attention_layernorm.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.dense.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.dense.bias": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.query.weight": "model-00031-of-00050.safetensors", "transformer.h.60.self_attention.key_value.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.gate_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.down_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.down_proj.bias": "model-00031-of-00050.safetensors", "transformer.h.60.mlp.up_proj.weight": "model-00031-of-00050.safetensors", "transformer.h.61.input_layernorm.weight": "model-00032-of-00050.safetensors", 
"transformer.h.61.post_attention_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.dense.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.dense.bias": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.query.weight": "model-00032-of-00050.safetensors", "transformer.h.61.self_attention.key_value.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.gate_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.down_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.down_proj.bias": "model-00032-of-00050.safetensors", "transformer.h.61.mlp.up_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.input_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.62.post_attention_layernorm.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.dense.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.dense.bias": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.query.weight": "model-00032-of-00050.safetensors", "transformer.h.62.self_attention.key_value.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.gate_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.down_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.down_proj.bias": "model-00032-of-00050.safetensors", "transformer.h.62.mlp.up_proj.weight": "model-00032-of-00050.safetensors", "transformer.h.63.input_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.63.post_attention_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.dense.weight": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.dense.bias": "model-00033-of-00050.safetensors", "transformer.h.63.self_attention.query.weight": "model-00033-of-00050.safetensors", 
"transformer.h.63.self_attention.key_value.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.gate_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.down_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.down_proj.bias": "model-00033-of-00050.safetensors", "transformer.h.63.mlp.up_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.input_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.64.post_attention_layernorm.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.dense.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.dense.bias": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.query.weight": "model-00033-of-00050.safetensors", "transformer.h.64.self_attention.key_value.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.gate_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.down_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.down_proj.bias": "model-00033-of-00050.safetensors", "transformer.h.64.mlp.up_proj.weight": "model-00033-of-00050.safetensors", "transformer.h.65.input_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.65.post_attention_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.dense.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.dense.bias": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.query.weight": "model-00034-of-00050.safetensors", "transformer.h.65.self_attention.key_value.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.gate_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.down_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.down_proj.bias": "model-00034-of-00050.safetensors", "transformer.h.65.mlp.up_proj.weight": 
"model-00034-of-00050.safetensors", "transformer.h.66.input_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.66.post_attention_layernorm.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.dense.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.dense.bias": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.query.weight": "model-00034-of-00050.safetensors", "transformer.h.66.self_attention.key_value.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.gate_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.down_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.down_proj.bias": "model-00034-of-00050.safetensors", "transformer.h.66.mlp.up_proj.weight": "model-00034-of-00050.safetensors", "transformer.h.67.input_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.67.post_attention_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.dense.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.dense.bias": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.query.weight": "model-00035-of-00050.safetensors", "transformer.h.67.self_attention.key_value.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.gate_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.down_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.down_proj.bias": "model-00035-of-00050.safetensors", "transformer.h.67.mlp.up_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.input_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.68.post_attention_layernorm.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.dense.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.dense.bias": "model-00035-of-00050.safetensors", 
"transformer.h.68.self_attention.query.weight": "model-00035-of-00050.safetensors", "transformer.h.68.self_attention.key_value.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.gate_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.down_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.down_proj.bias": "model-00035-of-00050.safetensors", "transformer.h.68.mlp.up_proj.weight": "model-00035-of-00050.safetensors", "transformer.h.69.input_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.69.post_attention_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.dense.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.dense.bias": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.query.weight": "model-00036-of-00050.safetensors", "transformer.h.69.self_attention.key_value.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.gate_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.down_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.down_proj.bias": "model-00036-of-00050.safetensors", "transformer.h.69.mlp.up_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.input_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.70.post_attention_layernorm.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.dense.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.dense.bias": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.query.weight": "model-00036-of-00050.safetensors", "transformer.h.70.self_attention.key_value.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.gate_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.down_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.70.mlp.down_proj.bias": 
"model-00036-of-00050.safetensors", "transformer.h.70.mlp.up_proj.weight": "model-00036-of-00050.safetensors", "transformer.h.71.input_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.71.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.dense.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.dense.bias": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.query.weight": "model-00037-of-00050.safetensors", "transformer.h.71.self_attention.key_value.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.gate_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.down_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.down_proj.bias": "model-00037-of-00050.safetensors", "transformer.h.71.mlp.up_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.input_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.72.post_attention_layernorm.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.dense.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.dense.bias": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.query.weight": "model-00037-of-00050.safetensors", "transformer.h.72.self_attention.key_value.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.gate_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.down_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.down_proj.bias": "model-00037-of-00050.safetensors", "transformer.h.72.mlp.up_proj.weight": "model-00037-of-00050.safetensors", "transformer.h.73.input_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.73.post_attention_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.dense.weight": "model-00038-of-00050.safetensors", 
"transformer.h.73.self_attention.dense.bias": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.query.weight": "model-00038-of-00050.safetensors", "transformer.h.73.self_attention.key_value.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.gate_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.down_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.down_proj.bias": "model-00038-of-00050.safetensors", "transformer.h.73.mlp.up_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.input_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.74.post_attention_layernorm.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.dense.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.dense.bias": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.query.weight": "model-00038-of-00050.safetensors", "transformer.h.74.self_attention.key_value.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.gate_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.down_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.down_proj.bias": "model-00038-of-00050.safetensors", "transformer.h.74.mlp.up_proj.weight": "model-00038-of-00050.safetensors", "transformer.h.75.input_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.75.post_attention_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.dense.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.dense.bias": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.query.weight": "model-00039-of-00050.safetensors", "transformer.h.75.self_attention.key_value.weight": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.gate_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.down_proj.weight": 
"model-00039-of-00050.safetensors", "transformer.h.75.mlp.down_proj.bias": "model-00039-of-00050.safetensors", "transformer.h.75.mlp.up_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.input_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.76.post_attention_layernorm.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.dense.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.dense.bias": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.query.weight": "model-00039-of-00050.safetensors", "transformer.h.76.self_attention.key_value.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.gate_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.down_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.down_proj.bias": "model-00039-of-00050.safetensors", "transformer.h.76.mlp.up_proj.weight": "model-00039-of-00050.safetensors", "transformer.h.77.input_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.77.post_attention_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.dense.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.dense.bias": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.query.weight": "model-00040-of-00050.safetensors", "transformer.h.77.self_attention.key_value.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.gate_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.down_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.down_proj.bias": "model-00040-of-00050.safetensors", "transformer.h.77.mlp.up_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.input_layernorm.weight": "model-00040-of-00050.safetensors", "transformer.h.78.post_attention_layernorm.weight": "model-00040-of-00050.safetensors", 
"transformer.h.78.self_attention.dense.weight": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.dense.bias": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.query.weight": "model-00040-of-00050.safetensors", "transformer.h.78.self_attention.key_value.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.gate_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.down_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.down_proj.bias": "model-00040-of-00050.safetensors", "transformer.h.78.mlp.up_proj.weight": "model-00040-of-00050.safetensors", "transformer.h.79.input_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.79.post_attention_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.dense.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.dense.bias": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.query.weight": "model-00041-of-00050.safetensors", "transformer.h.79.self_attention.key_value.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.gate_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.down_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.down_proj.bias": "model-00041-of-00050.safetensors", "transformer.h.79.mlp.up_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.80.input_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.80.post_attention_layernorm.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.dense.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.dense.bias": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.query.weight": "model-00041-of-00050.safetensors", "transformer.h.80.self_attention.key_value.weight": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.gate_proj.weight": 
"model-00041-of-00050.safetensors", "transformer.h.80.mlp.down_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.down_proj.bias": "model-00041-of-00050.safetensors", "transformer.h.80.mlp.up_proj.weight": "model-00041-of-00050.safetensors", "transformer.h.81.input_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.81.post_attention_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.dense.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.dense.bias": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.query.weight": "model-00042-of-00050.safetensors", "transformer.h.81.self_attention.key_value.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.gate_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.down_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.down_proj.bias": "model-00042-of-00050.safetensors", "transformer.h.81.mlp.up_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.input_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.82.post_attention_layernorm.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.dense.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.dense.bias": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.query.weight": "model-00042-of-00050.safetensors", "transformer.h.82.self_attention.key_value.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.gate_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.down_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.down_proj.bias": "model-00042-of-00050.safetensors", "transformer.h.82.mlp.up_proj.weight": "model-00042-of-00050.safetensors", "transformer.h.83.input_layernorm.weight": "model-00043-of-00050.safetensors", 
"transformer.h.83.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.dense.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.dense.bias": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.query.weight": "model-00043-of-00050.safetensors", "transformer.h.83.self_attention.key_value.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.gate_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.down_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.down_proj.bias": "model-00043-of-00050.safetensors", "transformer.h.83.mlp.up_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.input_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.84.post_attention_layernorm.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.dense.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.dense.bias": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.query.weight": "model-00043-of-00050.safetensors", "transformer.h.84.self_attention.key_value.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.gate_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.down_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.down_proj.bias": "model-00043-of-00050.safetensors", "transformer.h.84.mlp.up_proj.weight": "model-00043-of-00050.safetensors", "transformer.h.85.input_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.85.post_attention_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.dense.weight": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.dense.bias": "model-00044-of-00050.safetensors", "transformer.h.85.self_attention.query.weight": "model-00044-of-00050.safetensors", 
"transformer.h.85.self_attention.key_value.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.gate_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.down_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.down_proj.bias": "model-00044-of-00050.safetensors", "transformer.h.85.mlp.up_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.input_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.86.post_attention_layernorm.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.dense.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.dense.bias": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.query.weight": "model-00044-of-00050.safetensors", "transformer.h.86.self_attention.key_value.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.gate_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.down_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.down_proj.bias": "model-00044-of-00050.safetensors", "transformer.h.86.mlp.up_proj.weight": "model-00044-of-00050.safetensors", "transformer.h.87.input_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.87.post_attention_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.dense.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.dense.bias": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.query.weight": "model-00045-of-00050.safetensors", "transformer.h.87.self_attention.key_value.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.gate_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.down_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.down_proj.bias": "model-00045-of-00050.safetensors", "transformer.h.87.mlp.up_proj.weight": 
"model-00045-of-00050.safetensors", "transformer.h.88.input_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.88.post_attention_layernorm.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.dense.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.dense.bias": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.query.weight": "model-00045-of-00050.safetensors", "transformer.h.88.self_attention.key_value.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.gate_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.down_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.down_proj.bias": "model-00045-of-00050.safetensors", "transformer.h.88.mlp.up_proj.weight": "model-00045-of-00050.safetensors", "transformer.h.89.input_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.89.post_attention_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.dense.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.dense.bias": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.query.weight": "model-00046-of-00050.safetensors", "transformer.h.89.self_attention.key_value.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.gate_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.down_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.down_proj.bias": "model-00046-of-00050.safetensors", "transformer.h.89.mlp.up_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.input_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.90.post_attention_layernorm.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.dense.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.dense.bias": "model-00046-of-00050.safetensors", 
"transformer.h.90.self_attention.query.weight": "model-00046-of-00050.safetensors", "transformer.h.90.self_attention.key_value.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.gate_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.down_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.down_proj.bias": "model-00046-of-00050.safetensors", "transformer.h.90.mlp.up_proj.weight": "model-00046-of-00050.safetensors", "transformer.h.91.input_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.91.post_attention_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.dense.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.dense.bias": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.query.weight": "model-00047-of-00050.safetensors", "transformer.h.91.self_attention.key_value.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.gate_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.down_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.down_proj.bias": "model-00047-of-00050.safetensors", "transformer.h.91.mlp.up_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.input_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.92.post_attention_layernorm.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.dense.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.dense.bias": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.query.weight": "model-00047-of-00050.safetensors", "transformer.h.92.self_attention.key_value.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.gate_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.down_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.92.mlp.down_proj.bias": 
"model-00047-of-00050.safetensors", "transformer.h.92.mlp.up_proj.weight": "model-00047-of-00050.safetensors", "transformer.h.93.input_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.93.post_attention_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.dense.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.dense.bias": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.query.weight": "model-00048-of-00050.safetensors", "transformer.h.93.self_attention.key_value.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.gate_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.down_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.down_proj.bias": "model-00048-of-00050.safetensors", "transformer.h.93.mlp.up_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.input_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.94.post_attention_layernorm.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.dense.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.dense.bias": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.query.weight": "model-00048-of-00050.safetensors", "transformer.h.94.self_attention.key_value.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.gate_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.down_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.down_proj.bias": "model-00048-of-00050.safetensors", "transformer.h.94.mlp.up_proj.weight": "model-00048-of-00050.safetensors", "transformer.h.95.input_layernorm.weight": "model-00049-of-00050.safetensors", "transformer.h.95.post_attention_layernorm.weight": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.dense.weight": "model-00049-of-00050.safetensors", 
"transformer.h.95.self_attention.dense.bias": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.query.weight": "model-00049-of-00050.safetensors", "transformer.h.95.self_attention.key_value.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.gate_proj.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.down_proj.weight": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.down_proj.bias": "model-00049-of-00050.safetensors", "transformer.h.95.mlp.up_proj.weight": "model-00049-of-00050.safetensors", "transformer.ln_f.weight": "model-00050-of-00050.safetensors", "lm_head.weight": "model-00050-of-00050.safetensors"}}
|