Daniel Hesslow committed on
Commit
56dde4b
1 Parent(s): 6064d60

Upload RWForCausalLM

Browse files
modelling_RW.py CHANGED
@@ -363,14 +363,12 @@ class DecoderLayer(nn.Module):
363
  super().__init__()
364
  hidden_size = config.hidden_size
365
 
366
- self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
 
 
367
  self.num_heads = config.n_head
368
  self.self_attention = Attention(config)
369
 
370
- if not config.parallel_attn:
371
- # unused if parallel attn
372
- self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
373
-
374
  self.mlp = MLP(config)
375
 
376
  self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
@@ -389,12 +387,14 @@ class DecoderLayer(nn.Module):
389
  output_attentions: bool = False,
390
  ):
391
 
392
- layernorm_output = self.input_layernorm(hidden_states)
 
 
393
  residual = hidden_states
394
 
395
  # Self attention.
396
  attn_outputs = self.self_attention(
397
- layernorm_output,
398
  layer_past=layer_past,
399
  attention_mask=attention_mask,
400
  alibi=alibi,
@@ -405,19 +405,14 @@ class DecoderLayer(nn.Module):
405
 
406
  attention_output = attn_outputs[0]
407
 
408
- if not self.config.parallel_attn:
409
- residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training)
410
- layernorm_output = self.post_attention_layernorm(residual)
411
-
412
  outputs = attn_outputs[1:]
413
 
414
  # MLP.
415
- mlp_output = self.mlp(layernorm_output)
416
-
417
- if self.config.parallel_attn:
418
- mlp_output += attention_output
419
 
420
- output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training)
 
 
421
 
422
  if use_cache:
423
  outputs = (output,) + outputs
 
363
  super().__init__()
364
  hidden_size = config.hidden_size
365
 
366
+ self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
367
+ self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
368
+
369
  self.num_heads = config.n_head
370
  self.self_attention = Attention(config)
371
 
 
 
 
 
372
  self.mlp = MLP(config)
373
 
374
  self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
 
387
  output_attentions: bool = False,
388
  ):
389
 
390
+ ln_attn = self.ln_attn(hidden_states)
391
+ ln_mlp = self.ln_mlp(hidden_states)
392
+
393
  residual = hidden_states
394
 
395
  # Self attention.
396
  attn_outputs = self.self_attention(
397
+ ln_attn,
398
  layer_past=layer_past,
399
  attention_mask=attention_mask,
400
  alibi=alibi,
 
405
 
406
  attention_output = attn_outputs[0]
407
 
 
 
 
 
408
  outputs = attn_outputs[1:]
409
 
410
  # MLP.
411
+ mlp_output = self.mlp(ln_mlp)
 
 
 
412
 
413
+ output = dropout_add(
414
+ mlp_output + attention_output, residual, self.config.hidden_dropout, training=self.training
415
+ )
416
 
417
  if use_cache:
418
  outputs = (output,) + outputs
pytorch_model-00001-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf8d79dce91486c166640f2dbde437985109617072c8747abc875ff7b35f4937
3
- size 9504536357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384bd92e6d1d0b6a133b1d81bad915a10d954d1bfcdf4fdc8b6d0d81b4f5e9e8
3
+ size 9504770141
pytorch_model-00002-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2bda814cd53c53b6e7c9c168dce4e48371714fcbbe1ee273b6a35173867942a
3
- size 9512925303
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:638cb048ee6ad66f85a176e980332b0d61cbddb2d5dc9b5ad1863d87e020350c
3
+ size 9513159151
pytorch_model-00003-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29ea68447ced396b92bae50150340fffcb55166ee72835b8e0fb7c0a6002d0a7
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2125ceee3992a093f9808efc9a720643a44e59b17baa77eddb0fbec6965ce5a
3
+ size 9513159151
pytorch_model-00004-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e99fbdaf8f48474b55c23cbaea749790bedf0d52e0d1d7a522f9b4bceed6a4c3
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fc011e39d4a9c5958630b1bb946dec9ca54adbccc52804fed545490f995b20b
3
+ size 9513159151
pytorch_model-00005-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9398b3a9b7b758f333073f15db1d3ab958c9c9d30556221c0e51a07afa167ca
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae277c7c5aef0d0fea9e9e6b13113dfef0078a36c78805bebe14d32e9ab887d1
3
+ size 9513159151
pytorch_model-00006-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f158d1250afe86b2e455cdba83a5ba183c7329a5b9f82c53d418878bf6e4b53
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6376a884bc80a6ea95da83f963508e57d8508333c424bed7beda565e4aaa0f3
3
+ size 9513159151
pytorch_model-00007-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72b644f83bf1b351942b6801cb1da27933cd16c9d4bea0b009d48ca8b864bce2
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3f07c8ec31edebe5ff80ed32d175475a044a0517d84c5eff6f6247ad0a4432d
3
+ size 9513159151
pytorch_model-00008-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4bc35d185c86958ac48722c41434bc9bd4b268e15a17e91b9f24d3cb1f4f2eb1
3
- size 9512925367
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af5021fe9afb9b4186a59a91e75aa60570c66fba43390b634387ee54bd73223
3
+ size 9513159151
pytorch_model-00009-of-00009.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:063b41bbd55c371cac7d7cd5a5a93bb1709f54a661c6e4703a2849fc836069b1
3
- size 7575086769
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5c794d3106945f514b5501904506485377c95ce2cef030373c7cbe7459ef610
3
+ size 7575220277
pytorch_model.bin.index.json CHANGED
@@ -1,365 +1,485 @@
1
  {
2
  "metadata": {
3
- "total_size": 83669975040
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "pytorch_model-00009-of-00009.bin",
7
- "transformer.h.0.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
8
- "transformer.h.0.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
9
  "transformer.h.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
10
  "transformer.h.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
11
  "transformer.h.0.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
12
  "transformer.h.0.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
13
- "transformer.h.1.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
14
- "transformer.h.1.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
15
  "transformer.h.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
16
  "transformer.h.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
17
  "transformer.h.1.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
18
  "transformer.h.1.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
19
- "transformer.h.10.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
20
- "transformer.h.10.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
21
  "transformer.h.10.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
22
  "transformer.h.10.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
23
  "transformer.h.10.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
24
  "transformer.h.10.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
25
- "transformer.h.11.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
26
- "transformer.h.11.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
27
  "transformer.h.11.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
28
  "transformer.h.11.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
29
  "transformer.h.11.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
30
  "transformer.h.11.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
31
- "transformer.h.12.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
32
- "transformer.h.12.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
33
  "transformer.h.12.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
34
  "transformer.h.12.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
35
  "transformer.h.12.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
36
  "transformer.h.12.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
37
- "transformer.h.13.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
38
- "transformer.h.13.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
39
  "transformer.h.13.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
40
  "transformer.h.13.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
41
  "transformer.h.13.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
42
  "transformer.h.13.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
43
- "transformer.h.14.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
44
- "transformer.h.14.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
45
  "transformer.h.14.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
46
  "transformer.h.14.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
47
  "transformer.h.14.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
48
  "transformer.h.14.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
49
- "transformer.h.15.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
50
- "transformer.h.15.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
51
  "transformer.h.15.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
52
  "transformer.h.15.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
53
  "transformer.h.15.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
54
  "transformer.h.15.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
55
- "transformer.h.16.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
56
- "transformer.h.16.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
57
  "transformer.h.16.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
58
  "transformer.h.16.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
59
  "transformer.h.16.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
60
  "transformer.h.16.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
61
- "transformer.h.17.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
62
- "transformer.h.17.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
63
  "transformer.h.17.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
64
  "transformer.h.17.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
65
  "transformer.h.17.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
66
  "transformer.h.17.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
67
- "transformer.h.18.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
68
- "transformer.h.18.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
69
  "transformer.h.18.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
70
  "transformer.h.18.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
71
  "transformer.h.18.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
72
  "transformer.h.18.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
73
- "transformer.h.19.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
74
- "transformer.h.19.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
75
  "transformer.h.19.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
76
  "transformer.h.19.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
77
  "transformer.h.19.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
78
  "transformer.h.19.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
79
- "transformer.h.2.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
80
- "transformer.h.2.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
81
  "transformer.h.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
82
  "transformer.h.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
83
  "transformer.h.2.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
84
  "transformer.h.2.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
85
- "transformer.h.20.input_layernorm.bias": "pytorch_model-00003-of-00009.bin",
86
- "transformer.h.20.input_layernorm.weight": "pytorch_model-00003-of-00009.bin",
 
 
87
  "transformer.h.20.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
88
  "transformer.h.20.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
89
  "transformer.h.20.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
90
  "transformer.h.20.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
91
- "transformer.h.21.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
92
- "transformer.h.21.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
93
  "transformer.h.21.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
94
  "transformer.h.21.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
95
  "transformer.h.21.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
96
  "transformer.h.21.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
97
- "transformer.h.22.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
98
- "transformer.h.22.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
99
  "transformer.h.22.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
100
  "transformer.h.22.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
101
  "transformer.h.22.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
102
  "transformer.h.22.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
103
- "transformer.h.23.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
104
- "transformer.h.23.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
105
  "transformer.h.23.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
106
  "transformer.h.23.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
107
  "transformer.h.23.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
108
  "transformer.h.23.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
109
- "transformer.h.24.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
110
- "transformer.h.24.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
111
  "transformer.h.24.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
112
  "transformer.h.24.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
113
  "transformer.h.24.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
114
  "transformer.h.24.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
115
- "transformer.h.25.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
116
- "transformer.h.25.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
117
  "transformer.h.25.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
118
  "transformer.h.25.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
119
  "transformer.h.25.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
120
  "transformer.h.25.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
121
- "transformer.h.26.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
122
- "transformer.h.26.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
123
  "transformer.h.26.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
124
  "transformer.h.26.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
125
  "transformer.h.26.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
126
  "transformer.h.26.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
127
- "transformer.h.27.input_layernorm.bias": "pytorch_model-00004-of-00009.bin",
128
- "transformer.h.27.input_layernorm.weight": "pytorch_model-00004-of-00009.bin",
 
 
129
  "transformer.h.27.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
130
  "transformer.h.27.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
131
  "transformer.h.27.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
132
  "transformer.h.27.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
133
- "transformer.h.28.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
134
- "transformer.h.28.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
135
  "transformer.h.28.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
136
  "transformer.h.28.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
137
  "transformer.h.28.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
138
  "transformer.h.28.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
139
- "transformer.h.29.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
140
- "transformer.h.29.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
141
  "transformer.h.29.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
142
  "transformer.h.29.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
143
  "transformer.h.29.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
144
  "transformer.h.29.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
145
- "transformer.h.3.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
146
- "transformer.h.3.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
147
  "transformer.h.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
148
  "transformer.h.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
149
  "transformer.h.3.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
150
  "transformer.h.3.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
151
- "transformer.h.30.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
152
- "transformer.h.30.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
153
  "transformer.h.30.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
154
  "transformer.h.30.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
155
  "transformer.h.30.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
156
  "transformer.h.30.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
157
- "transformer.h.31.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
158
- "transformer.h.31.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
159
  "transformer.h.31.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
160
  "transformer.h.31.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
161
  "transformer.h.31.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
162
  "transformer.h.31.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
163
- "transformer.h.32.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
164
- "transformer.h.32.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
165
  "transformer.h.32.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
166
  "transformer.h.32.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
167
  "transformer.h.32.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
168
  "transformer.h.32.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
169
- "transformer.h.33.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
170
- "transformer.h.33.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
171
  "transformer.h.33.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
172
  "transformer.h.33.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
173
  "transformer.h.33.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
174
  "transformer.h.33.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
175
- "transformer.h.34.input_layernorm.bias": "pytorch_model-00005-of-00009.bin",
176
- "transformer.h.34.input_layernorm.weight": "pytorch_model-00005-of-00009.bin",
 
 
177
  "transformer.h.34.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
178
  "transformer.h.34.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
179
  "transformer.h.34.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
180
  "transformer.h.34.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
181
- "transformer.h.35.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
182
- "transformer.h.35.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
183
  "transformer.h.35.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
184
  "transformer.h.35.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
185
  "transformer.h.35.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
186
  "transformer.h.35.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
187
- "transformer.h.36.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
188
- "transformer.h.36.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
189
  "transformer.h.36.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
190
  "transformer.h.36.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
191
  "transformer.h.36.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
192
  "transformer.h.36.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
193
- "transformer.h.37.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
194
- "transformer.h.37.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
195
  "transformer.h.37.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
196
  "transformer.h.37.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
197
  "transformer.h.37.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
198
  "transformer.h.37.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
199
- "transformer.h.38.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
200
- "transformer.h.38.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
201
  "transformer.h.38.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
202
  "transformer.h.38.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
203
  "transformer.h.38.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
204
  "transformer.h.38.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
205
- "transformer.h.39.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
206
- "transformer.h.39.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
207
  "transformer.h.39.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
208
  "transformer.h.39.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
209
  "transformer.h.39.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
210
  "transformer.h.39.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
211
- "transformer.h.4.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
212
- "transformer.h.4.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
213
  "transformer.h.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
214
  "transformer.h.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
215
  "transformer.h.4.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
216
  "transformer.h.4.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
217
- "transformer.h.40.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
218
- "transformer.h.40.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
219
  "transformer.h.40.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
220
  "transformer.h.40.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
221
  "transformer.h.40.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
222
  "transformer.h.40.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
223
- "transformer.h.41.input_layernorm.bias": "pytorch_model-00006-of-00009.bin",
224
- "transformer.h.41.input_layernorm.weight": "pytorch_model-00006-of-00009.bin",
 
 
225
  "transformer.h.41.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
226
  "transformer.h.41.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
227
  "transformer.h.41.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
228
  "transformer.h.41.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
229
- "transformer.h.42.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
230
- "transformer.h.42.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
231
  "transformer.h.42.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
232
  "transformer.h.42.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
233
  "transformer.h.42.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
234
  "transformer.h.42.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
235
- "transformer.h.43.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
236
- "transformer.h.43.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
237
  "transformer.h.43.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
238
  "transformer.h.43.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
239
  "transformer.h.43.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
240
  "transformer.h.43.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
241
- "transformer.h.44.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
242
- "transformer.h.44.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
243
  "transformer.h.44.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
244
  "transformer.h.44.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
245
  "transformer.h.44.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
246
  "transformer.h.44.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
247
- "transformer.h.45.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
248
- "transformer.h.45.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
249
  "transformer.h.45.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
250
  "transformer.h.45.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
251
  "transformer.h.45.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
252
  "transformer.h.45.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
253
- "transformer.h.46.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
254
- "transformer.h.46.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
255
  "transformer.h.46.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
256
  "transformer.h.46.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
257
  "transformer.h.46.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
258
  "transformer.h.46.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
259
- "transformer.h.47.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
260
- "transformer.h.47.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
261
  "transformer.h.47.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
262
  "transformer.h.47.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
263
  "transformer.h.47.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
264
  "transformer.h.47.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
265
- "transformer.h.48.input_layernorm.bias": "pytorch_model-00007-of-00009.bin",
266
- "transformer.h.48.input_layernorm.weight": "pytorch_model-00007-of-00009.bin",
 
 
267
  "transformer.h.48.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
268
  "transformer.h.48.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
269
  "transformer.h.48.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
270
  "transformer.h.48.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
271
- "transformer.h.49.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
272
- "transformer.h.49.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
273
  "transformer.h.49.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
274
  "transformer.h.49.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
275
  "transformer.h.49.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
276
  "transformer.h.49.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
277
- "transformer.h.5.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
278
- "transformer.h.5.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
279
  "transformer.h.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
280
  "transformer.h.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
281
  "transformer.h.5.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
282
  "transformer.h.5.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
283
- "transformer.h.50.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
284
- "transformer.h.50.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
285
  "transformer.h.50.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
286
  "transformer.h.50.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
287
  "transformer.h.50.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
288
  "transformer.h.50.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
289
- "transformer.h.51.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
290
- "transformer.h.51.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
291
  "transformer.h.51.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
292
  "transformer.h.51.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
293
  "transformer.h.51.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
294
  "transformer.h.51.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
295
- "transformer.h.52.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
296
- "transformer.h.52.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
297
  "transformer.h.52.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
298
  "transformer.h.52.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
299
  "transformer.h.52.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
300
  "transformer.h.52.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
301
- "transformer.h.53.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
302
- "transformer.h.53.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
303
  "transformer.h.53.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
304
  "transformer.h.53.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
305
  "transformer.h.53.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
306
  "transformer.h.53.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
307
- "transformer.h.54.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
308
- "transformer.h.54.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
309
  "transformer.h.54.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
310
  "transformer.h.54.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
311
  "transformer.h.54.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
312
  "transformer.h.54.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
313
- "transformer.h.55.input_layernorm.bias": "pytorch_model-00008-of-00009.bin",
314
- "transformer.h.55.input_layernorm.weight": "pytorch_model-00008-of-00009.bin",
 
 
315
  "transformer.h.55.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
316
  "transformer.h.55.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
317
  "transformer.h.55.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
318
  "transformer.h.55.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
319
- "transformer.h.56.input_layernorm.bias": "pytorch_model-00009-of-00009.bin",
320
- "transformer.h.56.input_layernorm.weight": "pytorch_model-00009-of-00009.bin",
 
 
321
  "transformer.h.56.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
322
  "transformer.h.56.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
323
  "transformer.h.56.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
324
  "transformer.h.56.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
325
- "transformer.h.57.input_layernorm.bias": "pytorch_model-00009-of-00009.bin",
326
- "transformer.h.57.input_layernorm.weight": "pytorch_model-00009-of-00009.bin",
 
 
327
  "transformer.h.57.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
328
  "transformer.h.57.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
329
  "transformer.h.57.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
330
  "transformer.h.57.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
331
- "transformer.h.58.input_layernorm.bias": "pytorch_model-00009-of-00009.bin",
332
- "transformer.h.58.input_layernorm.weight": "pytorch_model-00009-of-00009.bin",
 
 
333
  "transformer.h.58.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
334
  "transformer.h.58.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
335
  "transformer.h.58.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
336
  "transformer.h.58.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
337
- "transformer.h.59.input_layernorm.bias": "pytorch_model-00009-of-00009.bin",
338
- "transformer.h.59.input_layernorm.weight": "pytorch_model-00009-of-00009.bin",
 
 
339
  "transformer.h.59.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
340
  "transformer.h.59.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
341
  "transformer.h.59.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
342
  "transformer.h.59.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
343
- "transformer.h.6.input_layernorm.bias": "pytorch_model-00001-of-00009.bin",
344
- "transformer.h.6.input_layernorm.weight": "pytorch_model-00001-of-00009.bin",
 
 
345
  "transformer.h.6.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
346
  "transformer.h.6.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
347
  "transformer.h.6.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
348
  "transformer.h.6.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
349
- "transformer.h.7.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
350
- "transformer.h.7.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
351
  "transformer.h.7.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
352
  "transformer.h.7.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
353
  "transformer.h.7.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
354
  "transformer.h.7.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
355
- "transformer.h.8.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
356
- "transformer.h.8.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
357
  "transformer.h.8.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
358
  "transformer.h.8.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
359
  "transformer.h.8.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
360
  "transformer.h.8.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
361
- "transformer.h.9.input_layernorm.bias": "pytorch_model-00002-of-00009.bin",
362
- "transformer.h.9.input_layernorm.weight": "pytorch_model-00002-of-00009.bin",
 
 
363
  "transformer.h.9.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
364
  "transformer.h.9.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
365
  "transformer.h.9.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
 
1
  {
2
  "metadata": {
3
+ "total_size": 83671941120
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "pytorch_model-00009-of-00009.bin",
7
+ "transformer.h.0.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
8
+ "transformer.h.0.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
9
+ "transformer.h.0.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
10
+ "transformer.h.0.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
11
  "transformer.h.0.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
12
  "transformer.h.0.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
13
  "transformer.h.0.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
14
  "transformer.h.0.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
15
+ "transformer.h.1.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
16
+ "transformer.h.1.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
17
+ "transformer.h.1.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
18
+ "transformer.h.1.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
19
  "transformer.h.1.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
20
  "transformer.h.1.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
21
  "transformer.h.1.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
22
  "transformer.h.1.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
23
+ "transformer.h.10.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
24
+ "transformer.h.10.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
25
+ "transformer.h.10.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
26
+ "transformer.h.10.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
27
  "transformer.h.10.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
28
  "transformer.h.10.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
29
  "transformer.h.10.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
30
  "transformer.h.10.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
31
+ "transformer.h.11.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
32
+ "transformer.h.11.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
33
+ "transformer.h.11.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
34
+ "transformer.h.11.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
35
  "transformer.h.11.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
36
  "transformer.h.11.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
37
  "transformer.h.11.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
38
  "transformer.h.11.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
39
+ "transformer.h.12.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
40
+ "transformer.h.12.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
41
+ "transformer.h.12.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
42
+ "transformer.h.12.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
43
  "transformer.h.12.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
44
  "transformer.h.12.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
45
  "transformer.h.12.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
46
  "transformer.h.12.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
47
+ "transformer.h.13.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
48
+ "transformer.h.13.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
49
+ "transformer.h.13.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
50
+ "transformer.h.13.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
51
  "transformer.h.13.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
52
  "transformer.h.13.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
53
  "transformer.h.13.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
54
  "transformer.h.13.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
55
+ "transformer.h.14.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
56
+ "transformer.h.14.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
57
+ "transformer.h.14.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
58
+ "transformer.h.14.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
59
  "transformer.h.14.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
60
  "transformer.h.14.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
61
  "transformer.h.14.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
62
  "transformer.h.14.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
63
+ "transformer.h.15.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
64
+ "transformer.h.15.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
65
+ "transformer.h.15.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
66
+ "transformer.h.15.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
67
  "transformer.h.15.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
68
  "transformer.h.15.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
69
  "transformer.h.15.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
70
  "transformer.h.15.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
71
+ "transformer.h.16.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
72
+ "transformer.h.16.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
73
+ "transformer.h.16.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
74
+ "transformer.h.16.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
75
  "transformer.h.16.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
76
  "transformer.h.16.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
77
  "transformer.h.16.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
78
  "transformer.h.16.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
79
+ "transformer.h.17.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
80
+ "transformer.h.17.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
81
+ "transformer.h.17.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
82
+ "transformer.h.17.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
83
  "transformer.h.17.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
84
  "transformer.h.17.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
85
  "transformer.h.17.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
86
  "transformer.h.17.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
87
+ "transformer.h.18.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
88
+ "transformer.h.18.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
89
+ "transformer.h.18.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
90
+ "transformer.h.18.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
91
  "transformer.h.18.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
92
  "transformer.h.18.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
93
  "transformer.h.18.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
94
  "transformer.h.18.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
95
+ "transformer.h.19.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
96
+ "transformer.h.19.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
97
+ "transformer.h.19.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
98
+ "transformer.h.19.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
99
  "transformer.h.19.mlp.dense_4h_to_h.weight": "pytorch_model-00003-of-00009.bin",
100
  "transformer.h.19.mlp.dense_h_to_4h.weight": "pytorch_model-00003-of-00009.bin",
101
  "transformer.h.19.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
102
  "transformer.h.19.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
103
+ "transformer.h.2.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
104
+ "transformer.h.2.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
105
+ "transformer.h.2.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
106
+ "transformer.h.2.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
107
  "transformer.h.2.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
108
  "transformer.h.2.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
109
  "transformer.h.2.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
110
  "transformer.h.2.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
111
+ "transformer.h.20.ln_attn.bias": "pytorch_model-00003-of-00009.bin",
112
+ "transformer.h.20.ln_attn.weight": "pytorch_model-00003-of-00009.bin",
113
+ "transformer.h.20.ln_mlp.bias": "pytorch_model-00003-of-00009.bin",
114
+ "transformer.h.20.ln_mlp.weight": "pytorch_model-00003-of-00009.bin",
115
  "transformer.h.20.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
116
  "transformer.h.20.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
117
  "transformer.h.20.self_attention.dense.weight": "pytorch_model-00003-of-00009.bin",
118
  "transformer.h.20.self_attention.query_key_value.weight": "pytorch_model-00003-of-00009.bin",
119
+ "transformer.h.21.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
120
+ "transformer.h.21.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
121
+ "transformer.h.21.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
122
+ "transformer.h.21.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
123
  "transformer.h.21.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
124
  "transformer.h.21.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
125
  "transformer.h.21.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
126
  "transformer.h.21.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
127
+ "transformer.h.22.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
128
+ "transformer.h.22.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
129
+ "transformer.h.22.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
130
+ "transformer.h.22.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
131
  "transformer.h.22.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
132
  "transformer.h.22.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
133
  "transformer.h.22.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
134
  "transformer.h.22.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
135
+ "transformer.h.23.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
136
+ "transformer.h.23.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
137
+ "transformer.h.23.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
138
+ "transformer.h.23.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
139
  "transformer.h.23.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
140
  "transformer.h.23.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
141
  "transformer.h.23.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
142
  "transformer.h.23.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
143
+ "transformer.h.24.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
144
+ "transformer.h.24.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
145
+ "transformer.h.24.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
146
+ "transformer.h.24.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
147
  "transformer.h.24.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
148
  "transformer.h.24.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
149
  "transformer.h.24.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
150
  "transformer.h.24.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
151
+ "transformer.h.25.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
152
+ "transformer.h.25.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
153
+ "transformer.h.25.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
154
+ "transformer.h.25.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
155
  "transformer.h.25.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
156
  "transformer.h.25.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
157
  "transformer.h.25.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
158
  "transformer.h.25.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
159
+ "transformer.h.26.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
160
+ "transformer.h.26.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
161
+ "transformer.h.26.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
162
+ "transformer.h.26.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
163
  "transformer.h.26.mlp.dense_4h_to_h.weight": "pytorch_model-00004-of-00009.bin",
164
  "transformer.h.26.mlp.dense_h_to_4h.weight": "pytorch_model-00004-of-00009.bin",
165
  "transformer.h.26.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
166
  "transformer.h.26.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
167
+ "transformer.h.27.ln_attn.bias": "pytorch_model-00004-of-00009.bin",
168
+ "transformer.h.27.ln_attn.weight": "pytorch_model-00004-of-00009.bin",
169
+ "transformer.h.27.ln_mlp.bias": "pytorch_model-00004-of-00009.bin",
170
+ "transformer.h.27.ln_mlp.weight": "pytorch_model-00004-of-00009.bin",
171
  "transformer.h.27.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
172
  "transformer.h.27.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
173
  "transformer.h.27.self_attention.dense.weight": "pytorch_model-00004-of-00009.bin",
174
  "transformer.h.27.self_attention.query_key_value.weight": "pytorch_model-00004-of-00009.bin",
175
+ "transformer.h.28.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
176
+ "transformer.h.28.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
177
+ "transformer.h.28.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
178
+ "transformer.h.28.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
179
  "transformer.h.28.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
180
  "transformer.h.28.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
181
  "transformer.h.28.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
182
  "transformer.h.28.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
183
+ "transformer.h.29.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
184
+ "transformer.h.29.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
185
+ "transformer.h.29.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
186
+ "transformer.h.29.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
187
  "transformer.h.29.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
188
  "transformer.h.29.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
189
  "transformer.h.29.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
190
  "transformer.h.29.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
191
+ "transformer.h.3.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
192
+ "transformer.h.3.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
193
+ "transformer.h.3.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
194
+ "transformer.h.3.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
195
  "transformer.h.3.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
196
  "transformer.h.3.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
197
  "transformer.h.3.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
198
  "transformer.h.3.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
199
+ "transformer.h.30.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
200
+ "transformer.h.30.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
201
+ "transformer.h.30.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
202
+ "transformer.h.30.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
203
  "transformer.h.30.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
204
  "transformer.h.30.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
205
  "transformer.h.30.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
206
  "transformer.h.30.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
207
+ "transformer.h.31.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
208
+ "transformer.h.31.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
209
+ "transformer.h.31.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
210
+ "transformer.h.31.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
211
  "transformer.h.31.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
212
  "transformer.h.31.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
213
  "transformer.h.31.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
214
  "transformer.h.31.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
215
+ "transformer.h.32.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
216
+ "transformer.h.32.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
217
+ "transformer.h.32.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
218
+ "transformer.h.32.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
219
  "transformer.h.32.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
220
  "transformer.h.32.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
221
  "transformer.h.32.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
222
  "transformer.h.32.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
223
+ "transformer.h.33.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
224
+ "transformer.h.33.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
225
+ "transformer.h.33.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
226
+ "transformer.h.33.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
227
  "transformer.h.33.mlp.dense_4h_to_h.weight": "pytorch_model-00005-of-00009.bin",
228
  "transformer.h.33.mlp.dense_h_to_4h.weight": "pytorch_model-00005-of-00009.bin",
229
  "transformer.h.33.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
230
  "transformer.h.33.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
231
+ "transformer.h.34.ln_attn.bias": "pytorch_model-00005-of-00009.bin",
232
+ "transformer.h.34.ln_attn.weight": "pytorch_model-00005-of-00009.bin",
233
+ "transformer.h.34.ln_mlp.bias": "pytorch_model-00005-of-00009.bin",
234
+ "transformer.h.34.ln_mlp.weight": "pytorch_model-00005-of-00009.bin",
235
  "transformer.h.34.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
236
  "transformer.h.34.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
237
  "transformer.h.34.self_attention.dense.weight": "pytorch_model-00005-of-00009.bin",
238
  "transformer.h.34.self_attention.query_key_value.weight": "pytorch_model-00005-of-00009.bin",
239
+ "transformer.h.35.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
240
+ "transformer.h.35.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
241
+ "transformer.h.35.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
242
+ "transformer.h.35.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
243
  "transformer.h.35.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
244
  "transformer.h.35.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
245
  "transformer.h.35.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
246
  "transformer.h.35.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
247
+ "transformer.h.36.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
248
+ "transformer.h.36.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
249
+ "transformer.h.36.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
250
+ "transformer.h.36.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
251
  "transformer.h.36.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
252
  "transformer.h.36.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
253
  "transformer.h.36.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
254
  "transformer.h.36.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
255
+ "transformer.h.37.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
256
+ "transformer.h.37.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
257
+ "transformer.h.37.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
258
+ "transformer.h.37.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
259
  "transformer.h.37.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
260
  "transformer.h.37.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
261
  "transformer.h.37.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
262
  "transformer.h.37.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
263
+ "transformer.h.38.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
264
+ "transformer.h.38.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
265
+ "transformer.h.38.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
266
+ "transformer.h.38.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
267
  "transformer.h.38.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
268
  "transformer.h.38.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
269
  "transformer.h.38.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
270
  "transformer.h.38.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
271
+ "transformer.h.39.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
272
+ "transformer.h.39.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
273
+ "transformer.h.39.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
274
+ "transformer.h.39.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
275
  "transformer.h.39.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
276
  "transformer.h.39.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
277
  "transformer.h.39.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
278
  "transformer.h.39.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
279
+ "transformer.h.4.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
280
+ "transformer.h.4.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
281
+ "transformer.h.4.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
282
+ "transformer.h.4.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
283
  "transformer.h.4.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
284
  "transformer.h.4.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
285
  "transformer.h.4.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
286
  "transformer.h.4.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
287
+ "transformer.h.40.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
288
+ "transformer.h.40.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
289
+ "transformer.h.40.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
290
+ "transformer.h.40.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
291
  "transformer.h.40.mlp.dense_4h_to_h.weight": "pytorch_model-00006-of-00009.bin",
292
  "transformer.h.40.mlp.dense_h_to_4h.weight": "pytorch_model-00006-of-00009.bin",
293
  "transformer.h.40.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
294
  "transformer.h.40.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
295
+ "transformer.h.41.ln_attn.bias": "pytorch_model-00006-of-00009.bin",
296
+ "transformer.h.41.ln_attn.weight": "pytorch_model-00006-of-00009.bin",
297
+ "transformer.h.41.ln_mlp.bias": "pytorch_model-00006-of-00009.bin",
298
+ "transformer.h.41.ln_mlp.weight": "pytorch_model-00006-of-00009.bin",
299
  "transformer.h.41.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
300
  "transformer.h.41.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
301
  "transformer.h.41.self_attention.dense.weight": "pytorch_model-00006-of-00009.bin",
302
  "transformer.h.41.self_attention.query_key_value.weight": "pytorch_model-00006-of-00009.bin",
303
+ "transformer.h.42.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
304
+ "transformer.h.42.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
305
+ "transformer.h.42.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
306
+ "transformer.h.42.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
307
  "transformer.h.42.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
308
  "transformer.h.42.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
309
  "transformer.h.42.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
310
  "transformer.h.42.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
311
+ "transformer.h.43.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
312
+ "transformer.h.43.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
313
+ "transformer.h.43.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
314
+ "transformer.h.43.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
315
  "transformer.h.43.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
316
  "transformer.h.43.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
317
  "transformer.h.43.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
318
  "transformer.h.43.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
319
+ "transformer.h.44.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
320
+ "transformer.h.44.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
321
+ "transformer.h.44.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
322
+ "transformer.h.44.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
323
  "transformer.h.44.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
324
  "transformer.h.44.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
325
  "transformer.h.44.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
326
  "transformer.h.44.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
327
+ "transformer.h.45.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
328
+ "transformer.h.45.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
329
+ "transformer.h.45.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
330
+ "transformer.h.45.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
331
  "transformer.h.45.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
332
  "transformer.h.45.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
333
  "transformer.h.45.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
334
  "transformer.h.45.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
335
+ "transformer.h.46.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
336
+ "transformer.h.46.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
337
+ "transformer.h.46.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
338
+ "transformer.h.46.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
339
  "transformer.h.46.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
340
  "transformer.h.46.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
341
  "transformer.h.46.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
342
  "transformer.h.46.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
343
+ "transformer.h.47.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
344
+ "transformer.h.47.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
345
+ "transformer.h.47.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
346
+ "transformer.h.47.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
347
  "transformer.h.47.mlp.dense_4h_to_h.weight": "pytorch_model-00007-of-00009.bin",
348
  "transformer.h.47.mlp.dense_h_to_4h.weight": "pytorch_model-00007-of-00009.bin",
349
  "transformer.h.47.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
350
  "transformer.h.47.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
351
+ "transformer.h.48.ln_attn.bias": "pytorch_model-00007-of-00009.bin",
352
+ "transformer.h.48.ln_attn.weight": "pytorch_model-00007-of-00009.bin",
353
+ "transformer.h.48.ln_mlp.bias": "pytorch_model-00007-of-00009.bin",
354
+ "transformer.h.48.ln_mlp.weight": "pytorch_model-00007-of-00009.bin",
355
  "transformer.h.48.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
356
  "transformer.h.48.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
357
  "transformer.h.48.self_attention.dense.weight": "pytorch_model-00007-of-00009.bin",
358
  "transformer.h.48.self_attention.query_key_value.weight": "pytorch_model-00007-of-00009.bin",
359
+ "transformer.h.49.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
360
+ "transformer.h.49.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
361
+ "transformer.h.49.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
362
+ "transformer.h.49.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
363
  "transformer.h.49.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
364
  "transformer.h.49.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
365
  "transformer.h.49.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
366
  "transformer.h.49.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
367
+ "transformer.h.5.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
368
+ "transformer.h.5.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
369
+ "transformer.h.5.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
370
+ "transformer.h.5.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
371
  "transformer.h.5.mlp.dense_4h_to_h.weight": "pytorch_model-00001-of-00009.bin",
372
  "transformer.h.5.mlp.dense_h_to_4h.weight": "pytorch_model-00001-of-00009.bin",
373
  "transformer.h.5.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
374
  "transformer.h.5.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
375
+ "transformer.h.50.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
376
+ "transformer.h.50.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
377
+ "transformer.h.50.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
378
+ "transformer.h.50.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
379
  "transformer.h.50.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
380
  "transformer.h.50.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
381
  "transformer.h.50.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
382
  "transformer.h.50.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
383
+ "transformer.h.51.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
384
+ "transformer.h.51.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
385
+ "transformer.h.51.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
386
+ "transformer.h.51.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
387
  "transformer.h.51.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
388
  "transformer.h.51.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
389
  "transformer.h.51.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
390
  "transformer.h.51.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
391
+ "transformer.h.52.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
392
+ "transformer.h.52.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
393
+ "transformer.h.52.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
394
+ "transformer.h.52.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
395
  "transformer.h.52.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
396
  "transformer.h.52.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
397
  "transformer.h.52.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
398
  "transformer.h.52.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
399
+ "transformer.h.53.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
400
+ "transformer.h.53.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
401
+ "transformer.h.53.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
402
+ "transformer.h.53.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
403
  "transformer.h.53.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
404
  "transformer.h.53.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
405
  "transformer.h.53.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
406
  "transformer.h.53.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
407
+ "transformer.h.54.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
408
+ "transformer.h.54.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
409
+ "transformer.h.54.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
410
+ "transformer.h.54.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
411
  "transformer.h.54.mlp.dense_4h_to_h.weight": "pytorch_model-00008-of-00009.bin",
412
  "transformer.h.54.mlp.dense_h_to_4h.weight": "pytorch_model-00008-of-00009.bin",
413
  "transformer.h.54.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
414
  "transformer.h.54.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
415
+ "transformer.h.55.ln_attn.bias": "pytorch_model-00008-of-00009.bin",
416
+ "transformer.h.55.ln_attn.weight": "pytorch_model-00008-of-00009.bin",
417
+ "transformer.h.55.ln_mlp.bias": "pytorch_model-00008-of-00009.bin",
418
+ "transformer.h.55.ln_mlp.weight": "pytorch_model-00008-of-00009.bin",
419
  "transformer.h.55.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
420
  "transformer.h.55.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
421
  "transformer.h.55.self_attention.dense.weight": "pytorch_model-00008-of-00009.bin",
422
  "transformer.h.55.self_attention.query_key_value.weight": "pytorch_model-00008-of-00009.bin",
423
+ "transformer.h.56.ln_attn.bias": "pytorch_model-00009-of-00009.bin",
424
+ "transformer.h.56.ln_attn.weight": "pytorch_model-00009-of-00009.bin",
425
+ "transformer.h.56.ln_mlp.bias": "pytorch_model-00009-of-00009.bin",
426
+ "transformer.h.56.ln_mlp.weight": "pytorch_model-00009-of-00009.bin",
427
  "transformer.h.56.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
428
  "transformer.h.56.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
429
  "transformer.h.56.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
430
  "transformer.h.56.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
431
+ "transformer.h.57.ln_attn.bias": "pytorch_model-00009-of-00009.bin",
432
+ "transformer.h.57.ln_attn.weight": "pytorch_model-00009-of-00009.bin",
433
+ "transformer.h.57.ln_mlp.bias": "pytorch_model-00009-of-00009.bin",
434
+ "transformer.h.57.ln_mlp.weight": "pytorch_model-00009-of-00009.bin",
435
  "transformer.h.57.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
436
  "transformer.h.57.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
437
  "transformer.h.57.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
438
  "transformer.h.57.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
439
+ "transformer.h.58.ln_attn.bias": "pytorch_model-00009-of-00009.bin",
440
+ "transformer.h.58.ln_attn.weight": "pytorch_model-00009-of-00009.bin",
441
+ "transformer.h.58.ln_mlp.bias": "pytorch_model-00009-of-00009.bin",
442
+ "transformer.h.58.ln_mlp.weight": "pytorch_model-00009-of-00009.bin",
443
  "transformer.h.58.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
444
  "transformer.h.58.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
445
  "transformer.h.58.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
446
  "transformer.h.58.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
447
+ "transformer.h.59.ln_attn.bias": "pytorch_model-00009-of-00009.bin",
448
+ "transformer.h.59.ln_attn.weight": "pytorch_model-00009-of-00009.bin",
449
+ "transformer.h.59.ln_mlp.bias": "pytorch_model-00009-of-00009.bin",
450
+ "transformer.h.59.ln_mlp.weight": "pytorch_model-00009-of-00009.bin",
451
  "transformer.h.59.mlp.dense_4h_to_h.weight": "pytorch_model-00009-of-00009.bin",
452
  "transformer.h.59.mlp.dense_h_to_4h.weight": "pytorch_model-00009-of-00009.bin",
453
  "transformer.h.59.self_attention.dense.weight": "pytorch_model-00009-of-00009.bin",
454
  "transformer.h.59.self_attention.query_key_value.weight": "pytorch_model-00009-of-00009.bin",
455
+ "transformer.h.6.ln_attn.bias": "pytorch_model-00001-of-00009.bin",
456
+ "transformer.h.6.ln_attn.weight": "pytorch_model-00001-of-00009.bin",
457
+ "transformer.h.6.ln_mlp.bias": "pytorch_model-00001-of-00009.bin",
458
+ "transformer.h.6.ln_mlp.weight": "pytorch_model-00001-of-00009.bin",
459
  "transformer.h.6.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
460
  "transformer.h.6.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
461
  "transformer.h.6.self_attention.dense.weight": "pytorch_model-00001-of-00009.bin",
462
  "transformer.h.6.self_attention.query_key_value.weight": "pytorch_model-00001-of-00009.bin",
463
+ "transformer.h.7.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
464
+ "transformer.h.7.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
465
+ "transformer.h.7.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
466
+ "transformer.h.7.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
467
  "transformer.h.7.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
468
  "transformer.h.7.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
469
  "transformer.h.7.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
470
  "transformer.h.7.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
471
+ "transformer.h.8.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
472
+ "transformer.h.8.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
473
+ "transformer.h.8.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
474
+ "transformer.h.8.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
475
  "transformer.h.8.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
476
  "transformer.h.8.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
477
  "transformer.h.8.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",
478
  "transformer.h.8.self_attention.query_key_value.weight": "pytorch_model-00002-of-00009.bin",
479
+ "transformer.h.9.ln_attn.bias": "pytorch_model-00002-of-00009.bin",
480
+ "transformer.h.9.ln_attn.weight": "pytorch_model-00002-of-00009.bin",
481
+ "transformer.h.9.ln_mlp.bias": "pytorch_model-00002-of-00009.bin",
482
+ "transformer.h.9.ln_mlp.weight": "pytorch_model-00002-of-00009.bin",
483
  "transformer.h.9.mlp.dense_4h_to_h.weight": "pytorch_model-00002-of-00009.bin",
484
  "transformer.h.9.mlp.dense_h_to_4h.weight": "pytorch_model-00002-of-00009.bin",
485
  "transformer.h.9.self_attention.dense.weight": "pytorch_model-00002-of-00009.bin",