Upload 2 files
Browse files- configuration_mamba.py +43 -0
- modeling_mamba.py +308 -0
configuration_mamba.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from typing import Optional , Union
|
3 |
+
|
4 |
+
from transformers import PretrainedConfig
|
5 |
+
class MambaConfig(PretrainedConfig):
    """Hyper-parameters of a Mamba language model.

    In addition to storing the constructor arguments as attributes, the config
    derives three values:

    * ``d_inner`` is ``int(expand * d_model)``.
    * ``dt_rank="auto"`` is resolved to ``ceil(d_model / 16)``.
    * ``vocab_size`` is rounded up to the next multiple of
      ``pad_vocab_size_multiple``.

    Any extra keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "mamba"

    def __init__(
        self,
        vocab_size=50277,
        d_state=16,
        d_model=2560,
        d_conv=4,
        expand=2,
        conv_bias=True,
        bias=False,
        n_layer=64,
        dt_rank: Union[int, str] = "auto",
        pad_vocab_size_multiple=8,
        initializer_range=0.02,
        **kwargs,
    ):
        # Round the vocabulary size up so it divides evenly by the multiple.
        overhang = vocab_size % pad_vocab_size_multiple
        if overhang != 0:
            vocab_size += pad_vocab_size_multiple - overhang

        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.conv_bias = conv_bias
        self.expand = expand
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.d_conv = d_conv
        self.d_model = d_model
        self.d_state = d_state
        # Width of the expanded inner stream used inside each block.
        self.d_inner = int(expand * d_model)
        # "auto" ties the rank of the delta projection to the model width.
        self.dt_rank = math.ceil(d_model / 16) if dt_rank == "auto" else dt_rank
        self.initializer_range = initializer_range
        self.bias = bias

        super().__init__(**kwargs)
|
modeling_mamba.py
ADDED
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
from configuration_mamba import MambaConfig
|
4 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
5 |
+
from transformers.modeling_utils import PreTrainedModel
|
6 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
|
7 |
+
import math
|
8 |
+
import json
|
9 |
+
import torch
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from dataclasses import dataclass
|
13 |
+
from einops import rearrange, repeat, einsum
|
14 |
+
from typing import Optional , Union ,Tuple
|
15 |
+
|
16 |
+
# Thanks to the contributors of https://github.com/johnma2006/mamba-minimal/tree/master, and special thanks to Albert Gu and Tri Dao for the Mamba paper (https://arxiv.org/abs/2312.00752).
|
17 |
+
|
18 |
+
|
19 |
+
class MambaRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale.

    Args:
        d_model: size of the last dimension; one scale weight is learned per entry.
        eps: constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x):
        """Return ``x`` divided by its RMS along the last axis, times ``weight``."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms * self.weight
|
29 |
+
|
30 |
+
|
31 |
+
class MambaBlock(nn.Module):
    """One Mamba layer: RMSNorm, gated selective state-space mixer, residual add.

    Corresponds to the block in Figure 3, Section 3.4 of the Mamba paper
    (https://arxiv.org/abs/2312.00752).

    Dimension names used below: ``b`` batch, ``l`` sequence length,
    ``d`` = ``config.d_model``, ``d_in`` = ``config.d_inner``,
    ``n`` = ``config.d_state``.
    """

    def __init__(self, config: MambaConfig):
        """Create the projections, depthwise convolution and SSM parameters."""
        super().__init__()
        self.config = config
        d_inner = config.d_inner

        # One projection yields both the SSM branch and the gating branch.
        self.in_proj = nn.Linear(config.d_model, 2 * d_inner, bias=config.bias)

        # Depthwise convolution along the sequence (groups == channels).
        # forward() keeps only the first l outputs of the padded result.
        self.conv1d = nn.Conv1d(
            in_channels=d_inner,
            out_channels=d_inner,
            kernel_size=config.d_conv,
            groups=d_inner,
            padding=config.d_conv - 1,
            bias=config.conv_bias,
        )

        # x_proj maps the conv output to the input-dependent delta, B and C.
        self.x_proj = nn.Linear(d_inner, config.dt_rank + 2 * config.d_state, bias=False)

        # dt_proj lifts delta from dt_rank up to d_in.
        self.dt_proj = nn.Linear(config.dt_rank, d_inner, bias=True)

        # A is stored as log(A) with rows 1..n repeated for each inner channel.
        state_indices = torch.arange(1, config.d_state + 1)
        self.A_log = nn.Parameter(torch.log(repeat(state_indices, 'n -> d n', d=d_inner)))
        self.D = nn.Parameter(torch.ones(d_inner))

        self.out_proj = nn.Linear(d_inner, config.d_model, bias=config.bias)
        self.norm = MambaRMSNorm(config.d_model)

    def forward(self, x):
        """Apply the block to ``x`` and add the result to the un-normalised input.

        Args:
            x: tensor of shape (b, l, d).

        Returns:
            Tensor of shape (b, l, d).

        Official implementation:
            class Mamba, https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py#L119
            mamba_inner_ref(), https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py#L311
        """
        _, seq_len, _ = x.shape
        # The residual connection lives here rather than in a wrapper class.
        skip = x

        d_inner = self.config.d_inner
        projected = self.in_proj(self.norm(x))  # (b, l, 2 * d_in)
        x, gate = projected.split(split_size=[d_inner, d_inner], dim=-1)

        # Conv1d expects channels before length; trim the padded tail afterwards.
        x = rearrange(x, 'b l d_in -> b d_in l')
        x = self.conv1d(x)[:, :, :seq_len]
        x = rearrange(x, 'b d_in l -> b l d_in')

        mixed = self.ssm(F.silu(x))
        gated = mixed * F.silu(gate)

        return self.out_proj(gated) + skip

    def ssm(self, x):
        """Compute the state-space parameters from ``x`` and run the scan.

        See Algorithm 2, Section 3.2 of the Mamba paper. ``A`` and ``D`` do not
        depend on the input, whereas delta, ``B`` and ``C`` are produced from
        ``x`` by ``x_proj`` -- this input dependence is what makes the model
        "selective".

        Args:
            x: tensor of shape (b, l, d_in).

        Returns:
            Tensor of shape (b, l, d_in).

        Official implementation:
            mamba_inner_ref(), https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py#L311
        """
        _, n = self.A_log.shape

        A = -torch.exp(self.A_log.float())  # (d_in, n)
        D = self.D.float()

        # delta: (b, l, dt_rank); B and C: (b, l, n)
        delta, B, C = self.x_proj(x).split(split_size=[self.config.dt_rank, n, n], dim=-1)
        delta = F.softplus(self.dt_proj(delta))  # (b, l, d_in)

        return self.selective_scan(x, delta, A, B, C, D)

    def selective_scan(self, u, delta, A, B, C, D):
        """Run the discrete state-space recurrence over the sequence.

        This is the classic formulation
            x(t + 1) = A x(t) + B u(t)
            y(t)     = C x(t) + D u(t)
        except that B, C and the step size delta depend on the input.

        Args:
            u: shape (b, l, d_in)
            delta: shape (b, l, d_in)
            A: shape (d_in, n)
            B: shape (b, l, n)
            C: shape (b, l, n)
            D: shape (d_in,)

        Returns:
            Tensor of shape (b, l, d_in).

        Official implementation:
            selective_scan_ref(), https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py#L86
            Some parts of ``selective_scan_ref`` were refactored out, so the
            functionality does not match it exactly.
        """
        batch, seq_len, d_in = u.shape
        n = A.shape[1]

        # Discretise the continuous parameters: A uses zero-order hold
        # (Section 2, Equation 4 of the paper); B uses a simplified Euler step,
        # as noted in the upstream implementation this code is based on.
        deltaA = torch.exp(einsum(delta, A, 'b l d_in, d_in n -> b d_in l n'))
        deltaB_u = einsum(delta, B, u, 'b l d_in, b l n, b l d_in -> b d_in l n')

        # Sequential scan: update the hidden state and read it out at each step.
        state = torch.zeros((batch, d_in, n), device=deltaA.device)
        step_outputs = []
        for t in range(seq_len):
            state = deltaA[:, :, t] * state + deltaB_u[:, :, t]
            step_outputs.append(einsum(state, C[:, t, :], 'b d_in n, b n -> b d_in'))
        y = torch.stack(step_outputs, dim=1)  # (b, l, d_in)

        # Skip term D * u.
        return y + u * D
|
180 |
+
|
181 |
+
class MambaPreTrainedModel(PreTrainedModel):
    """Shared base class wiring the Mamba models into ``PreTrainedModel``.

    Declares the config class and base-model prefix, and provides the weight
    initialiser applied to each sub-module.
    """

    config_class = MambaConfig
    base_model_prefix = "model"
    # NOTE(review): no gradient-checkpointing code path is visible in the
    # forward passes in this file -- confirm this flag is intended.
    supports_gradient_checkpointing = True
    _no_split_modules = ["MambaBlock"]

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear, Conv1d and Embedding weights are drawn from a normal
        distribution with mean 0 and standard deviation
        ``config.initializer_range``. Biases are zeroed, as is the embedding
        row at ``padding_idx`` when one is set. Other module types are left
        untouched.
        """
        # Honour the configured value instead of a hard-coded 0.02.
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
|
197 |
+
|
198 |
+
class MambaModel(MambaPreTrainedModel):
    """Mamba backbone: token embedding, ``config.n_layer`` blocks, final RMSNorm.

    Each layer is a [`MambaBlock`].

    Args:
        config: MambaConfig
    """

    def __init__(self, config: MambaConfig):
        super().__init__(config)
        self.config = config

        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.layers = nn.ModuleList([MambaBlock(config) for _ in range(config.n_layer)])
        self.norm_f = MambaRMSNorm(config.d_model)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embedding = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Embed ``input_ids``, run every block, and apply the final norm.

        Args:
            input_ids: token ids passed to the embedding layer.
            return_dict: when ``None``, falls back to
                ``config.use_return_dict``; when false, a plain tuple is
                returned instead of a ``BaseModelOutputWithPast``.

        Returns:
            ``BaseModelOutputWithPast`` with ``last_hidden_state`` (the
            normalised output of the last block) and ``hidden_states`` (a tuple
            holding the output of each block, before the final norm), or the
            tuple ``(last_hidden_state, hidden_states)``.
        """
        # Previously this argument was accepted but ignored.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        x = self.embedding(input_ids)
        per_layer_states = []
        for layer in self.layers:
            x = layer(x)
            per_layer_states.append(x)

        hidden_states = self.norm_f(x)
        # Model outputs carry hidden states as an immutable tuple, not a list.
        all_hidden_states = tuple(per_layer_states)

        if not return_dict:
            return (hidden_states, all_hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )
|
238 |
+
class MambaForCausalLM(MambaPreTrainedModel):
    """Mamba backbone with a language-modelling head tied to the input embedding."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = MambaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        # Share one weight matrix between the input embedding and output head.
        self.lm_head.weight = self.model.embedding.weight
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embedding

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module."""
        self.model.embedding = value

    def get_output_embeddings(self):
        """Return the language-modelling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modelling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the backbone model."""
        self.model = decoder

    def get_decoder(self):
        """Return the backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Compute next-token logits and, when ``labels`` is given, the LM loss.

        Args:
            input_ids: token ids fed to the backbone.
            labels: optional target ids. Logits at position ``t`` are scored
                against the label at position ``t + 1`` with cross-entropy.
            output_attentions: accepted for interface compatibility; unused.
            output_hidden_states: accepted for interface compatibility; unused.
            return_dict: when ``None``, falls back to
                ``config.use_return_dict``; when false, a tuple is returned.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss``, ``logits`` and
            ``hidden_states``, or the tuple ``(loss?, logits, hidden_states)``
            where ``loss`` is present only if ``labels`` was given.
        """
        # Resolve the default up front. Without this, the default ``None``
        # always took the tuple branch below.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Drop the last logit and the first label so they line up.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)

            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Build the forward() inputs for a generation step.

        Only ``input_ids`` is forwarded; all other keyword arguments are
        discarded.
        """
        model_inputs = {"input_ids": input_ids}
        return model_inputs
|
307 |
+
|
308 |
+
|