paulilioaica committed on
Commit
f3fe2bd
1 Parent(s): bf41156

Create modeling_phi.py

Browse files
Files changed (1) hide show
  1. modeling_phi.py +987 -0
modeling_phi.py ADDED
@@ -0,0 +1,987 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ #
4
+ # Copyright (c) 2022, Tri Dao, [email protected].
5
+ # Licensed under the BSD 3-Clause License.
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, Optional, Tuple, Union
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from einops import rearrange, repeat
16
+ from transformers import PretrainedConfig, PreTrainedModel
17
+ from transformers.activations import ACT2FN
18
+ from transformers.modeling_outputs import CausalLMOutputWithPast
19
+
20
+ from .configuration_phi import PhiConfig
21
+
22
# flash-attn is an optional accelerator. When it cannot be imported, every symbol below
# is set to `None` so that callers can detect its absence and fall back to the PyTorch
# implementations defined in this file.
try:
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
    from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
    from flash_attn.ops.fused_dense import FusedDense
except Exception:
    # `Exception` (instead of a bare `except:`) keeps the best-effort fallback for any
    # import-time failure while letting `KeyboardInterrupt` and `SystemExit` propagate.
    pad_input, unpad_input = None, None
    FlashRotaryEmbedding = None
    FlashSelfAttention, FlashCrossAttention = None, None
    FusedDense = None
32
+
33
+
34
@dataclass
class InferenceParams:
    """Inference parameters passed to the model to efficiently calculate
    and store context during inference.

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py.

    Args:
        max_seqlen: Maximum sequence length.
        max_batch_size: Maximum batch size.
        seqlen_offset: Sequence length offset.
        batch_size_offset: Batch size offset.
        key_value_memory_dict: Key value memory dictionary.
        lengths_per_sample: Lengths per sample.

    """

    max_seqlen: int = field(metadata={"help": "Maximum sequence length."})

    max_batch_size: int = field(metadata={"help": "Maximum batch size."})

    seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."})

    batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."})

    # Keyed by the integer layer index (see `_update_kv_cache`); `default_factory` gives
    # every instance its own dictionary instead of a shared mutable default.
    key_value_memory_dict: Dict[int, Any] = field(
        default_factory=dict, metadata={"help": "Key value memory dictionary."}
    )

    # Defaults to `None`, hence `Optional`.
    lengths_per_sample: Optional[torch.Tensor] = field(default=None, metadata={"help": "Lengths per sample."})
65
+
66
+
67
class Embedding(nn.Module):
    """Token-embedding lookup followed by dropout."""

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()

        # Attribute names are part of the parameter names, so they are kept as-is
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)

    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        """Embed `input_ids` after folding all leading dimensions into one.

        Args:
            input_ids: Token indices; reshaped to `(-1, last_dim)` before the lookup.

        Returns:
            Embedded tokens with dropout applied.

        """

        flat_ids = input_ids.view(-1, input_ids.size(-1))

        return self.drop(self.wte(flat_ids))
84
+
85
+
86
def _apply_rotary_emb(
    x: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
) -> torch.FloatTensor:
    """Rotate the leading `2 * cos.shape[1]` features of a 4-D tensor.

    Args:
        x: 4-D tensor whose second dimension is the sequence length
            (presumably `(batch, seqlen, heads, head_dim)` -- confirm against callers).
        cos: 2-D cosine table; only its first `seqlen` rows are used.
        sin: 2-D sine table; only its first `seqlen` rows are used.

    Returns:
        Tensor shaped like `x` where the rotary slice has been rotated and the remaining
        features are passed through unchanged.

    """

    _, seqlen, _, _ = x.shape
    _, half_rotary_dim = cos.shape
    rotary_dim = 2 * half_rotary_dim

    rotated, untouched = x[..., :rotary_dim], x[..., rotary_dim:]
    first, second = rotated.chunk(2, dim=-1)

    # Insert a singleton axis so the tables broadcast over the third dimension of `x`
    cos_b = rearrange(cos[:seqlen], "s d -> s 1 d")
    sin_b = rearrange(sin[:seqlen], "s d -> s 1 d")

    # The rotation itself is carried out in fp32 and cast back afterwards
    first = first.to(dtype=torch.float32)
    second = second.to(dtype=torch.float32)
    cos_b = cos_b.to(dtype=torch.float32)
    sin_b = sin_b.to(dtype=torch.float32)

    rotated = torch.cat(
        [first * cos_b - second * sin_b, first * sin_b + second * cos_b],
        dim=-1,
    ).to(x.dtype)

    return torch.cat([rotated, untouched], dim=-1)
105
+
106
+
107
def _apply_rotary_emb_kv(
    kv: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
    cos_k: Optional[torch.FloatTensor] = None,
    sin_k: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Rotate the keys of a packed 5-D key/value tensor, leaving the values untouched.

    Args:
        kv: 5-D tensor whose second dimension is the sequence length and whose third
            dimension holds keys at index 0 and values at index 1.
        cos: 2-D cosine table; only its first `seqlen` rows are used.
        sin: 2-D sine table; only its first `seqlen` rows are used.
        cos_k: Accepted for interface compatibility; not used by this function.
        sin_k: Accepted for interface compatibility; not used by this function.

    Returns:
        Tensor with rotated keys at index 0 of the third dimension and the original
        values at index 1.

    """

    _, seqlen, _, _, _ = kv.shape
    _, half_rotary_dim = cos.shape
    rotary_dim = 2 * half_rotary_dim

    keys = kv[:, :, 0]
    values = kv[:, :, 1:2]

    key_first, key_second = keys[..., :rotary_dim].chunk(2, dim=-1)
    key_untouched = keys[..., rotary_dim:]

    # Insert a singleton axis so the tables broadcast over the heads dimension
    cos_b = rearrange(cos[:seqlen], "s d -> s 1 d")
    sin_b = rearrange(sin[:seqlen], "s d -> s 1 d")

    # The rotation itself is carried out in fp32 and cast back afterwards
    key_first = key_first.to(dtype=torch.float32)
    key_second = key_second.to(dtype=torch.float32)
    cos_b = cos_b.to(dtype=torch.float32)
    sin_b = sin_b.to(dtype=torch.float32)

    key_rotated = torch.cat(
        [key_first * cos_b - key_second * sin_b, key_first * sin_b + key_second * cos_b],
        dim=-1,
    ).to(kv.dtype)
    new_keys = torch.cat([key_rotated, key_untouched], dim=-1)

    return torch.cat([new_keys.unsqueeze(2), values], dim=2)
134
+
135
+
136
def _apply_rotary_emb_qkv(
    qkv: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
    cos_k: Optional[torch.FloatTensor] = None,
    sin_k: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Rotate the queries and keys of a packed 5-D query/key/value tensor.

    Args:
        qkv: 5-D tensor whose second dimension is the sequence length and whose third
            dimension holds queries, keys and values at indices 0, 1 and 2.
        cos: 2-D cosine table; only its first `seqlen` rows are used.
        sin: 2-D sine table; only its first `seqlen` rows are used.
        cos_k: Accepted for interface compatibility; not used by this function.
        sin_k: Accepted for interface compatibility; not used by this function.

    Returns:
        Tensor with rotated queries and keys and the original values, stacked along
        the third dimension in the same order as the input.

    """

    _, seqlen, _, _, _ = qkv.shape
    _, half_rotary_dim = cos.shape
    rotary_dim = 2 * half_rotary_dim

    queries = qkv[:, :, 0]
    keys = qkv[:, :, 1]
    values = qkv[:, :, 2:3]

    query_first, query_second = queries[..., :rotary_dim].chunk(2, dim=-1)
    key_first, key_second = keys[..., :rotary_dim].chunk(2, dim=-1)

    # Insert a singleton axis so the tables broadcast over the heads dimension
    cos_b = rearrange(cos[:seqlen], "s d -> s 1 d")
    sin_b = rearrange(sin[:seqlen], "s d -> s 1 d")

    # The rotation itself is carried out in fp32 and cast back afterwards
    query_first, query_second, key_first, key_second, cos_b, sin_b = (
        t.to(dtype=torch.float32) for t in (query_first, query_second, key_first, key_second, cos_b, sin_b)
    )

    query_rotated = torch.cat(
        [query_first * cos_b - query_second * sin_b, query_first * sin_b + query_second * cos_b],
        dim=-1,
    ).to(qkv.dtype)
    key_rotated = torch.cat(
        [key_first * cos_b - key_second * sin_b, key_first * sin_b + key_second * cos_b],
        dim=-1,
    ).to(qkv.dtype)

    new_queries = torch.cat([query_rotated, queries[..., rotary_dim:]], dim=-1)
    new_keys = torch.cat([key_rotated, keys[..., rotary_dim:]], dim=-1)

    return torch.cat([new_queries.unsqueeze(2), new_keys.unsqueeze(2), values], dim=2)
169
+
170
+
171
class RotaryEmbedding(nn.Module):
    """Rotary positional embedding (RoPE) backed by cached cosine/sine tables.

    Reference:
        RoFormer: Enhanced Transformer with Rotary Position Embedding.
        https://arxiv.org/pdf/2104.09864.pdf.

    """

    def __init__(
        self,
        dim: int,
        base: int = 10000,
        scale_base: Optional[float] = None,
        pos_idx_in_fp32: bool = True,
        max_position_embeddings: int = 2048,
        device: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Build the inverse-frequency buffer and pre-compute the cosine/sine cache.

        Args:
            dim: Number of features to rotate.
            base: Base of the geometric progression used for the frequencies.
            scale_base: Must be `None`; any other value raises `NotImplementedError`.
            pos_idx_in_fp32: Whether position indices are generated in fp32.
            max_position_embeddings: Number of positions pre-computed at construction.
            device: Device on which buffers and caches are created.

        Raises:
            NotImplementedError: If `scale_base` is not `None`.

        """

        super().__init__()

        if scale_base is not None:
            raise NotImplementedError

        self.dim = dim
        self.base = float(base)
        self.scale_base = scale_base
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        self.max_position_embeddings = max_position_embeddings
        self.device = device

        # Non-trainable, non-persistent inverse frequency buffer
        self.register_buffer("inv_freq", self._compute_inv_freq(device), persistent=False)

        # Non-trainable, non-persistent scale buffer (`None` unless `scale_base` is given)
        scale = None
        if scale_base is not None:
            scale = (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
        self.register_buffer("scale", scale, persistent=False)

        # The caches are created eagerly since ONNX can't rely on dynamic initialization
        self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)

    def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
        """Return `1 / base ** (i / dim)` for every even `i` in `[0, dim)`."""

        exponents = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim

        return 1.0 / (self.base**exponents)

    def _update_cos_sin_cache(
        self,
        seqlen: int,
        device: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Recompute the cached cosine/sine tables for `seqlen` positions."""

        self._seq_len_cached = seqlen

        inv_freq = self.inv_freq
        if self.pos_idx_in_fp32:
            # fp32 positions avoid the precision loss that large indices suffer in bf16
            positions = torch.arange(seqlen, device=device, dtype=torch.float32)
            if inv_freq.dtype != torch.float32:
                inv_freq = self._compute_inv_freq(device=device)
        else:
            positions = torch.arange(seqlen, device=device, dtype=inv_freq.dtype)

        # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
        freqs = torch.outer(positions, inv_freq)
        cos, sin = torch.cos(freqs), torch.sin(freqs)

        if self.scale is None:
            self._cos_cached = cos.to(dtype)
            self._sin_cached = sin.to(dtype)
            return

        power = (
            torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
        ) / self.scale_base
        scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")

        # The scale multiplication happens before the cast to `dtype`
        self._cos_cached = (cos * scale).to(dtype)
        self._sin_cached = (sin * scale).to(dtype)
        self._cos_k_cached = (cos / scale).to(dtype)
        self._sin_k_cached = (sin / scale).to(dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        kv: Optional[torch.Tensor] = None,
        seqlen_offset: int = 0,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply the rotary embedding to packed `qkv`, or to a query plus packed `kv`.

        Args:
            qkv: Packed query/key/value tensor, or the query alone when `kv` is given.
            kv: Optional packed key/value tensor.
            seqlen_offset: Number of leading cached positions to skip.

        Returns:
            The rotated `qkv` when `kv` is `None`, otherwise a `(q, kv)` tuple.

        """

        required_len = qkv.shape[1] + seqlen_offset
        cache_is_stale = (
            self._seq_len_cached < required_len
            or self._cos_cached.device != qkv.device
            or self._cos_cached.dtype != qkv.dtype
            or (self.training and self._cos_cached.is_inference())
        )
        if cache_is_stale:
            self._update_cos_sin_cache(required_len, device=qkv.device, dtype=qkv.dtype)

        cos = self._cos_cached[seqlen_offset:]
        sin = self._sin_cached[seqlen_offset:]

        if kv is None:
            return _apply_rotary_emb_qkv(qkv, cos, sin)

        q = _apply_rotary_emb(qkv, cos, sin)
        kv = _apply_rotary_emb_kv(kv, cos, sin)

        return q, kv
291
+
292
+
293
class MoE(nn.Module):
    """Token-level mixture of experts.

    A bias-free linear gate scores every expert for every token. The `num_experts_per_tok`
    best-scoring experts are evaluated for that token and their outputs are combined with
    softmax-normalized gate scores.

    """

    def __init__(
        self,
        config: PretrainedConfig,
    ):
        super().__init__()
        self.mlp = nn.ModuleList([MLP(config) for _ in range(config.num_local_experts)])
        self.gate = nn.Linear(config.n_embd, config.num_local_experts, bias=False)
        self.num_experts_per_tok = config.num_experts_per_tok

    def forward(self, x):
        """Route every token through its top-scoring experts.

        Args:
            x: Input tensor whose last dimension is the feature dimension.

        Returns:
            Tensor with the same shape as `x`.

        """

        orig_shape = x.shape
        # `reshape` (unlike `view`) also accepts non-contiguous inputs
        x = x.reshape(-1, x.shape[-1])

        scores = self.gate(x)
        expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_tok, dim=-1)
        # The softmax is taken over the selected experts only
        expert_weights = expert_weights.softmax(dim=-1)
        flat_expert_indices = expert_indices.view(-1)

        # Each token is duplicated once per selected expert so rows align with `flat_expert_indices`
        x = x.repeat_interleave(self.num_experts_per_tok, dim=0)
        y = torch.empty_like(x)
        for i, expert in enumerate(self.mlp):
            # The selection mask is computed once and reused for both the gather and the scatter.
            # Every expert is still called, even when no row selects it.
            selected = flat_expert_indices == i
            y[selected] = expert(x[selected])
        y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
        return y.view(*orig_shape)
318
+
319
+
320
class MLP(nn.Module):
    """Two-layer feed-forward network with a configurable activation.

    Reference:
        Attention Is All You Need.
        https://arxiv.org/pdf/1706.03762.pdf.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        n_inner: Optional[int] = None,
        act_fn: Optional[str] = None,
    ) -> None:
        """Create the two linear layers and look up the activation.

        Args:
            config: Model configuration providing `n_embd`, `activation_function`
                and optionally `n_inner`.
            n_inner: Hidden width; falls back to `config.n_inner`, then to `4 * config.n_embd`.
            act_fn: Key into `ACT2FN`; falls back to `config.activation_function`.

        """

        super().__init__()

        if act_fn is None:
            act_fn = config.activation_function

        if n_inner is None:
            n_inner = getattr(config, "n_inner", None)
        if n_inner is None:
            n_inner = 4 * config.n_embd

        self.fc1 = nn.Linear(config.n_embd, n_inner)
        self.fc2 = nn.Linear(n_inner, config.n_embd)
        self.act = ACT2FN[act_fn]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Apply `fc1`, the activation and `fc2` in sequence."""

        return self.fc2(self.act(self.fc1(hidden_states)))
352
+
353
+
354
class SelfAttention(nn.Module):
    """Softmax self-attention over a packed query/key/value tensor (compatible with PyTorch).

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.

    """

    def __init__(
        self,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        attention_dropout: float = 0.0,
    ) -> None:
        """Store the default masking mode, the score scale and the dropout module.

        Args:
            causal: Default for whether future positions are masked.
            softmax_scale: Score scale; when falsy, `1 / sqrt(head_dim)` is used.
            attention_dropout: Dropout probability applied to the attention weights.

        """

        super().__init__()

        self.causal = causal
        self.softmax_scale = softmax_scale
        self.drop = nn.Dropout(attention_dropout)

    @torch.autocast("cpu", enabled=False)
    @torch.autocast("cuda", enabled=False)
    def forward(
        self,
        qkv: torch.FloatTensor,
        causal: bool = None,
        key_padding_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Compute attention for queries, keys and values packed along dimension 2.

        Args:
            qkv: Packed tensor; dimension 0 is the batch, dimension 1 the sequence and
                dimension 2 holds queries, keys and values.
            causal: Overrides the module-level `causal` flag when not `None`.
            key_padding_mask: Boolean mask whose `True` entries mark keys that stay visible.

        Returns:
            Attention output laid out as `(batch, seqlen, heads, head_dim)`.

        """

        batch_size, seqlen = qkv.shape[0], qkv.shape[1]
        query, key, value = qkv.unbind(dim=2)

        query = query.to(torch.float32)
        key = key.to(torch.float32)

        if causal is None:
            causal = self.causal
        scale = self.softmax_scale or 1.0 / math.sqrt(query.shape[-1])

        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
        # using float16, which might lead to overflow
        scores = torch.einsum("bthd,bshd->bhts", query, key * scale)

        if key_padding_mask is not None:
            # Additive mask: 0 where the key is kept, -10000 where it is padded
            additive_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
            additive_mask.masked_fill_(key_padding_mask, 0.0)

            scores = scores + rearrange(additive_mask, "b s -> b 1 1 s")

        if causal:
            # Strictly upper-triangular penalty hides every future position
            future_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
            scores = scores + future_mask.to(dtype=scores.dtype)

        weights = self.drop(torch.softmax(scores, dim=-1).to(value.dtype))

        return torch.einsum("bhts,bshd->bthd", weights, value)
412
+
413
+
414
class CrossAttention(nn.Module):
    """Softmax attention between a query tensor and a packed key/value tensor (compatible with PyTorch).

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.

    """

    def __init__(
        self,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        attention_dropout: float = 0.0,
    ) -> None:
        """Store the default masking mode, the score scale and the dropout module.

        Args:
            causal: Default for whether future positions are masked.
            softmax_scale: Score scale; when falsy, `1 / sqrt(head_dim)` is used.
            attention_dropout: Dropout probability applied to the attention weights.

        """

        super().__init__()

        self.causal = causal
        self.softmax_scale = softmax_scale
        self.drop = nn.Dropout(attention_dropout)

    @torch.autocast("cpu", enabled=False)
    @torch.autocast("cuda", enabled=False)
    def forward(
        self,
        q: torch.FloatTensor,
        kv: torch.FloatTensor,
        causal: bool = None,
        key_padding_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Attend from `q` to the keys and values packed in `kv`.

        Args:
            q: Query tensor; dimension 1 is the query length and dimension 2 the heads.
            kv: Packed tensor; dimension 1 is the key length, dimension 2 holds keys and
                values, and dimension 3 the key/value heads.
            causal: Overrides the module-level `causal` flag when not `None`.
            key_padding_mask: Boolean mask whose `True` entries mark keys that stay visible.

        Returns:
            Attention output laid out as `(batch, seqlen_q, heads, head_dim)`.

        """

        batch_size, seqlen_q = q.shape[0], q.shape[1]
        seqlen_k = kv.shape[1]

        # With fewer key/value heads than query heads, each key/value head is repeated
        if kv.shape[3] != q.shape[2]:
            kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
        key, value = kv.unbind(dim=2)

        q = q.to(torch.float32)
        key = key.to(torch.float32)

        if causal is None:
            causal = self.causal
        scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])

        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
        # using float16, which might lead to overflow
        scores = torch.einsum("bthd,bshd->bhts", q, key * scale)

        if key_padding_mask is not None:
            # Additive mask: 0 where the key is kept, -10000 where it is padded
            additive_mask = torch.full(
                (batch_size, seqlen_k),
                -10000.0,
                dtype=scores.dtype,
                device=scores.device,
            )
            additive_mask.masked_fill_(key_padding_mask, 0.0)

            scores = scores + rearrange(additive_mask, "b s -> b 1 1 s")

        if causal:
            # Query row `i` may see key columns up to `i + seqlen_k - seqlen_q`
            query_idx = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1")
            key_idx = torch.arange(seqlen_k, device=key.device, dtype=torch.long)
            future_mask = key_idx > query_idx + seqlen_k - seqlen_q

            scores = scores.masked_fill(future_mask, -10000.0)

        weights = self.drop(torch.softmax(scores, dim=-1).to(value.dtype))

        return torch.einsum("bhts,bshd->bthd", weights, value)
485
+
486
+
487
def _find_mha_dims(
    config: PretrainedConfig,
    n_head: Optional[int] = None,
    n_head_kv: Optional[int] = None,
    head_dim: Optional[int] = None,
) -> Tuple[int, int, int]:
    """Resolve the attention head layout from explicit arguments and `config`.

    Args:
        config: Configuration providing `n_embd`, `n_head` and optionally `n_head_kv`.
        n_head: Number of query heads; must be given together with `head_dim`.
        n_head_kv: Number of key/value heads; falls back to `config.n_head_kv` and,
            when that is missing or falsy, to the resolved `n_head`.
        head_dim: Size of each head; must be given together with `n_head`.

    Returns:
        The `(n_head, n_head_kv, head_dim)` triple.

    Raises:
        ValueError: If exactly one of `n_head` and `head_dim` is specified.

    """

    if n_head is None and head_dim is None:
        # Neither was given: derive both from the configuration
        head_dim = config.n_embd // config.n_head
        n_head = config.n_head
    elif n_head is None or head_dim is None:
        raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")

    if n_head_kv is None:
        n_head_kv = getattr(config, "n_head_kv", None) or n_head

    return n_head, n_head_kv, head_dim
503
+
504
+
505
def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor:
    """Write `kv` into the per-layer cache and return the cached keys/values so far.

    The cache for `layer_idx` is created lazily with shape
    `(max_batch_size, max_seqlen, 2, num_heads, head_dim)` and grows along the sequence
    dimension whenever the incoming chunk would not fit.

    Args:
        kv: New keys/values; dimension 0 is the batch, dimension 1 the sequence and the
            last two dimensions are `(num_heads, head_dim)`.
        inference_params: Holder of the cache dictionary and of the batch/sequence offsets.
        layer_idx: Key identifying this layer's cache entry.

    Returns:
        Slice of the cache covering the current batch rows and positions `[0, sequence_end)`.

    """

    num_heads, head_dim = kv.shape[-2:]
    memory = inference_params.key_value_memory_dict

    if layer_idx not in memory:
        memory[layer_idx] = torch.empty(
            inference_params.max_batch_size,
            inference_params.max_seqlen,
            2,
            num_heads,
            head_dim,
            dtype=kv.dtype,
            device=kv.device,
        )

    batch_start = inference_params.batch_size_offset
    batch_end = batch_start + kv.shape[0]

    sequence_start = inference_params.seqlen_offset
    sequence_end = sequence_start + kv.shape[1]

    cache = memory[layer_idx]
    cached_len = cache.shape[1]

    # Grow only when the chunk does not fit in the actual cache length. The extension is
    # allocated with the cache's own batch size, so growth also works when the current
    # batch is smaller than `max_batch_size`.
    if sequence_end > cached_len:
        extension = torch.empty(
            cache.shape[0],
            sequence_end - cached_len,
            *cache.shape[2:],
            dtype=cache.dtype,
            device=cache.device,
        )
        cache = torch.cat((cache, extension), dim=1)
        memory[layer_idx] = cache

    cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv

    return cache[batch_start:batch_end, :sequence_end, ...]
534
+
535
+
536
class MHA(nn.Module):
    """Multi-head attention layer.

    Projects the input with a fused `Wqkv` linear layer, optionally applies rotary
    embeddings, runs either the self-attention or the cross-attention inner module and
    projects the result with `out_proj`. The flash-attn classes are used only when the
    matching `config` flag is set and the optional import succeeded; otherwise the
    PyTorch classes defined in this file are used.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        dtype: Optional[torch.dtype] = None,
        device: Optional[str] = None,
        rotary_dim: Optional[int] = None,
        rotary_base: float = 10000.0,
        rotary_scale_base: Optional[float] = None,
        n_head: Optional[int] = None,
        n_head_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        bias: bool = True,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        layer_idx: Optional[int] = None,
        return_residual: bool = False,
        checkpointing: bool = False,
    ) -> None:
        """Build the rotary embedding, the projections and the inner attention modules.

        Args:
            config: Model configuration; read for `rotary_dim`, `flash_rotary`,
                `n_positions`, `n_embd`, `fused_dense`, `flash_attn` and `attn_pdrop`.
            dtype: Data type forwarded to the linear layers.
            device: Device forwarded to the rotary embedding and the linear layers.
            rotary_dim: Rotary feature count; falls back to `config.rotary_dim`, then 0.
            rotary_base: Base forwarded to the rotary embedding.
            rotary_scale_base: Scale base forwarded to the rotary embedding.
            n_head: Number of query heads (see `_find_mha_dims`).
            n_head_kv: Number of key/value heads (see `_find_mha_dims`).
            head_dim: Size of each head (see `_find_mha_dims`).
            bias: Whether the linear layers use a bias.
            causal: Default causal flag of the inner attention modules.
            softmax_scale: Score scale forwarded to the inner attention modules.
            layer_idx: Key under which this layer stores its key/value cache.
            return_residual: Whether `forward` also returns its input.
            checkpointing: Whether the inner attention is wrapped in activation checkpointing.

        """

        super().__init__()

        # Rotary embedding
        self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
        if self.rotary_dim > 0:
            # `FlashRotaryEmbedding` is `None` when flash-attn could not be imported
            rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
            if rotary_cls is None:
                rotary_cls = RotaryEmbedding

            # Only the local implementation receives `max_position_embeddings`
            rotary_kwargs = {}
            if rotary_cls is RotaryEmbedding:
                rotary_kwargs["max_position_embeddings"] = config.n_positions

            self.rotary_emb = rotary_cls(
                self.rotary_dim,
                base=rotary_base,
                scale_base=rotary_scale_base,
                device=device,
                **rotary_kwargs,
            )

        # Head layout and projections (fused QKV input projection and output projection)
        self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
            config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim
        )
        # Queries use `n_head` heads; keys and values use `n_head_kv` heads each
        op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
        hidden_size = config.n_embd

        # `FusedDense` is `None` when flash-attn could not be imported
        linear_cls = FusedDense if config.fused_dense else nn.Linear
        if linear_cls is None:
            linear_cls = nn.Linear

        self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype)
        self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)

        # Attention
        attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
        if attn_cls is None:
            attn_cls = SelfAttention

        cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
        if cross_attn_cls is None:
            cross_attn_cls = CrossAttention

        self.inner_attn = attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=config.attn_pdrop,
        )
        self.inner_cross_attn = cross_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=config.attn_pdrop,
        )

        # True only when flash-attn was requested and is actually in use
        self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
        self.layer_idx = layer_idx
        self.return_residual = return_residual
        self.checkpointing = checkpointing

    def _forward_self_attn(
        self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]
    ) -> torch.FloatTensor:
        """Run packed-QKV self-attention on `x` (used when no cache is supplied).

        Args:
            x: Input hidden states.
            key_padding_mask: Optional boolean padding mask.

        Returns:
            Output of the inner self-attention module, with heads still split.

        """

        qkv = self.Wqkv(x)
        # Split the fused projection into (three, heads, head_dim)
        qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)

        if self.rotary_dim > 0:
            qkv = self.rotary_emb(qkv)

        if self.flash_attn:
            batch_size, seqlen = qkv.shape[0], qkv.shape[1]

            cu_seqlens, max_seqlen = None, None
            if key_padding_mask is not None:
                # If `key_padding_mask` is supplied, we need to unpad the input and retrieve
                # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
                qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)

            if self.checkpointing:
                attn_output = torch.utils.checkpoint.checkpoint(
                    self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
                )
            else:
                attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)

            # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
            return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output

        if self.checkpointing:
            return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)

        return self.inner_attn(qkv, key_padding_mask=key_padding_mask)

    def _forward_cross_attn(
        self,
        x: torch.FloatTensor,
        past_key_values: Optional[InferenceParams],
        key_padding_mask: Optional[torch.BoolTensor],
    ) -> torch.FloatTensor:
        """Run attention with separate `q` and `kv`, optionally reading/writing the cache.

        Args:
            x: Input hidden states.
            past_key_values: Cache holder; when given, new keys/values are written to it
                and the cached keys/values are attended to.
            key_padding_mask: Optional boolean padding mask.

        Returns:
            Output of the inner cross-attention module, with heads still split.

        """

        batch_size = x.shape[0]

        qkv = self.Wqkv(x)

        # The first `n_head * head_dim` features are the queries
        q = qkv[..., : self.n_head * self.head_dim]
        q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)

        # The remaining features are the keys and values
        kv = qkv[..., self.n_head * self.head_dim :]
        kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)

        seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0
        # `None` defers to the inner module's default; with a non-zero offset causal
        # masking is explicitly disabled
        causal = None if seqlen_offset == 0 else False
        if self.rotary_dim > 0:
            q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset)

        if past_key_values is not None:
            kv = _update_kv_cache(kv, past_key_values, self.layer_idx)

        if self.flash_attn:
            batch_size, seqlen_q = q.shape[0], q.shape[1]
            seqlen_k = kv.shape[1]

            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
                None,
                None,
                None,
                None,
            )
            if key_padding_mask is not None:
                kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)

                # The query-side mask is rebuilt to match the query length: all ones for a
                # single query token, otherwise the last `seqlen_q` columns of the key mask
                if seqlen_q == 1:
                    key_padding_mask = torch.ones(batch_size, 1, device=q.device)
                elif seqlen_q != seqlen_k:
                    key_padding_mask = key_padding_mask[:, -seqlen_q:]

                q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)

            if self.checkpointing:
                attn_output = torch.utils.checkpoint.checkpoint(
                    self.inner_cross_attn,
                    q,
                    kv,
                    causal=causal,
                    cu_seqlens=cu_seqlens_q,
                    max_seqlen=max_seqlen_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_k=max_seqlen_k,
                )
            else:
                attn_output = self.inner_cross_attn(
                    q,
                    kv,
                    causal=causal,
                    cu_seqlens=cu_seqlens_q,
                    max_seqlen=max_seqlen_q,
                    cu_seqlens_k=cu_seqlens_k,
                    max_seqlen_k=max_seqlen_k,
                )

            # Pad the output back when the query was unpadded above
            return (
                pad_input(attn_output, indices_q, batch_size, max_seqlen_q)
                if key_padding_mask is not None
                else attn_output
            )

        if self.checkpointing:
            return torch.utils.checkpoint.checkpoint(
                self.inner_cross_attn,
                q,
                kv,
                key_padding_mask=key_padding_mask,
                causal=causal,
            )

        return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)

    def forward(
        self,
        x: torch.FloatTensor,
        past_key_values: Optional[InferenceParams] = None,
        attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Apply attention to `x` and project the result.

        Args:
            x: Input hidden states.
            past_key_values: Optional cache holder; selects the cached (cross-attention) path.
            attention_mask: Optional padding mask; converted to boolean before use.

        Returns:
            The projected attention output, or an `(output, x)` tuple when
            `return_residual` is set.

        """

        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        else:
            attention_mask = None

        # MHA
        if self.n_head == self.n_head_kv:
            if past_key_values is None:
                # If `past_key_values` are not supplied, we run self-attention
                attn_output = self._forward_self_attn(x, attention_mask)
            else:
                # If `past_key_values` are supplied, it means that we might have cached values and
                # could take advantage of cross-attention
                attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
        # MQA / GQA
        else:
            # Regardless of `past_key_values` being supplied or not, it always use cross-attention
            # because `q` and `kv` lengths might be different
            attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)

        # Merge the heads back into a single feature dimension before the output projection
        output = rearrange(attn_output, "... h d -> ... (h d)")
        output = self.out_proj(output)

        return output if not self.return_residual else (output, x)
764
+
765
+
766
class ParallelBlock(nn.Module):
    """Parallel block.

    The attention mixer and the mixture-of-experts branch both read the same
    layer-normalized input; their outputs are summed together with the residual.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        block_idx: Optional[int] = None,
    ) -> None:
        """Create the layer norm, dropout, attention mixer and mixture-of-experts modules.

        Args:
            config: Model configuration.
            block_idx: Index of this block; forwarded to the mixer as its `layer_idx`.

        """

        super().__init__()

        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.block_idx = block_idx

        self.mixer = MHA(config, layer_idx=block_idx)
        self.moe = MoE(config)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Return `dropout(mixer(ln(x))) + dropout(moe(ln(x))) + x`.

        Args:
            hidden_states: Block input, also used as the residual.
            past_key_values: Forwarded to the mixer.
            attention_mask: Forwarded to the mixer.

        Returns:
            The block output.

        """

        normed = self.ln(hidden_states)

        mixed = self.mixer(
            normed,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
        )
        # The mixer may return a tuple; only its first element is the attention output
        if isinstance(mixed, tuple):
            mixed = mixed[0]

        attn_branch = self.resid_dropout(mixed)
        moe_branch = self.resid_dropout(self.moe(normed))

        return attn_branch + moe_branch + hidden_states
811
+
812
+
813
class CausalLMHead(nn.Module):
    """Causal Language Modeling head: layer norm followed by a vocabulary projection.

    Reference:
        Improving Language Understanding by Generative Pre-Training.
        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.

    """

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()

        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.linear = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Normalize `hidden_states`, project them to the vocabulary and return fp32 logits."""

        projected = self.linear(self.ln(hidden_states))

        return projected.to(torch.float32)
833
+
834
+
835
class CausalLMLoss(nn.Module):
    """Causal Language Modeling loss: cross-entropy between logits and (optionally shifted) labels.

    Reference:
        Improving Language Understanding by Generative Pre-Training.
        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.

    """

    def __init__(self, shift_labels: bool = True) -> None:
        super().__init__()

        self.shift_labels = shift_labels
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
        """Compute the cross-entropy loss.

        Args:
            logits: Prediction scores whose last dimension is the vocabulary.
            labels: Target token indices.

        Returns:
            The scalar loss.

        """

        if self.shift_labels:
            # Position `t` predicts token `t + 1`: drop the last logit and the first label
            logits = logits[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()

        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.view(-1)

        return self.loss_fct(flat_logits, flat_labels)
858
+
859
+
860
class PhiPreTrainedModel(PreTrainedModel):
    """Base class of the Phi models: weight initialization and generation input handling."""

    config_class = PhiConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = False
    _no_split_modules = ["ParallelBlock"]

    def __init__(self, *inputs, **kwargs) -> None:
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module) -> None:
        """Initialize linear, embedding and layer-norm modules; other modules are left untouched."""

        std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            if module.bias is not None:
                module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Assemble the keyword arguments for one generation step.

        Args:
            input_ids: All token indices generated so far.
            past_key_values: Cache holder from the previous step, if any.
            attention_mask: Optional attention mask, passed through unchanged.

        Returns:
            Dictionary with `input_ids`, `past_key_values` and `attention_mask`.

        """

        if isinstance(past_key_values, InferenceParams):
            # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids`
            past_key_values.seqlen_offset = input_ids.shape[1] - 1
            input_ids = input_ids[:, -1].unsqueeze(-1)
        else:
            # No usable cache yet: start a fresh one and feed the whole prompt
            past_key_values = InferenceParams(
                max_seqlen=self.config.n_positions,
                max_batch_size=input_ids.shape[0],
                seqlen_offset=0,
                batch_size_offset=0,
                key_value_memory_dict={},
                lengths_per_sample=None,
            )

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "attention_mask": attention_mask,
        }
911
+
912
+
913
class PhiModel(PhiPreTrainedModel):
    """Phi backbone: token embedding followed by a stack of parallel blocks."""

    _keys_to_ignore_on_load_missing = [""]
    _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]

    def __init__(self, config: PhiConfig) -> None:
        super().__init__(config)

        self.embd = Embedding(config)
        blocks = [ParallelBlock(config, block_idx=idx) for idx in range(config.n_layer)]
        self.h = nn.ModuleList(blocks)
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embd.wte

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        self.embd.wte = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
    ) -> torch.FloatTensor:
        """Embed `input_ids` and run them through every block in order.

        Args:
            input_ids: Token indices.
            past_key_values: Forwarded to every block.
            attention_mask: Forwarded to every block.

        Returns:
            Hidden states produced by the last block.

        """

        hidden_states = self.embd(input_ids)

        for block in self.h:
            hidden_states = block(
                hidden_states,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
            )

        return hidden_states
949
+
950
+
951
class PhiForCausalLM(PhiPreTrainedModel):
    """Phi backbone with a language-modeling head and a causal language-modeling loss."""

    _keys_to_ignore_on_load_missing = [""]
    _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]

    def __init__(self, config: PhiConfig) -> None:
        super().__init__(config)

        self.transformer = PhiModel(config)
        self.lm_head = CausalLMHead(config)
        self.loss = CausalLMLoss()

        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.lm_head.linear

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.lm_head.linear = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Compute the logits and, when `labels` are given, the loss.

        Args:
            input_ids: Token indices.
            past_key_values: Cache holder forwarded to the backbone and returned unchanged.
            attention_mask: Forwarded to the backbone.
            labels: Optional target token indices.

        Returns:
            Output holding `loss` (or `None`), `logits` and `past_key_values`.

        """

        backbone_out = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask)
        lm_logits = self.lm_head(backbone_out)

        loss = self.loss(lm_logits, labels) if labels is not None else None

        return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)