add kv cache

Browse files

Files changed (5) hide show

attention.py +57 -16
blocks.py +3 -2
config.json +2 -1
generation_config.json +1 -1
modeling_mpt.py +23 -5

attention.py CHANGED Viewed

@@ -18,6 +18,7 @@ class PastKeyValue(NamedTuple):
 class AttnFnOutput(NamedTuple):
     attns: torch.Tensor
     attn_probs: Optional[torch.Tensor]
 class AttnFn(Protocol):
     def __call__(
@@ -81,6 +82,7 @@ def scaled_multihead_dot_product_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
@@ -91,23 +93,41 @@ def scaled_multihead_dot_product_attention(
     multiquery = False,
 ) -> AttnFnOutput:
     q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
-    k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
-    v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
-    min_val = torch.finfo(q.dtype).min
     (b, _, s_q, d) = q.shape
     s_k = k.size(-1)
     if softmax_scale is None:
         softmax_scale = 1 / math.sqrt(d)
     attn_weight = q.matmul(k) * softmax_scale
     if attn_bias is not None:
         if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
             raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
         attn_weight = attn_weight + attn_bias
     if key_padding_mask is not None:
         if attn_bias is not None:
             warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
         attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
-    if is_causal:
         s = max(s_q, s_k)
         causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
         causal_mask = causal_mask.tril()
@@ -121,8 +141,8 @@ def scaled_multihead_dot_product_attention(
     out = attn_weight.matmul(v)
     out = rearrange(out, 'b h s d -> b s (h d)')
     if needs_weights:
-        return AttnFnOutput(out, attn_weight)
-    return AttnFnOutput(out, None)
 def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
     for tensor in tensors:
@@ -136,6 +156,7 @@ def flash_attn_fn(
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
@@ -150,6 +171,18 @@ def flash_attn_fn(
     except:
         raise RuntimeError('Please install flash-attn==1.0.3.post0')
     check_valid_inputs(query, key, value)
     if attn_bias is not None:
         raise NotImplementedError(f'attn_bias not implemented for flash attn.')
     (batch_size, seqlen) = query.shape[:2]
@@ -169,13 +202,14 @@ def flash_attn_fn(
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
     output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
     output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
-    return AttnFnOutput(output, None)
 def triton_flash_attn_fn(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
@@ -198,6 +232,18 @@ def triton_flash_attn_fn(
         if not _installed:
             raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
     check_valid_inputs(query, key, value)
     if dropout_p:
         raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
     if needs_weights:
@@ -217,7 +263,7 @@ def triton_flash_attn_fn(
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
     attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
     output = attn_output.view(*attn_output.shape[:2], -1)
-    return AttnFnOutput(output, None)
 class MultiheadAttention(nn.Module, Attn):
     """Multi-head self attention.
@@ -278,13 +324,6 @@ class MultiheadAttention(nn.Module, Attn):
             dtype = query.dtype
             query = self.q_ln(query).to(dtype)
             key = self.k_ln(key).to(dtype)
-        if past_key_value is not None:
-            if len(past_key_value) != 0:
-                key = torch.cat([past_key_value[0], key], dim=1)
-                value = torch.cat([past_key_value[1], value], dim=1)
-            past_key_value = PastKeyValue(key, value)
-        if attn_bias is not None:
-            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
         if self.training and self.gradient_checkpointing:
             ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
             def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
@@ -337,6 +376,7 @@ class MultiheadAttention(nn.Module, Attn):
                 key,
                 value,
                 self.n_heads,
                 softmax_scale=self.softmax_scale,
                 attn_bias=attn_bias,
                 key_padding_mask=key_padding_mask,
@@ -345,7 +385,7 @@ class MultiheadAttention(nn.Module, Attn):
                 training=self.training,
                 needs_weights=needs_weights,
             )
-        context, attn_weights = attn_fn_out
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 class MultiQueryAttention(nn.Module, Attn):
@@ -465,6 +505,7 @@ class MultiQueryAttention(nn.Module, Attn):
                 key,
                 value,
                 self.n_heads,
                 softmax_scale=self.softmax_scale,
                 attn_bias=attn_bias,
                 key_padding_mask=key_padding_mask,

 class AttnFnOutput(NamedTuple):
     attns: torch.Tensor
     attn_probs: Optional[torch.Tensor]
+    past_key_value: Union[PastKeyValue, Tuple, None]
 class AttnFn(Protocol):
     def __call__(
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
+    past_key_value=None,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
     multiquery = False,
 ) -> AttnFnOutput:
     q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    kv_n_heads = 1 if multiquery else n_heads
+    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
+    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
+    if past_key_value is not None:
+        # attn_impl: flash & triton use kernels which expect input shape [b, s, h, d_head].
+        # kv_cache is therefore stored using that shape.
+        # attn_impl: torch stores the kv_cache in the layout that is most advantageous
+        # for its attention computation, i.e.
+        # keys are stored as tensors with shape [b, h, d_head, s] and
+        # values are stored as tensors with shape [b, h, s, d_head].
+        if len(past_key_value) != 0:
+            k = torch.cat([past_key_value[0], k], dim=3)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        past_key_value = (k, v)
     (b, _, s_q, d) = q.shape
     s_k = k.size(-1)
     if softmax_scale is None:
         softmax_scale = 1 / math.sqrt(d)
     attn_weight = q.matmul(k) * softmax_scale
     if attn_bias is not None:
+        # clamp to 0 necessary for torch 2.0 compile()
+        _s_q = max(0, attn_bias.size(2) - s_q)
+        _s_k = max(0, attn_bias.size(3) - s_k)
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
         if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
             raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
         attn_weight = attn_weight + attn_bias
+    min_val = torch.finfo(q.dtype).min
     if key_padding_mask is not None:
         if attn_bias is not None:
             warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
         attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
+    if is_causal and (not q.size(2) == 1):
         s = max(s_q, s_k)
         causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
         causal_mask = causal_mask.tril()
     out = attn_weight.matmul(v)
     out = rearrange(out, 'b h s d -> b s (h d)')
     if needs_weights:
+        return AttnFnOutput(out, attn_weight, past_key_value)
+    return AttnFnOutput(out, None, past_key_value)
 def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
     for tensor in tensors:
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
+    past_key_value=None,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
     except:
         raise RuntimeError('Please install flash-attn==1.0.3.post0')
     check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        # clamp to 0 necessary for torch 2.0 compile()
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
     if attn_bias is not None:
         raise NotImplementedError(f'attn_bias not implemented for flash attn.')
     (batch_size, seqlen) = query.shape[:2]
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
     output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
     output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
+    return AttnFnOutput(output, None, past_key_value)
 def triton_flash_attn_fn(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     n_heads: int,
+    past_key_value=None,
     softmax_scale: Optional[float] = None,
     attn_bias: Optional[torch.Tensor] = None,
     key_padding_mask: Optional[torch.ByteTensor] = None,
         if not _installed:
             raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
     check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        # clamp to 0 necessary for torch 2.0 compile()
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
     if dropout_p:
         raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
     if needs_weights:
     reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
     attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
     output = attn_output.view(*attn_output.shape[:2], -1)
+    return AttnFnOutput(output, None, past_key_value)
 class MultiheadAttention(nn.Module, Attn):
     """Multi-head self attention.
             dtype = query.dtype
             query = self.q_ln(query).to(dtype)
             key = self.k_ln(key).to(dtype)
         if self.training and self.gradient_checkpointing:
             ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
             def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
                 key,
                 value,
                 self.n_heads,
+                past_key_value=past_key_value,
                 softmax_scale=self.softmax_scale,
                 attn_bias=attn_bias,
                 key_padding_mask=key_padding_mask,
                 training=self.training,
                 needs_weights=needs_weights,
             )
+        context, attn_weights, past_key_value = attn_fn_out
         return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
 class MultiQueryAttention(nn.Module, Attn):
                 key,
                 value,
                 self.n_heads,
+                past_key_value=past_key_value,
                 softmax_scale=self.softmax_scale,
                 attn_bias=attn_bias,
                 key_padding_mask=key_padding_mask,

blocks.py CHANGED Viewed

@@ -7,6 +7,7 @@ from .norm import NORM_CLASS_REGISTRY
 class MPTBlockOutput(NamedTuple):
     hidden_states: torch.Tensor
     past_key_value: Union[PastKeyValue, Tuple, None]
 class MPTMLP(nn.Module):
@@ -38,9 +39,9 @@ class MPTBlock(nn.Module):
     def forward(self, x: torch.Tensor, past_key_value: Union[PastKeyValue, Tuple, None] = None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> MPTBlockOutput:
         a = self.norm_1(x)
-        (b, _, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
         x = x + self.resid_attn_dropout(b)
         m = self.norm_2(x)
         n = self.ffn(m)
         x = x + self.resid_ffn_dropout(n)
-        return MPTBlockOutput(x, past_key_value)

 class MPTBlockOutput(NamedTuple):
     hidden_states: torch.Tensor
+    attn_probs: Optional[torch.Tensor]
     past_key_value: Union[PastKeyValue, Tuple, None]
 class MPTMLP(nn.Module):
     def forward(self, x: torch.Tensor, past_key_value: Union[PastKeyValue, Tuple, None] = None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> MPTBlockOutput:
         a = self.norm_1(x)
+        (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
         x = x + self.resid_attn_dropout(b)
         m = self.norm_2(x)
         n = self.ffn(m)
         x = x + self.resid_ffn_dropout(n)
+        return MPTBlockOutput(x, attn_weights, past_key_value)

config.json CHANGED Viewed

@@ -21,6 +21,7 @@
   "d_model": 4096,
   "emb_pdrop": 0,
   "embedding_fraction": 1.0,
   "expansion_ratio": 4,
   "init_config": {
     "emb_init_std": null,
@@ -46,7 +47,7 @@
   "tokenizer_name": "EleutherAI/gpt-neox-20b",
   "torch_dtype": "bfloat16",
   "transformers_version": "4.29.2",
-  "use_cache": false,
   "verbose": 0,
   "vocab_size": 50432
 }

   "d_model": 4096,
   "emb_pdrop": 0,
   "embedding_fraction": 1.0,
+  "eos_token_id": 0,
   "expansion_ratio": 4,
   "init_config": {
     "emb_init_std": null,
   "tokenizer_name": "EleutherAI/gpt-neox-20b",
   "torch_dtype": "bfloat16",
   "transformers_version": "4.29.2",
+  "use_cache": true,
   "verbose": 0,
   "vocab_size": 50432
 }

generation_config.json CHANGED Viewed

@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "transformers_version": "4.29.2",
   "eos_token_id": 0,
-  "use_cache": false
 }

   "_from_model_config": true,
   "transformers_version": "4.29.2",
   "eos_token_id": 0,
+  "use_cache": true
 }

modeling_mpt.py CHANGED Viewed

@@ -116,7 +116,9 @@ class MPTModel(MPTPreTrainedModel):
             if attn_bias is None:
                 attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
             else:
-                attn_bias = attn_bias[:, :, :, -s_k:]
             if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
                 raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
             min_val = torch.finfo(attn_bias.dtype).min
@@ -164,7 +166,10 @@ class MPTModel(MPTPreTrainedModel):
         if not return_dict:
             raise NotImplementedError('return_dict False is not implemented yet for MPT')
         if output_attentions:
-            raise NotImplementedError('output_attentions is not implemented yet for MPT')
         if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
             raise NotImplementedError('MPT does not support training with left padding.')
         if self.prefix_lm and prefix_mask is None:
@@ -184,7 +189,12 @@ class MPTModel(MPTPreTrainedModel):
             if past_key_values is not None:
                 if len(past_key_values) != self.config.n_layers:
                     raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
                 past_position = past_key_values[0][0].size(1)
             if S + past_position > self.config.max_seq_len:
                 raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
             pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
@@ -202,6 +212,7 @@ class MPTModel(MPTPreTrainedModel):
         if use_cache and past_key_values is None:
             past_key_values = [() for _ in range(self.config.n_layers)]
         all_hidden_states = () if output_hidden_states else None
         for (b_idx, block) in enumerate(self.blocks):
             if output_hidden_states:
                 assert all_hidden_states is not None
@@ -242,12 +253,19 @@ class MPTModel(MPTPreTrainedModel):
                     attention_mask=attention_mask,
                     is_causal=self.is_causal,
                 )
-            x, past_key_value = block_out
             del block_out
             if past_key_values is not None:
                 past_key_values[b_idx] = past_key_value
         x = self.norm_f(x)
-        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
     def param_init_fn(self, module):
         init_fn_name = self.config.init_config['name']
@@ -308,7 +326,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
             labels = torch.roll(labels, shifts=-1)
             labels[:, -1] = -100
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
-        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
     def param_init_fn(self, module):
         init_fn_name = self.config.init_config['name']

             if attn_bias is None:
                 attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
             else:
+                # clamp to 0 necessary for torch 2.0 compile()
+                _s_k = max(0, attn_bias.size(-1) - s_k)
+                attn_bias = attn_bias[:, :, :, _s_k:]
             if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
                 raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
             min_val = torch.finfo(attn_bias.dtype).min
         if not return_dict:
             raise NotImplementedError('return_dict False is not implemented yet for MPT')
         if output_attentions:
+            if self.attn_impl != 'torch':
+                raise NotImplementedError(
+                    'output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.'
+                )
         if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
             raise NotImplementedError('MPT does not support training with left padding.')
         if self.prefix_lm and prefix_mask is None:
             if past_key_values is not None:
                 if len(past_key_values) != self.config.n_layers:
                     raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
+                # For attn_impl: triton and flash, the past key tensor spec is (batch, seq, dim).
+                # For attn_impl: torch, the past key tensor spec is (batch, heads, head_dim, seq).
+                # The position ids are offset by the length of the `seq` dim of the past key.
                 past_position = past_key_values[0][0].size(1)
+                if self.attn_impl == 'torch':
+                    past_position = past_key_values[0][0].size(3)
             if S + past_position > self.config.max_seq_len:
                 raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
             pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
         if use_cache and past_key_values is None:
             past_key_values = [() for _ in range(self.config.n_layers)]
         all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
         for (b_idx, block) in enumerate(self.blocks):
             if output_hidden_states:
                 assert all_hidden_states is not None
                     attention_mask=attention_mask,
                     is_causal=self.is_causal,
                 )
+            x, attn_weights, past_key_value = block_out
             del block_out
             if past_key_values is not None:
                 past_key_values[b_idx] = past_key_value
+            if output_attentions:
+                assert all_self_attns is not None  # pyright
+                all_self_attns = all_self_attns + (attn_weights,)
         x = self.norm_f(x)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            assert all_hidden_states is not None  # pyright
+            all_hidden_states = all_hidden_states + (x,)
+        return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states, attentions=all_self_attns)
     def param_init_fn(self, module):
         init_fn_name = self.config.init_config['name']
             labels = torch.roll(labels, shifts=-1)
             labels[:, -1] = -100
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
+        return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
     def param_init_fn(self, module):
         init_fn_name = self.config.init_config['name']