Upload modeling_phi.py
# Gradient Checkpointing for HF Trainer

acon96 updated this script for phi-2 (see the comment here: https://huggingface.co/microsoft/phi-2/commit/78c707edeb2428936635019e94f568c1885ecc09).
I've done the same for dolphin-2_6-phi-2, following the same logic.
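As a quick usage sketch (not part of this commit): once the model declares `supports_gradient_checkpointing = True`, the HF Trainer can turn checkpointing on by itself. The repo id below is a placeholder and the dataset is omitted.

```python
# Hedged sketch: enabling gradient checkpointing through the HF Trainer after this patch.
# The repo id is a placeholder for the dolphin-2_6-phi-2 repository.
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained(
    "dolphin-2_6-phi-2",     # placeholder repo id
    trust_remote_code=True,  # load this modeling_phi.py from the repo
)

args = TrainingArguments(
    output_dir="out",
    gradient_checkpointing=True,  # Trainer calls model.gradient_checkpointing_enable()
)

# trainer = Trainer(model=model, args=args, train_dataset=...)
# trainer.train()
```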
- modeling_phi.py +13 -6
modeling_phi.py
CHANGED
@@ -605,7 +605,7 @@ class MHA(nn.Module):
                 # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
                 qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)
 
-            if self.checkpointing:
+            if self.checkpointing and self.training:
                 attn_output = torch.utils.checkpoint.checkpoint(
                     self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
                 )
@@ -615,8 +615,8 @@ class MHA(nn.Module):
             # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
             return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output
 
-        if self.checkpointing:
-            return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
+        if self.checkpointing and self.training:
+            return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask, use_reentrant=False)
 
         return self.inner_attn(qkv, key_padding_mask=key_padding_mask)
 
@@ -664,7 +664,7 @@ class MHA(nn.Module):
 
                 q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)
 
-            if self.checkpointing:
+            if self.checkpointing and self.training:
                 attn_output = torch.utils.checkpoint.checkpoint(
                     self.inner_cross_attn,
                     q,
@@ -674,6 +674,7 @@ class MHA(nn.Module):
                     max_seqlen=max_seqlen_q,
                     cu_seqlens_k=cu_seqlens_k,
                     max_seqlen_k=max_seqlen_k,
+                    use_reentrant=False
                 )
             else:
                 attn_output = self.inner_cross_attn(
@@ -692,13 +693,14 @@ class MHA(nn.Module):
                 else attn_output
             )
 
-        if self.checkpointing:
+        if self.checkpointing and self.training:
             return torch.utils.checkpoint.checkpoint(
                 self.inner_cross_attn,
                 q,
                 kv,
                 key_padding_mask=key_padding_mask,
                 causal=causal,
+                use_reentrant=False
             )
 
         return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)
@@ -835,7 +837,7 @@ class PhiPreTrainedModel(PreTrainedModel):
 
     config_class = PhiConfig
     base_model_prefix = "transformer"
-    supports_gradient_checkpointing = False
+    supports_gradient_checkpointing = True
     _no_split_modules = ["ParallelBlock"]
 
     def __init__(self, *inputs, **kwargs) -> None:
@@ -854,6 +856,11 @@ class PhiPreTrainedModel(PreTrainedModel):
             if module.bias is not None:
                 module.bias.data.zero_()
             module.weight.data.fill_(1.0)
+
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, MHA):
+            module.checkpointing = value
 
     def prepare_inputs_for_generation(
         self,
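For context on why `_set_gradient_checkpointing` is overridden above, here is a rough sketch of the legacy dispatch path it plugs into; this assumes a transformers release that still uses the `(module, value)` signature, and is not code from this commit.

```python
# Rough sketch of the legacy transformers dispatch (assumed pre-4.35-style API):
# gradient_checkpointing_enable() applies the hook to every submodule, so each
# MHA instance ends up with `checkpointing = True` before training starts.
from functools import partial

def enable_checkpointing(model):
    model.apply(partial(model._set_gradient_checkpointing, value=True))
```

The `and self.training` guard on each call site then keeps checkpointing out of the generation/inference path, and `use_reentrant=False` opts into the non-reentrant `torch.utils.checkpoint` implementation.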