feat: configurable use_reentrant

#37

by gmastrapas - opened Aug 22

base: refs/heads/main

←

from: refs/pr/37

Discussion Files changed

+11

-4

Files changed (2) hide show

configuration_xlm_roberta.py +6 -0
modeling_xlm_roberta.py +5 -4

configuration_xlm_roberta.py CHANGED Viewed

@@ -5,6 +5,9 @@ from transformers import PretrainedConfig
 class XLMRobertaFlashConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size: int = 250002,
@@ -25,6 +28,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         position_embedding_type: str = "rotary",
         rotary_emb_base: float = 10000.0,
         use_cache: bool = True,
         classifier_dropout: Optional[float] = None,
         lora_adaptations: Optional[List[str]] = None,
         lora_prompts: Optional[Dict[str, str]] = None,
@@ -62,6 +66,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
             position_embedding_type (str): Type of position embeddings. Options are 'absolute', 'alibi', or 'rotary'.
             rotary_emb_base (float): Base for rotary embeddings.
             use_cache (bool): Whether or not the model should return the last key/values attentions (not used by all models).
             classifier_dropout (Optional[float]): The dropout ratio for the classification head.
             lora_adaptations (Optional[List[str]]): LoRA adaptations configuration.
             lora_prompts (Optional[Dict[str, str]]): LoRA prompts configuration.
@@ -100,6 +105,7 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         self.position_embedding_type = position_embedding_type
         self.rotary_emb_base = rotary_emb_base
         self.use_cache = use_cache
         self.classifier_dropout = classifier_dropout
         self.load_trained_adapters = load_trained_adapters
         self.lora_adaptations = lora_adaptations

 class XLMRobertaFlashConfig(PretrainedConfig):
+    model_type = "xlm-roberta"
     def __init__(
         self,
         vocab_size: int = 250002,
         position_embedding_type: str = "rotary",
         rotary_emb_base: float = 10000.0,
         use_cache: bool = True,
+        use_reentrant: bool = False,
         classifier_dropout: Optional[float] = None,
         lora_adaptations: Optional[List[str]] = None,
         lora_prompts: Optional[Dict[str, str]] = None,
             position_embedding_type (str): Type of position embeddings. Options are 'absolute', 'alibi', or 'rotary'.
             rotary_emb_base (float): Base for rotary embeddings.
             use_cache (bool): Whether or not the model should return the last key/values attentions (not used by all models).
+            use_reentrant (bool): Value forwarded as the `use_reentrant` argument of `torch.utils.checkpoint.checkpoint` for gradient checkpointing. Defaults to False.
             classifier_dropout (Optional[float]): The dropout ratio for the classification head.
             lora_adaptations (Optional[List[str]]): LoRA adaptations configuration.
             lora_prompts (Optional[Dict[str, str]]): LoRA prompts configuration.
         self.position_embedding_type = position_embedding_type
         self.rotary_emb_base = rotary_emb_base
         self.use_cache = use_cache
+        self.use_reentrant = use_reentrant
         self.classifier_dropout = classifier_dropout
         self.load_trained_adapters = load_trained_adapters
         self.lora_adaptations = lora_adaptations

modeling_xlm_roberta.py CHANGED Viewed

@@ -181,6 +181,7 @@ class XLMRobertaEncoder(nn.Module):
     def __init__(self, config: XLMRobertaFlashConfig):
         super().__init__()
         self.use_flash_attn = get_use_flash_attn(config)
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
@@ -210,7 +211,7 @@ class XLMRobertaEncoder(nn.Module):
                     hidden_states = torch.utils.checkpoint.checkpoint(
                         layer,
                         hidden_states,
-                        use_reentrant=False,
                         mixer_kwargs=mixer_kwargs,
                     )
                 else:
@@ -234,7 +235,7 @@ class XLMRobertaEncoder(nn.Module):
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,
-                            use_reentrant=False,
                             mixer_kwargs=mixer_kwargs,
                         )
                     else:
@@ -246,7 +247,7 @@ class XLMRobertaEncoder(nn.Module):
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,
-                            use_reentrant=False,
                             mixer_kwargs=mixer_kwargs,
                         )
                     else:
@@ -284,7 +285,7 @@ class XLMRobertaEncoder(nn.Module):
                     torch.utils.checkpoint.checkpoint(
                         self.layers[-1],
                         hidden_states_subset,
-                        use_reentrant=False,
                         mixer_kwargs=mixer_kwargs,
                     )
                 else:

     def __init__(self, config: XLMRobertaFlashConfig):
         super().__init__()
         self.use_flash_attn = get_use_flash_attn(config)
+        self.use_reentrant = config.use_reentrant
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
                     hidden_states = torch.utils.checkpoint.checkpoint(
                         layer,
                         hidden_states,
+                        use_reentrant=self.use_reentrant,
                         mixer_kwargs=mixer_kwargs,
                     )
                 else:
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,
+                            use_reentrant=self.use_reentrant,
                             mixer_kwargs=mixer_kwargs,
                         )
                     else:
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,
+                            use_reentrant=self.use_reentrant,
                             mixer_kwargs=mixer_kwargs,
                         )
                     else:
                     torch.utils.checkpoint.checkpoint(
                         self.layers[-1],
                         hidden_states_subset,
+                        use_reentrant=self.use_reentrant,
                         mixer_kwargs=mixer_kwargs,
                     )
                 else: