refine-codebase #33
by jupyterjazz - opened
- README.md +11 -4
- block.py +5 -4
- config.json +0 -31
- configuration_xlm_roberta.py +85 -36
- embedding.py +27 -13
- mha.py +101 -42
- mlp.py +33 -15
- modeling_lora.py +49 -22
- modeling_xlm_roberta.py +116 -194
- modeling_xlm_roberta_for_glue.py +0 -109
- pytorch_model.bin +0 -3
- rotary.py +43 -16
- stochastic_depth.py +1 -1
- tokenizer.json +0 -0
- tokenizer_config.json +0 -4
- xlm_padding.py +24 -10
README.md
CHANGED

@@ -1,5 +1,12 @@
+Core implementation of Jina XLM-RoBERTa
+
+This implementation is adapted from [XLM-Roberta](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta). In contrast to the original implementation, this model uses Rotary positional encodings and supports flash-attention 2.
+
+### Models that use this implementation
+
+to be added soon
+
+
+### Converting weights
+
+Weights from an [original XLMRoberta model](https://huggingface.co/FacebookAI/xlm-roberta-large) can be converted using the `convert_roberta_weights_to_flash.py` script in the model repository.
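The README stops short of a usage example; as a rough sketch (assuming the weights end up published on the Hugging Face Hub with this code wired in via `trust_remote_code`; the repository id below is only a placeholder):

```python
from transformers import AutoModel

# Placeholder repository id -- replace with the actual published checkpoint.
model = AutoModel.from_pretrained(
    "jinaai/jina-xlm-roberta-placeholder", trust_remote_code=True
)

# `encode` is defined in modeling_xlm_roberta.py and returns pooled sentence embeddings.
embeddings = model.encode(["A sample sentence to embed."])
print(embeddings.shape)
```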
block.py
CHANGED

@@ -8,15 +8,14 @@ from typing import Optional

 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch import Tensor

-from .stochastic_depth import StochasticDepth
 from .mha import MHA
 from .mlp import Mlp
+from .stochastic_depth import StochasticDepth

 try:
-    from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
+    from flash_attn.ops.triton.layer_norm import RMSNorm, layer_norm_fn
 except ImportError:
     layer_norm_fn, RMSNorm = None, None

@@ -233,7 +232,9 @@ class Block(nn.Module):
                 is_rms_norm=isinstance(self.norm1, RMSNorm),
             )
             if not isinstance(self.mlp, nn.Identity):
-                mlp_out = self.mlp(hidden_states)
+                mlp_out = self.mlp(
+                    hidden_states, adapter_mask=mixer_kwargs.get("adapter_mask")
+                )
                 if self.return_residual:  # mlp out is actually a pair here
                     mlp_out, hidden_states = mlp_out
                 if not self.fused_dropout_add_ln:
config.json
DELETED

@@ -1,31 +0,0 @@
-{
-  "auto_map": {
-    "AutoConfig": "configuration_xlm_roberta.XLMRobertaFlashConfig",
-    "AutoModel": "modeling_xlm_roberta.XLMRobertaModel",
-    "AutoModelForPreTraining": "modeling_xlm_roberta.XLMRobertaForPreTraining",
-    "AutoModelForMaskedLM": "modeling_xlm_roberta.XLMRobertaForMaskedLM",
-    "AutoModelForSequenceClassification": "modeling_xlm_roberta.XLMRobertaForSequenceClassification"
-  },
-  "architectures": [
-    "XLMRobertaModel"
-  ],
-  "attention_probs_dropout_prob": 0.1,
-  "bos_token_id": 0,
-  "eos_token_id": 2,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 8194,
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "output_past": true,
-  "pad_token_id": 1,
-  "position_embedding_type": "absolute",
-  "transformers_version": "4.17.0.dev0",
-  "type_vocab_size": 1,
-  "use_cache": false,
-  "vocab_size": 250002
-}
configuration_xlm_roberta.py
CHANGED

@@ -1,44 +1,89 @@
-from transformers import PretrainedConfig
+from typing import Any, Dict, List, Optional, Union
+
 import torch
+from transformers import PretrainedConfig
+

 class XLMRobertaFlashConfig(PretrainedConfig):
     def __init__(
+        self,
+        vocab_size: int = 250002,
+        hidden_size: int = 1024,
+        num_hidden_layers: int = 24,
+        num_attention_heads: int = 16,
+        intermediate_size: int = 4096,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.1,
+        attention_probs_dropout_prob: float = 0.1,
+        max_position_embeddings: int = 8194,
+        type_vocab_size: int = 1,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-05,
+        pad_token_id: int = 1,
+        bos_token_id: int = 0,
+        eos_token_id: int = 2,
+        position_embedding_type: str = "rotary",
+        rotary_emb_base: float = 10000.0,
+        use_cache: bool = True,
+        classifier_dropout: Optional[float] = None,
+        lora_adaptations: Optional[List[str]] = None,
+        lora_prompts: Optional[Dict[str, str]] = None,
+        lora_rank: int = 4,
+        lora_dropout_p: float = 0.0,
+        lora_alpha: int = 1,
+        lora_main_params_trainable: bool = False,
+        load_trained_adapters: bool = False,
+        use_flash_attn: bool = True,
+        torch_dtype: Optional[Union[str, torch.dtype]] = None,
+        emb_pooler: Optional[str] = None,
+        matryoshka_dimensions: Optional[List[int]] = None,
+        truncate_dim: Optional[int] = None,
+        **kwargs: Dict[str, Any],
     ):
+        """
+        Initialize the XLMRobertaFlashConfig configuration.
+
+        Args:
+            vocab_size (int): Size of the vocabulary.
+            hidden_size (int): Dimensionality of the encoder layers and the pooler layer.
+            num_hidden_layers (int): Number of hidden layers in the Transformer encoder.
+            num_attention_heads (int): Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size (int): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer.
+            hidden_act (str): The activation function to use.
+            hidden_dropout_prob (float): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob (float): The dropout ratio for the attention probabilities.
+            max_position_embeddings (int): The maximum length of the position embeddings.
+            type_vocab_size (int): The vocabulary size of the token type ids.
+            initializer_range (float): The standard deviation for initializing all weight matrices.
+            layer_norm_eps (float): The epsilon used by the layer normalization layers.
+            pad_token_id (int): The ID of the padding token.
+            bos_token_id (int): The ID of the beginning-of-sequence token.
+            eos_token_id (int): The ID of the end-of-sequence token.
+            position_embedding_type (str): Type of position embeddings. Options are 'absolute', 'alibi', or 'rotary'.
+            rotary_emb_base (float): Base for rotary embeddings.
+            use_cache (bool): Whether or not the model should return the last key/values attentions (not used by all models).
+            classifier_dropout (Optional[float]): The dropout ratio for the classification head.
+            lora_adaptations (Optional[List[str]]): LoRA adaptations configuration.
+            lora_prompts (Optional[Dict[str, str]]): LoRA prompts configuration.
+            lora_rank (int): Rank for LoRA adaptations.
+            lora_dropout_p (float): Dropout probability for LoRA adaptations.
+            lora_alpha (int): Alpha parameter for LoRA.
+            lora_main_params_trainable (bool): Whether to make the main model parameters trainable when using LoRA.
+            load_trained_adapters (bool): Whether to load trained adapters.
+            use_flash_attn (bool): Whether to use FlashAttention.
+            torch_dtype (Optional[Union[str, torch.dtype]]): Data type for the tensors.
+            emb_pooler (Optional[str]): Pooling layer configuration.
+            matryoshka_dimensions (Optional[List[int]]): Configuration for matryoshka dimension reduction.
+            truncate_dim (Optional[int]): Dimension to truncate embeddings to, if any.
+            **kwargs (Dict[str, Any]): Additional keyword arguments passed to the configuration.
+        """

+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )

         self.vocab_size = vocab_size
         self.hidden_size = hidden_size

@@ -67,7 +112,11 @@ class XLMRobertaFlashConfig(PretrainedConfig):
         self.emb_pooler = emb_pooler
         self.matryoshka_dimensions = matryoshka_dimensions
         self.truncate_dim = truncate_dim
+        if (
+            torch_dtype
+            and hasattr(torch, torch_dtype)
+            and type(getattr(torch, torch_dtype)) is torch.dtype
+        ):
             self.torch_dtype = getattr(torch, torch_dtype)
         else:
             self.torch_dtype = torch_dtype
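As a quick illustration of the new typed constructor (a minimal sketch; it assumes `configuration_xlm_roberta.py` is importable on its own, and the values are simply the defaults shown in the diff):

```python
from configuration_xlm_roberta import XLMRobertaFlashConfig

# Build a config with the defaults introduced in this PR; any field can be overridden.
config = XLMRobertaFlashConfig(
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    position_embedding_type="rotary",
    use_flash_attn=True,
    torch_dtype="bfloat16",  # resolved to torch.bfloat16 by the hasattr/torch.dtype check above
)
print(config.torch_dtype)  # torch.bfloat16
```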
embedding.py
CHANGED

@@ -5,10 +5,8 @@

 import torch
 import torch.nn as nn
-from transformers.models.xlm_roberta.modeling_xlm_roberta import create_position_ids_from_input_ids
+from transformers.models.xlm_roberta.modeling_xlm_roberta import \
+    create_position_ids_from_input_ids


 class XLMRobertaEmbeddings(nn.Module):

@@ -38,20 +36,29 @@ class XLMRobertaEmbeddings(nn.Module):
             max_position_embeddings, embed_dim, **factory_kwargs
         )
         if self.type_vocab_size > 0:
+            self.token_type_embeddings = nn.Embedding(
+                type_vocab_size, embed_dim, **factory_kwargs
+            )

+    def forward(
+        self, input_ids, position_ids=None, token_type_ids=None, adapter_mask=None
+    ):
         """
         input_ids: (batch, seqlen)
         position_ids: (batch, seqlen)
         token_type_ids: (batch, seqlen)
+        adapter_mask: (batch, 1)
         """
         batch_size, seqlen = input_ids.shape
         if adapter_mask is not None:
             unique_tasks = torch.unique(adapter_mask)
             embedding_dtype = next(self.word_embeddings.parameters()).dtype
+            embeddings = torch.empty(
+                *input_ids.shape,
+                self.word_embeddings.embedding_dim,
+                dtype=embedding_dtype,
+                device=input_ids.device
+            )
             for task_id in unique_tasks:
                 task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                 task_input_ids = input_ids[task_indices]

@@ -61,20 +68,27 @@ class XLMRobertaEmbeddings(nn.Module):
             embeddings = self.word_embeddings(input_ids)
         if self.max_position_embeddings > 0:
             if position_ids is None:
+                position_ids = create_position_ids_from_input_ids(
+                    input_ids, padding_idx=self.word_embeddings.padding_idx
+                ).to(input_ids.device)
             position_embeddings = self.position_embeddings(position_ids)
             embeddings = embeddings + position_embeddings
         if self.type_vocab_size > 0:
             if token_type_ids is None:
+                token_type_ids = torch.zeros(
+                    seqlen, dtype=torch.long, device=input_ids.device
+                )

             if adapter_mask is not None:
                 unique_tasks = torch.unique(adapter_mask)
                 for task_id in unique_tasks:
+                    task_token_type_embeddings = self.token_type_embeddings(
+                        token_type_ids, task_id=task_id
+                    )
                     task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
+                    embeddings[task_indices] = (
+                        embeddings[task_indices] + task_token_type_embeddings
+                    )
             else:
                 token_type_embeddings = self.token_type_embeddings(token_type_ids)
                 embeddings = embeddings + token_type_embeddings
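The adapter-mask pattern introduced here, and repeated in `mha.py` and `mlp.py` below, always has the same shape: group rows by task id, run each group through the task-specific parameters, and scatter the results back. A stripped-down, self-contained sketch of the idea (not code from the PR; `layer_fn` stands in for a LoRA-parametrized layer):

```python
import torch

def apply_per_task(layer_fn, x, adapter_mask):
    # Rows of `x` that share a task id are processed together, mirroring the
    # torch.empty(...) / index-assignment loops added throughout this PR.
    out = None
    for task_id in torch.unique(adapter_mask):
        indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
        task_out = layer_fn(x[indices], int(task_id))
        if out is None:
            out = torch.empty(x.shape[0], *task_out.shape[1:], dtype=task_out.dtype)
        out[indices] = task_out
    return out

# Dummy "layer" that scales rows by (task_id + 1) to make the grouping visible.
x = torch.ones(4, 3)
adapter_mask = torch.tensor([0, 1, 0, 1], dtype=torch.int32)
print(apply_per_task(lambda t, tid: t * (tid + 1), x, adapter_mask))
```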
mha.py
CHANGED

@@ -1,5 +1,8 @@
+# This implementation was adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py
+# Commit id: 6bbc532388e61185a92e2a563126739967b4c8c5
+# Rotary varlen support from https://github.com/Dao-AILab/flash-attention/pull/556
+
 # Copyright (c) 2023, Tri Dao.
-# Adapted from https://github.com/Dao-AILab/flash-attention/pull/556

 import math
 from functools import partial

@@ -9,20 +12,19 @@ import torch.nn as nn
 from einops import rearrange, repeat

 try:
+    from flash_attn import (flash_attn_kvpacked_func,
+                            flash_attn_qkvpacked_func,
+                            flash_attn_varlen_kvpacked_func,
+                            flash_attn_varlen_qkvpacked_func,
+                            flash_attn_with_kvcache)
 except ImportError:
     flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None
     flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None
     flash_attn_with_kvcache = None

 try:
+    from flash_attn.ops.fused_dense import (ColumnParallelLinear, FusedDense,
+                                            RowParallelLinear)
 except ImportError:
     FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None

@@ -42,7 +44,9 @@ def get_alibi_slopes(nheads):
     closest_power_of_2 = 2 ** math.floor(math.log2(nheads))
     return (
         get_slopes_power_of_2(closest_power_of_2)
+        + get_alibi_slopes(2 * closest_power_of_2)[0::2][
+            : nheads - closest_power_of_2
+        ]
     )

@@ -67,7 +71,9 @@ class FlashSelfAttention(nn.Module):
         deterministic=False,
     ):
         super().__init__()
+        assert (
+            flash_attn_varlen_qkvpacked_func is not None
+        ), "FlashAttention is not installed"
         assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed"
         self.causal = causal
         self.softmax_scale = softmax_scale

@@ -147,7 +153,9 @@ class FlashCrossAttention(nn.Module):
         deterministic=False,
     ):
         super().__init__()
+        assert (
+            flash_attn_varlen_kvpacked_func is not None
+        ), "FlashAttention is not installed"
         assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed"
         self.causal = causal
         self.softmax_scale = softmax_scale

@@ -313,7 +321,10 @@ class CrossAttention(nn.Module):
         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
         if key_padding_mask is not None:
             padding_mask = torch.full(
+                (batch_size, seqlen_k),
+                -10000.0,
+                dtype=scores.dtype,
+                device=scores.device,
             )
             padding_mask.masked_fill_(key_padding_mask, 0.0)
             # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)

@@ -425,20 +436,26 @@ class MHA(nn.Module):
         else:
             alibi_slopes = None
         if window_size != (-1, -1):
+            assert (
+                use_flash_attn
+            ), "Local (sliding window) attention code path requires flash_attn"

         self.num_heads = num_heads
         self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
         assert (
             self.num_heads % self.num_heads_kv == 0
         ), "num_heads must be divisible by num_heads_kv"
+        assert (
+            self.embed_dim % num_heads == 0
+        ), "embed_dim must be divisible by num_heads"
         self.head_dim = self.embed_dim // num_heads
         qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
         kv_dim = 2 * self.head_dim * self.num_heads_kv

         if self.rotary_emb_dim > 0:
+            assert (
+                not cross_attn
+            ), "MHA with rotary embedding does not support cross-attention yet"
             assert RotaryEmbedding is not None, "rotary_emb is not installed"
             self.rotary_emb = RotaryEmbedding(
                 self.rotary_emb_dim,

@@ -453,23 +470,33 @@ class MHA(nn.Module):

         linear_cls = nn.Linear if not fused_bias_fc else FusedDense
         linear_resid_cls = (
+            LinearResidual
+            if not fused_bias_fc
+            else partial(FusedDense, return_residual=True)
         )
         wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
         inner_attn_cls = (
+            partial(
+                FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size
+            )
             if use_flash_attn
             else SelfAttention
         )
         inner_cross_attn_cls = (
+            partial(
+                FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size
+            )
             if use_flash_attn
             else CrossAttention
         )
         if not self.cross_attn:
+            self.Wqkv = wqkv_cls(
+                embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs
+            )
         else:
+            self.Wq = linear_cls(
+                embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs
+            )
             self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
         if self.dwconv:
             if self.num_heads_kv == self.num_heads:

@@ -480,7 +507,9 @@ class MHA(nn.Module):
                 self.dwconv_q = nn.Conv1d(
                     embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim
                 )
+                self.dwconv_kv = nn.Conv1d(
+                    kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim
+                )
         self.inner_attn = inner_attn_cls(
             causal=causal,
             softmax_scale=softmax_scale,

@@ -489,7 +518,9 @@ class MHA(nn.Module):
         self.inner_cross_attn = inner_cross_attn_cls(
             causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
         )
+        self.out_proj = linear_cls(
+            embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs
+        )

     def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
         dtype = self.out_proj.weight.dtype if dtype is None else dtype

@@ -507,7 +538,9 @@ class MHA(nn.Module):
     def _update_kv_cache(self, kv, inference_params):
         """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
         assert not self.dwconv, "Generation does not support dwconv yet"
+        assert (
+            self.layer_idx is not None
+        ), "Generation requires layer_idx in the constructor"
         return _update_kv_cache(kv, inference_params, self.layer_idx)

     def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):

@@ -523,7 +556,10 @@ class MHA(nn.Module):
             self.rotary_emb._update_cos_sin_cache(
                 inference_params.max_seqlen, device=q.device, dtype=q.dtype
             )
+            rotary_cos, rotary_sin = (
+                self.rotary_emb._cos_cached,
+                self.rotary_emb._sin_cached,
+            )
         else:
             rotary_cos, rotary_sin = None, None
         batch = q.shape[0]

@@ -545,7 +581,9 @@ class MHA(nn.Module):
             cache_seqlens=cache_seqlens,
             softmax_scale=self.inner_cross_attn.softmax_scale,
             causal=self.inner_cross_attn.causal,
+            rotary_interleaved=(
+                self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False
+            ),
             alibi_slopes=alibi_slopes,
         )
         return context

@@ -640,40 +678,49 @@ class MHA(nn.Module):
             )
         )
         rotary_max_seqlen = (
+            inference_params.max_sequence_len
+            if inference_params is not None
+            else max_seqlen
         )
-        batch, seqlen = x.shape[:2]
-        lora_kwargs = {}
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None

             if adapter_mask is not None:
                 unique_tasks = torch.unique(adapter_mask)
                 qkv_dtype = next(self.Wqkv.parameters()).dtype
+                qkv = torch.empty(
+                    *x.shape[:-1],
+                    self.Wqkv.out_features,
+                    dtype=qkv_dtype,
+                    device=x.device,
+                )
                 for task_id in unique_tasks:
                     task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                     task_tensor = x[task_indices]
                     if not self.return_residual:
                         task_qkv = self.Wqkv(task_tensor, task_id=task_id)
                     else:
+                        task_qkv, _ = self.Wqkv(
+                            task_tensor, task_id=task_id, residual=True
+                        )
                     qkv[task_indices] = task_qkv
             else:
                 if not self.return_residual:
                     qkv = self.Wqkv(x)
                 else:
+                    if hasattr(self.Wqkv, "parametrizations"):
                         qkv, x = self.Wqkv(x, residual=True)
                     else:
                         qkv, x = self.Wqkv(x)

             if self.dwconv:
                 qkv = rearrange(
+                    self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2],
+                    "b d s -> b s d",
                 ).contiguous()
+            qkv = rearrange(
+                qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim
+            )
             if (
                 inference_params is None
                 or inference_params.seqlen_offset == 0

@@ -691,7 +738,9 @@ class MHA(nn.Module):
                     if not self.checkpointing:
                         context = self.inner_attn(qkv, **kwargs)
                     else:
+                        context = torch.utils.checkpoint.checkpoint(
+                            self.inner_attn, qkv, **kwargs
+                        )
             else:
                 context = self._update_kvcache_attention(
                     qkv[:, :, 0], qkv[:, :, 1:], inference_params

@@ -720,13 +769,17 @@ class MHA(nn.Module):
                 q = qkv[..., : self.num_heads * self.head_dim]
                 kv = qkv[..., self.num_heads * self.head_dim :]
                 q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
+                kv = rearrange(
+                    kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim
+                )
                 if self.dwconv:
                     q = rearrange(
+                        self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2],
+                        "b d s -> b s d",
                     ).contiguous()
                     kv = rearrange(
+                        self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2],
+                        "b d s -> b s d",
                     ).contiguous()
             if (
                 inference_params is None

@@ -752,14 +805,20 @@ class MHA(nn.Module):
                 else:
                     context = self._update_kvcache_attention(q, kv, inference_params)
             else:
+                context = self._apply_rotary_update_kvcache_attention(
+                    q, kv, inference_params
+                )

         inp = rearrange(context, "... h d -> ... (h d)")
         if adapter_mask is not None:
             unique_tasks = torch.unique(adapter_mask)
             out_dtype = next(self.out_proj.parameters()).dtype
+            out = torch.empty(
+                *inp.shape[:-1],
+                self.out_proj.out_features,
+                dtype=out_dtype,
+                device=inp.device,
+            )
             for task_id in unique_tasks:
                 task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                 task_tensor = inp[task_indices]
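A side note on the `dwconv` branches above: `nn.Conv1d(..., kernel_size=3, padding=2)` pads both ends, so trimming the last two positions (`[..., :-2]`) restores the original sequence length and keeps the convolution causal. A tiny standalone check of that arithmetic (not part of the PR):

```python
import torch
import torch.nn as nn

dim, seqlen = 4, 6
conv = nn.Conv1d(dim, dim, kernel_size=3, padding=2, groups=dim)

x = torch.randn(1, dim, seqlen)   # (batch, channels, seqlen)
y = conv(x)[..., :-2]             # drop the trailing pad -> same length as the input
print(y.shape)                    # torch.Size([1, 4, 6])
```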
mlp.py
CHANGED

@@ -8,14 +8,14 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributed import ProcessGroup

-
 try:
     from flash_attn.ops.activations import swiglu
 except ImportError:
     swiglu = None

 try:
-    from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
+    from flash_attn.ops.fused_dense import (ColumnParallelLinear,
+                                            RowParallelLinear)
 except ImportError:
     ColumnParallelLinear, RowParallelLinear = None, None

@@ -41,18 +41,23 @@ class Mlp(nn.Module):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         out_features = out_features if out_features is not None else in_features
+        hidden_features = (
+            hidden_features if hidden_features is not None else in_features * 4
+        )
         self.return_residual = return_residual
         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
         self.activation = activation
+        self.fc2 = nn.Linear(
+            hidden_features, out_features, bias=bias2, **factory_kwargs
+        )

     def forward(self, x, adapter_mask=None):
         if adapter_mask is not None:
             unique_tasks = torch.unique(adapter_mask)
             fc1_dtype = next(self.fc1.parameters()).dtype
+            y = torch.empty(
+                *x.shape[:-1], self.fc1.out_features, dtype=fc1_dtype, device=x.device
+            )
             for task_id in unique_tasks:
                 task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                 task_tensor = x[task_indices]

@@ -66,8 +71,9 @@ class Mlp(nn.Module):
         if adapter_mask is not None:
             unique_tasks = torch.unique(adapter_mask)
             fc2_dtype = next(self.fc2.parameters()).dtype
+            out = torch.empty(
+                *y.shape[:-1], self.fc2.out_features, dtype=fc2_dtype, device=y.device
+            )
             for task_id in unique_tasks:
                 task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                 task_tensor = y[task_indices]

@@ -98,7 +104,9 @@ class ParallelMLP(nn.Module):
         assert ColumnParallelLinear is not None, "Need to install fused_dense"
         assert RowParallelLinear is not None, "Need to install fused_dense"
         out_features = out_features if out_features is not None else in_features
+        hidden_features = (
+            hidden_features if hidden_features is not None else in_features * 4
+        )
         self.fc1 = ColumnParallelLinear(
             in_features,
             hidden_features,

@@ -144,17 +152,25 @@ class GatedMlp(nn.Module):
         hidden_features = (
             hidden_features if hidden_features is not None else int(8 * in_features / 3)
         )
+        hidden_features = (
+            (hidden_features + multiple_of - 1) // multiple_of * multiple_of
+        )
         self.return_residual = return_residual
+        self.fc1 = nn.Linear(
+            in_features, 2 * hidden_features, bias=bias1, **factory_kwargs
+        )
         self.activation = activation
+        self.fc2 = nn.Linear(
+            hidden_features, out_features, bias=bias2, **factory_kwargs
+        )

     def forward(self, x):
         y = self.fc1(x)
         if self.activation == F.sigmoid:  # Special case for GLU
             y = F.glu(y, dim=-1)
+        elif (
+            self.activation == F.silu and swiglu is not None
+        ):  # Special case for SwiGLU
             y, gate = y.chunk(2, dim=-1)
             y = swiglu(gate, y)
         else:

@@ -187,7 +203,9 @@ class ParallelGatedMlp(nn.Module):
         hidden_features = (
             hidden_features if hidden_features is not None else int(8 * in_features / 3)
         )
+        hidden_features = (
+            (hidden_features + multiple_of - 1) // multiple_of * multiple_of
+        )
         if ColumnParallelLinear is None or RowParallelLinear is None:
             raise ImportError("fused_dense is not installed")
         self.fc1 = ColumnParallelLinear(

@@ -216,4 +234,4 @@ class ParallelGatedMlp(nn.Module):
         y, gate = y.chunk(2, dim=-1)
         y = y * self.activation(gate)
         y = self.fc2(y)
-        return y
+        return y
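The `multiple_of` rounding added to `GatedMlp` and `ParallelGatedMlp` is the usual round-up-to-a-multiple trick; a quick check of the arithmetic:

```python
# Round the SwiGLU hidden size (8/3 * in_features) up to a multiple of `multiple_of`.
in_features, multiple_of = 1024, 128
hidden_features = int(8 * in_features / 3)                                   # 2730
hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
print(hidden_features)                                                       # 2816 (= 22 * 128)
```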
modeling_lora.py
CHANGED

@@ -1,6 +1,5 @@
 import math
 import os
-import warnings
 from functools import partial
 from typing import Iterator, List, Optional, Tuple, Union

@@ -12,7 +11,8 @@ from torch.nn import Parameter
 from torch.nn import functional as F
 from transformers import PretrainedConfig

-from .modeling_xlm_roberta import XLMRobertaFlashConfig, XLMRobertaModel, XLMRobertaPreTrainedModel
+from .modeling_xlm_roberta import (XLMRobertaFlashConfig, XLMRobertaModel,
+                                   XLMRobertaPreTrainedModel)


 def initialized_weights(

@@ -162,6 +162,16 @@ class LoRAParametrization(nn.Module):
         dropout_p: float,
         alpha: float,
     ):
+        """
+        Registering LoRA adapters to all embedding and linear layers.
+
+        Additionally, we implement a custom forward function for LoRA parametrization.
+        This function modifies the layer's forward pass to optionally use task-specific
+        parameters. When a `task_id` is provided, it employs a LoRA parametrization
+        to modify the original weights according to the specific task. This allows
+        the layer to adapt dynamically to different tasks at runtime. If no `task_id`
+        is specified, the layer uses its original weights.
+        """
         if isinstance(layer, nn.Linear):
             parametrize.register_parametrization(
                 layer,

@@ -177,7 +187,9 @@ class LoRAParametrization(nn.Module):

         def new_forward(self, input, task_id=None, residual=False):
             if task_id is not None:
+                weights = self.parametrizations.weight[0].lora_forward(
+                    self.weight, current_task=task_id
+                )
             else:
                 weights = self.weight

@@ -204,13 +216,21 @@ class LoRAParametrization(nn.Module):

         def new_forward(self, input, task_id=None):
             if task_id is not None:
+                weights = self.parametrizations.weight[0].lora_forward(
+                    self.weight, current_task=task_id
+                )
             else:
                 weights = self.weight

             out = F.embedding(
+                input,
+                weights,
+                self.padding_idx,
+                self.max_norm,
+                self.norm_type,
+                self.scale_grad_by_freq,
+                self.sparse,
+            )

             return out

@@ -218,10 +238,11 @@ class LoRAParametrization(nn.Module):


 class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
+    """
+    A wrapper class around the Jina XLM-RoBERTa model that integrates LoRA (Low-Rank Adaptation) adapters.
+    """
     def __init__(
-        self,
-        config: XLMRobertaFlashConfig,
-        roberta: Optional[XLMRobertaModel] = None
+        self, config: XLMRobertaFlashConfig, roberta: Optional[XLMRobertaModel] = None
     ):
         super().__init__(config)
         if roberta is None:

@@ -235,7 +256,7 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
             or len(self._lora_adaptations) < 1
         ):
             raise ValueError(
+                f"`lora_adaptations` must be a list and contain at least one element"
             )
         self._lora_prompts = config.lora_prompts
         if (

@@ -244,9 +265,9 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
             or not all([v in self._lora_adaptations for v in self._lora_prompts.keys()])
         ):
             raise ValueError(
+                f"`lora_prompts` must be a dict and contain the same number of elements "
+                f"as `lora_adaptations` with all keys in `lora_prompts` present in `lora_adaptations`."
+            )
         self._adaptation_map = {
             name: idx for idx, name in enumerate(self._lora_adaptations)
         }

@@ -261,7 +282,6 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         )
         self.main_params_trainable = config.lora_main_params_trainable

-
     @property
     def rotary_emb_base(self):
         return self.roberta.rotary_emb_base

@@ -305,13 +325,14 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         config = XLMRobertaFlashConfig.from_pretrained(
             pretrained_model_name_or_path, *model_args, **kwargs
         )
-
-        if config.load_trained_adapters:
+        if config.load_trained_adapters:  # checkpoint already contains LoRA adapters
             return super().from_pretrained(
                 pretrained_model_name_or_path, *model_args, **kwargs
             )
-        else:
+        else:  # initializing new adapters
+            roberta = XLMRobertaModel.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
             return cls(config, roberta=roberta)

     def _register_lora(self, num_adaptations, rank, dropout_p, alpha):

@@ -350,10 +371,12 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         **kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
-        Computes sentence embeddings
+        Computes sentence embeddings.

+        sentences(`str` or `List[str]`):
+            Sentence or sentences to be encoded
         task_type(`str`, *optional*, defaults to `None`):
-            Specifies the task for which the encoding is intended. If `task_type` is not
+            Specifies the task for which the encoding is intended. If `task_type` is not provided,
             all LoRA adapters are disabled, and the model reverts to its original,
             general-purpose weights.
         """

@@ -367,5 +390,9 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         if task_type:
             task_id = self._adaptation_map[task_type]
             num_examples = 1 if isinstance(sentences, str) else len(sentences)
+            adapter_mask = torch.full(
+                (num_examples,), task_id, dtype=torch.int32, device=self.device
+            )
+        return self.roberta.encode(
+            sentences, *args, adapter_mask=adapter_mask, **kwargs
+        )
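Putting the LoRA pieces together, task-specific encoding goes through `encode`, which builds the `adapter_mask` shown above. A hedged usage sketch (the repository id and task name are placeholders; valid task names are whatever `config.lora_adaptations` lists, and loading this way assumes the published checkpoint maps `AutoModel` to `XLMRobertaLoRA` via `trust_remote_code`):

```python
from transformers import AutoModel

# Placeholder repo id; the real checkpoint must ship this modeling code and a LoRA config.
model = AutoModel.from_pretrained(
    "org/jina-xlm-roberta-lora-placeholder", trust_remote_code=True
)

# "retrieval.query" is only an example adapter name, taken from config.lora_adaptations.
query_embeddings = model.encode(
    ["How do rotary embeddings work?"], task_type="retrieval.query"
)

# Without task_type the adapters stay disabled and the base weights are used.
base_embeddings = model.encode(["How do rotary embeddings work?"])
```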
modeling_xlm_roberta.py
CHANGED
@@ -13,39 +13,29 @@ import re
|
|
13 |
from collections import OrderedDict
|
14 |
from collections.abc import Sequence
|
15 |
from functools import partial
|
16 |
-
import
|
17 |
|
|
|
18 |
import torch
|
19 |
import torch.nn as nn
|
20 |
import torch.nn.functional as F
|
21 |
import torch.utils.checkpoint
|
22 |
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
23 |
-
from
|
24 |
-
from transformers import
|
|
|
25 |
from transformers.modeling_utils import PreTrainedModel
|
26 |
-
from transformers.modeling_outputs import MaskedLMOutput,SequenceClassifierOutput
|
27 |
-
from transformers.models.xlm_roberta.modeling_xlm_roberta import XLMRobertaLMHead
|
28 |
-
|
29 |
from transformers.models.bert.modeling_bert import (
|
30 |
-
BaseModelOutputWithPoolingAndCrossAttentions,
|
31 |
-
|
32 |
-
|
33 |
|
34 |
-
from typing import List, Optional, Tuple, Union
|
35 |
-
|
36 |
-
from .xlm_padding import (
|
37 |
-
index_first_axis,
|
38 |
-
index_first_axis_residual,
|
39 |
-
pad_input,
|
40 |
-
unpad_input,
|
41 |
-
)
|
42 |
-
from .configuration_xlm_roberta import XLMRobertaFlashConfig
|
43 |
from .block import Block
|
|
|
44 |
from .embedding import XLMRobertaEmbeddings
|
45 |
from .mha import MHA
|
46 |
from .mlp import FusedMLP, Mlp
|
47 |
-
from .
|
48 |
-
from .rotary import RotaryEmbedding
|
49 |
|
50 |
try:
|
51 |
from flash_attn.ops.fused_dense import FusedDense
|
@@ -79,7 +69,7 @@ def get_use_flash_attn(config: XLMRobertaFlashConfig):
|
|
79 |
return False
|
80 |
if importlib.util.find_spec("flash_attn") is None:
|
81 |
logger.warning(
|
82 |
-
|
83 |
)
|
84 |
return False
|
85 |
return True
|
@@ -109,7 +99,7 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
|
|
109 |
fused_bias_fc=fused_bias_fc,
|
110 |
use_flash_attn=use_flash_attn,
|
111 |
return_residual=return_residual,
|
112 |
-
use_alibi=config.position_embedding_type ==
|
113 |
**rotary_kwargs,
|
114 |
)
|
115 |
return mixer_cls
|
@@ -204,15 +194,17 @@ class XLMRobertaEncoder(nn.Module):
|
|
204 |
def gradient_checkpointing(self, value):
|
205 |
self._grad_checkpointing = value
|
206 |
|
207 |
-
def forward(
|
|
|
|
|
208 |
"""If subset_mask is not None, we only want output for the subset of the sequence.
|
209 |
This means that we only compute the last layer output for these tokens.
|
210 |
subset_mask: (batch, seqlen), dtype=torch.bool
|
211 |
"""
|
212 |
if key_padding_mask is None or not self.use_flash_attn:
|
213 |
-
mixer_kwargs = {
|
214 |
if key_padding_mask is not None:
|
215 |
-
mixer_kwargs[
|
216 |
for layer in self.layers:
|
217 |
if self._grad_checkpointing:
|
218 |
hidden_states = torch.utils.checkpoint.checkpoint(
|
@@ -227,10 +219,14 @@ class XLMRobertaEncoder(nn.Module):
|
|
227 |
hidden_states = hidden_states[subset_mask]
|
228 |
else:
|
229 |
batch, seqlen = hidden_states.shape[:2]
|
230 |
-
hidden_states, indices, cu_seqlens, max_seqlen_in_batch, cu_adapter_mask =
|
231 |
-
hidden_states, key_padding_mask, adapter_mask
|
232 |
)
|
233 |
-
mixer_kwargs = {
|
|
|
|
|
|
|
|
|
234 |
|
235 |
if subset_mask is None:
|
236 |
for layer in self.layers:
|
@@ -315,12 +311,18 @@ class XLMRobertaPooler(nn.Module):
|
|
315 |
if adapter_mask is not None:
|
316 |
unique_tasks = torch.unique(adapter_mask)
|
317 |
pool_dtype = next(self.dense.parameters()).dtype
|
318 |
-
pooled_output = torch.empty(
|
319 |
-
|
|
|
|
|
|
|
|
|
320 |
for task_id in unique_tasks:
|
321 |
task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
|
322 |
task_first_token_tensor = first_token_tensor[task_indices]
|
323 |
-
task_pooled_output = self.dense(
|
|
|
|
|
324 |
pooled_output[task_indices] = task_pooled_output
|
325 |
else:
|
326 |
pooled_output = self.dense(first_token_tensor)
|
@@ -413,12 +415,11 @@ class XLMRobertaPreTrainedModel(PreTrainedModel):
|
|
413 |
*args,
|
414 |
**kwargs,
|
415 |
):
|
416 |
-
if not
|
417 |
-
kwargs[
|
418 |
return super().from_pretrained(*args, **kwargs)
|
419 |
|
420 |
|
421 |
-
|
422 |
class XLMRobertaModel(XLMRobertaPreTrainedModel):
|
423 |
def __init__(self, config: XLMRobertaFlashConfig, add_pooling_layer=True):
|
424 |
super().__init__(config)
|
@@ -439,7 +440,11 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
|
|
439 |
self.embeddings = XLMRobertaEmbeddings(
|
440 |
config.hidden_size,
|
441 |
config.vocab_size,
|
442 |
-
|
|
|
|
|
|
|
|
|
443 |
config.type_vocab_size,
|
444 |
padding_idx=config.pad_token_id,
|
445 |
)
|
@@ -449,16 +454,18 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
|
|
449 |
self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None
|
450 |
|
451 |
self.apply(partial(_init_weights, initializer_range=config.initializer_range))
|
452 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
|
453 |
self._rotary_emb_base = config.rotary_emb_base
|
454 |
|
455 |
@torch.inference_mode()
|
456 |
def encode(
|
457 |
-
self:
|
458 |
sentences: Union[str, List[str]],
|
459 |
batch_size: int = 32,
|
460 |
show_progress_bar: Optional[bool] = None,
|
461 |
-
output_value: str =
|
462 |
convert_to_numpy: bool = True,
|
463 |
convert_to_tensor: bool = False,
|
464 |
device: Optional[torch.device] = None,
|
@@ -516,12 +523,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         if convert_to_tensor:
             convert_to_numpy = False

-        if output_value !=
+        if output_value != "sentence_embedding":
             convert_to_tensor = False
             convert_to_numpy = False

         input_was_string = False
-        if isinstance(sentences, str) or not hasattr(sentences,
+        if isinstance(sentences, str) or not hasattr(sentences, "__len__"):
            sentences = [sentences]
            input_was_string = True

@@ -532,11 +539,11 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         inverse_permutation = np.argsort(permutation)
         sentences = [sentences[idx] for idx in permutation]

-        tokenizer_kwargs[
-        tokenizer_kwargs[
+        tokenizer_kwargs["padding"] = tokenizer_kwargs.get("padding", True)
+        tokenizer_kwargs["max_length"] = tokenizer_kwargs.get(
+            "max_length", self.tokenizer.init_kwargs.get("model_max_length", 8192)
         )
-        tokenizer_kwargs[
+        tokenizer_kwargs["truncation"] = tokenizer_kwargs.get("truncation", True)

         all_embeddings = []

@@ -550,11 +557,13 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             )
         else:
             range_iter = range(0, len(sentences), batch_size)
-        lora_arguments =
+        lora_arguments = (
+            {"adapter_mask": adapter_mask} if adapter_mask is not None else {}
+        )
         for i in range_iter:
             encoded_input = self.tokenizer(
                 sentences[i : i + batch_size],
-                return_tensors=
+                return_tensors="pt",
                 **tokenizer_kwargs,
             ).to(self.device)
             token_embs = self.forward(**encoded_input, **lora_arguments)[0]
@@ -562,18 +571,18 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             # Accumulate in fp32 to avoid overflow
             token_embs = token_embs.float()

-            if output_value ==
+            if output_value == "token_embeddings":
                 raise NotImplementedError
             elif output_value is None:
                 raise NotImplementedError
             else:
-                if self.config.emb_pooler ==
+                if self.config.emb_pooler == "cls":
                     embeddings = self.cls_pooling(
-                        token_embs, encoded_input[
+                        token_embs, encoded_input["attention_mask"]
                     )
                 else:
                     embeddings = self.mean_pooling(
-                        token_embs, encoded_input[
+                        token_embs, encoded_input["attention_mask"]
                     )

             if normalize_embeddings:
@@ -603,14 +612,16 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
     def truncate_embeddings(self, embeddings, truncate_dim):
         if not self.config.matryoshka_dimensions:
             logger.warning(
+                "Matryoshka embeddings are not supported, so dimension truncation will not be performed."
             )
             return embeddings
         elif truncate_dim in self.config.matryoshka_dimensions:
             return [tensor[:truncate_dim] for tensor in embeddings]
         else:
-            raise ValueError(
+            raise ValueError(
+                f"The provided `truncate_dim` value of {truncate_dim} is not supported. "
+                f"Supported dimensions are {self.config.matryoshka_dimensions}."
+            )

     def mean_pooling(
         self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor
@@ -622,10 +633,8 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             input_mask_expanded.sum(1), min=1e-9
         )

-    def cls_pooling(
-    ):
-        return token_embeddings[:,0]
+    def cls_pooling(self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor):
+        return token_embeddings[:, 0]

     @property
     def rotary_emb_base(self):
@@ -635,7 +644,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
     def rotary_emb_base(self, base):
         if not isinstance(base, (int, float)):
             raise TypeError("Base must be an integer or float")
-        logger.info(f
+        logger.info(f"Changing RoPE base value to {base}")
         for layer in self.encoder.layers:
             layer.mixer.rotary_emb.base = base
         self._rotary_emb_base = base
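For reference, a minimal usage sketch of the refactored `encode()` API and the new `rotary_emb_base` setter shown above; the repository id is a placeholder, not a name taken from this PR:

    from transformers import AutoModel

    # hypothetical repo id; substitute the actual model repository
    model = AutoModel.from_pretrained("org/jina-xlm-roberta", trust_remote_code=True)
    embeddings = model.encode(
        ["first sentence", "second sentence"],
        batch_size=32,
        convert_to_numpy=True,
    )
    model.rotary_emb_base = 20000  # the setter propagates the new RoPE base to every layer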
@@ -655,12 +664,12 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         layer output for these tokens.
         masked_tokens_mask: (batch, seqlen), dtype=torch.bool
         """
-        adapter_mask = kwargs.pop(
+        adapter_mask = kwargs.pop("adapter_mask", None)
         if kwargs:
             for key, value in kwargs.items():
                 if value is not None:
                     logger.warning(
+                        "Flash attention implementation does not support kwargs: %s",
                         key,
                     )

@@ -669,7 +678,10 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
         )

         hidden_states = self.embeddings(
-            input_ids,
+            input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            adapter_mask=adapter_mask,
         )
         # TD [2022-12:18]: Don't need to force residual in fp32
         # BERT puts embedding LayerNorm before embedding dropout.
@@ -693,12 +705,17 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             subset_mask = None

         sequence_output = self.encoder(
-            hidden_states,
+            hidden_states,
+            key_padding_mask=attention_mask,
+            subset_mask=subset_mask,
+            adapter_mask=adapter_mask,
         )

         if masked_tokens_mask is None:
             pooled_output = (
-                self.pooler(sequence_output, adapter_mask=adapter_mask)
+                self.pooler(sequence_output, adapter_mask=adapter_mask)
+                if self.pooler is not None
+                else None
             )
         else:
             # TD [2022-03-01]: the indexing here is very tricky.
@@ -712,7 +729,9 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             pool_input = sequence_output[first_col_mask[subset_mask]]
             sequence_output = sequence_output[masked_tokens_mask[subset_mask]]
             pooled_output = (
-                self.pooler(pool_input, pool=False, adapter_mask=adapter_mask)
+                self.pooler(pool_input, pool=False, adapter_mask=adapter_mask)
+                if self.pooler is not None
+                else None
             )

         if not return_dict:
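The `adapter_mask` plumbed through `forward()` above is a per-sequence tensor of adapter/task ids; the pooler hunk routes each row to `self.dense(..., task_id=...)` accordingly. A toy call, with purely illustrative id values and a placeholder repo id:

    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained("org/jina-xlm-roberta", trust_remote_code=True)  # hypothetical repo id
    encoded_input = model.tokenizer(
        ["a sentence", "another sentence", "a third one"], padding=True, return_tensors="pt"
    )
    adapter_mask = torch.tensor([0, 0, 1], dtype=torch.int32)  # one adapter/task id per sequence in the batch
    outputs = model(**encoded_input, adapter_mask=adapter_mask)
    token_embeddings = outputs.last_hidden_state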
@@ -817,103 +836,6 @@ class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
         )


-# class XLMRobertaForPreTraining(XLMRobertaPreTrainedModel):
-#     def __init__(self, config: XLMRobertaFlashConfig):
-#         super().__init__(config)
-#         # If dense_seq_output, we only need to pass the hidden states for the masked out tokens
-#         # (around 15%) to the classifier heads.
-#         self.dense_seq_output = getattr(config, "dense_seq_output", False)
-#         # If last_layer_subset, we only need the compute the last layer for a subset of tokens
-#         # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction).
-#         self.last_layer_subset = getattr(config, "last_layer_subset", False)
-#         if self.last_layer_subset:
-#             assert self.dense_seq_output, "last_layer_subset requires dense_seq_output"
-#         use_xentropy = getattr(config, "use_xentropy", False)
-#         if use_xentropy and CrossEntropyLoss is None:
-#             raise ImportError("xentropy_cuda is not installed")
-#         loss_cls = (
-#             nn.CrossEntropyLoss
-#             if not use_xentropy
-#             else partial(CrossEntropyLoss, inplace_backward=True)
-#         )
-#
-#         self.xlm = XLMRobertaModel(config)
-#         self.cls = XLMRobertaPreTrainingHeads(config)
-#         self.mlm_loss = loss_cls(ignore_index=0)
-#         self.nsp_loss = loss_cls(ignore_index=-1)
-#
-#         # Initialize weights and apply final processing
-#         self.apply(partial(_init_weights, initializer_range=config.initializer_range))
-#         self.tie_weights()
-#
-#     def tie_weights(self):
-#         self.cls.predictions.decoder.weight = self.xlm.embeddings.word_embeddings.weight
-#
-#     def forward(
-#         self,
-#         input_ids,
-#         position_ids=None,
-#         token_type_ids=None,
-#         attention_mask=None,
-#         labels=None,
-#         next_sentence_label=None,
-#     ):
-#         """
-#         If labels are provided, they must be 0 for masked out tokens (as specified in the attention
-#         mask).
-#         Outputs:
-#             if `labels` and `next_sentence_label` are not `None`:
-#                 Outputs the total_loss which is the sum of the masked language modeling loss and the next
-#                 sentence classification loss.
-#             if `labels` or `next_sentence_label` is `None`:
-#                 Outputs a tuple comprising
-#                 - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
-#                 - the next sentence classification logits of shape [batch_size, 2].
-#
-#         """
-#         masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None
-#         outputs = self.xlm(
-#             input_ids,
-#             position_ids=position_ids,
-#             token_type_ids=token_type_ids,
-#             attention_mask=attention_mask.bool() if attention_mask is not None else None,
-#             masked_tokens_mask=masked_tokens_mask,
-#         )
-#         sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output
-#         if self.dense_seq_output and labels is not None:
-#             masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten()
-#             if not self.last_layer_subset:
-#                 sequence_output = index_first_axis(
-#                     rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx
-#                 )
-#         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
-#
-#         total_loss = None
-#         if labels is not None and next_sentence_label is not None:
-#             if (
-#                 self.dense_seq_output and labels is not None
-#             ):  # prediction_scores are already flattened
-#                 masked_lm_loss = self.mlm_loss(
-#                     prediction_scores, labels.flatten()[masked_token_idx]
-#                 )
-#             else:
-#                 masked_lm_loss = self.mlm_loss(
-#                     rearrange(prediction_scores, "... v -> (...) v"),
-#                     rearrange(labels, "... -> (...)"),
-#                 )
-#             next_sentence_loss = self.nsp_loss(
-#                 rearrange(seq_relationship_score, "... t -> (...) t"),
-#                 rearrange(next_sentence_label, "... -> (...)"),
-#             )
-#             total_loss = masked_lm_loss.float() + next_sentence_loss.float()
-#
-#         return BertForPreTrainingOutput(
-#             loss=total_loss,
-#             prediction_logits=prediction_scores,
-#             seq_relationship_logits=seq_relationship_score,
-#         )
-
-
 def remap_state_dict(state_dict, config: PretrainedConfig):
     """
     Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
@@ -1065,47 +987,47 @@ def inv_remap_state_dict(state_dict, config: PretrainedConfig):
         if not last_layer_subset or d != (config.num_hidden_layers - 1):
             Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight")
             Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias")
-            state_dict[
-            state_dict[
-            state_dict[
-            state_dict[
-            state_dict[
-            state_dict[
+            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = (
+                Wqkv_weights[: Wqkv_weights.shape[0] // 3, :]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = (
+                Wqkv_weights[
+                    Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, :
+                ]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = (
+                Wqkv_weights[2 * Wqkv_weights.shape[0] // 3 :, :]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = (
+                Wqkv_biases[: Wqkv_biases.shape[0] // 3]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = (
+                Wqkv_biases[Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = (
+                Wqkv_biases[2 * Wqkv_biases.shape[0] // 3 :]
+            )
         else:
             Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight")
             Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight")
             Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias")
             Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias")
-            state_dict[
-            state_dict[
-            state_dict[
+            state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = (
+                Wq_weight
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = (
+                Wkv_weights[: Wkv_weights.shape[0] // 2, :]
+            )
+            state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = (
+                Wkv_weights[Wkv_weights.shape[0] // 2 :, :]
+            )
             state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias
             state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[
                 : Wkv_biases.shape[0] // 2
             ]
-            state_dict[
+            state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = (
+                Wkv_biases[Wkv_biases.shape[0] // 2 :]
+            )

     def inv_key_mapping_ln(key):
         key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key)
@@ -1294,4 +1216,4 @@ class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
             logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-        )
+        )
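The reformatted assignments in `inv_remap_state_dict` above simply split the fused `Wqkv` projection back into query, key and value tensors along thirds of the output dimension. A standalone sketch of that arithmetic, with an assumed hidden size:

    import torch

    hidden_size = 768
    Wqkv_weights = torch.randn(3 * hidden_size, hidden_size)  # fused q/k/v projection, flash-attn layout
    third = Wqkv_weights.shape[0] // 3
    q_weight = Wqkv_weights[:third, :]
    k_weight = Wqkv_weights[third : 2 * third, :]
    v_weight = Wqkv_weights[2 * third :, :]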
 from collections import OrderedDict
 from collections.abc import Sequence
 from functools import partial
+from typing import List, Optional, Tuple, Union

+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import AutoTokenizer, PretrainedConfig
+from transformers.modeling_outputs import (MaskedLMOutput,
+                                           SequenceClassifierOutput)
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.modeling_bert import (
+    BaseModelOutputWithPoolingAndCrossAttentions, BertForPreTrainingOutput)
+from transformers.models.xlm_roberta.modeling_xlm_roberta import \
+    XLMRobertaLMHead

 from .block import Block
+from .configuration_xlm_roberta import XLMRobertaFlashConfig
 from .embedding import XLMRobertaEmbeddings
 from .mha import MHA
 from .mlp import FusedMLP, Mlp
+from .xlm_padding import index_first_axis_residual, pad_input, unpad_input

 try:
     from flash_attn.ops.fused_dense import FusedDense

         return False
     if importlib.util.find_spec("flash_attn") is None:
         logger.warning(
+            "flash_attn is not installed. Using PyTorch native attention implementation."
         )
         return False
     return True

         fused_bias_fc=fused_bias_fc,
         use_flash_attn=use_flash_attn,
         return_residual=return_residual,
+        use_alibi=config.position_embedding_type == "alibi",
         **rotary_kwargs,
     )
     return mixer_cls

     def gradient_checkpointing(self, value):
         self._grad_checkpointing = value

+    def forward(
+        self, hidden_states, key_padding_mask=None, subset_mask=None, adapter_mask=None
+    ):
         """If subset_mask is not None, we only want output for the subset of the sequence.
         This means that we only compute the last layer output for these tokens.
         subset_mask: (batch, seqlen), dtype=torch.bool
         """
         if key_padding_mask is None or not self.use_flash_attn:
+            mixer_kwargs = {"adapter_mask": adapter_mask}
             if key_padding_mask is not None:
+                mixer_kwargs["key_padding_mask"] = key_padding_mask.bool()
             for layer in self.layers:
                 if self._grad_checkpointing:
                     hidden_states = torch.utils.checkpoint.checkpoint(

                 hidden_states = hidden_states[subset_mask]
         else:
             batch, seqlen = hidden_states.shape[:2]
+            hidden_states, indices, cu_seqlens, max_seqlen_in_batch, cu_adapter_mask = (
+                unpad_input(hidden_states, key_padding_mask, adapter_mask)
             )
+            mixer_kwargs = {
+                "cu_seqlens": cu_seqlens,
+                "max_seqlen": max_seqlen_in_batch,
+                "adapter_mask": cu_adapter_mask,
+            }

             if subset_mask is None:
                 for layer in self.layers:

         if adapter_mask is not None:
             unique_tasks = torch.unique(adapter_mask)
             pool_dtype = next(self.dense.parameters()).dtype
+            pooled_output = torch.empty(
+                first_token_tensor.shape[0],
+                self.dense.out_features,
+                dtype=pool_dtype,
+                device=first_token_tensor.device,
+            )
             for task_id in unique_tasks:
                 task_indices = (adapter_mask == task_id).nonzero(as_tuple=True)[0]
                 task_first_token_tensor = first_token_tensor[task_indices]
+                task_pooled_output = self.dense(
+                    task_first_token_tensor, task_id=task_id
+                )
                 pooled_output[task_indices] = task_pooled_output
         else:
             pooled_output = self.dense(first_token_tensor)

         *args,
         **kwargs,
     ):
+        if not "torch_dtype" in kwargs:
+            kwargs["torch_dtype"] = "auto"
         return super().from_pretrained(*args, **kwargs)
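The last hunk above makes `from_pretrained` default to `torch_dtype="auto"`, so the checkpoint's own dtype is used unless the caller overrides it. A hedged sketch with a placeholder repo id:

    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained("org/jina-xlm-roberta", trust_remote_code=True)  # checkpoint dtype via "auto"
    model_fp32 = AutoModel.from_pretrained(
        "org/jina-xlm-roberta", trust_remote_code=True, torch_dtype=torch.float32       # explicit override still wins
    )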
modeling_xlm_roberta_for_glue.py
DELETED
@@ -1,109 +0,0 @@
-from typing import Optional, Union, Tuple
-
-import torch
-from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
-from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, TokenClassifierOutput
-
-from .modeling_xlm_roberta import XLMRobertaPreTrainedModel, XLMRobertaModel
-from .configuration_xlm_roberta import XLMRobertaFlashConfig
-
-
-class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
-    def __init__(self, config: XLMRobertaFlashConfig):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-
-        self.roberta = XLMRobertaModel(config)
-        classifier_dropout = (
-            config.classifier_dropout
-            if config.classifier_dropout is not None
-            else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        assert head_mask is None
-        assert inputs_embeds is None
-        assert output_attentions is None
-        assert output_hidden_states is None
-        assert return_dict
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (
-                    labels.dtype == torch.long or labels.dtype == torch.int
-                ):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
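The classification head deleted here is not lost: `XLMRobertaForSequenceClassification` continues to live in modeling_xlm_roberta.py (see the final hunk of that file above). Assuming the repository's auto_map points at that module, loading it would look like this sketch, with a hypothetical repo id:

    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(
        "org/jina-xlm-roberta",   # hypothetical repo id
        trust_remote_code=True,
        num_labels=2,
    )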
pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cfa8fa7c7e120199548fe7149512c0adfe58f6bc13ce19f09b895aa25e8af910
-size 1113232188
rotary.py
CHANGED
@@ -1,4 +1,7 @@
+# This implementation was adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/layers/rotary.py
+# Commit id: 3566596ad867ee415dd3c12616dd50c610176f6c
+# Rotary varlen support from https://github.com/Dao-AILab/flash-attention/pull/556
+
 # Copyright (c) 2023, Tri Dao.

 import math
@@ -11,8 +14,9 @@ if torch.cuda.is_available():
     try:
         from flash_attn.ops.triton.rotary import apply_rotary
     except ImportError:
+
         def apply_rotary(*args, **kwargs):
-            raise RuntimeError(
+            raise RuntimeError("RoPE requires flash-attention to be installed")


 def rotate_half(x, interleaved=False):
@@ -21,7 +25,9 @@ def rotate_half(x, interleaved=False):
         return torch.cat((-x2, x1), dim=-1)
     else:
         x1, x2 = x[..., ::2], x[..., 1::2]
-        return rearrange(
+        return rearrange(
+            torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+        )


 def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
@@ -32,13 +38,20 @@ def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
     ro_dim = cos.shape[-1] * 2
     assert ro_dim <= x.shape[-1]
     cos, sin = (
-        cos[:x.shape[1]],
-        sin[:x.shape[1]],
+        cos[: x.shape[1]],
+        sin[: x.shape[1]],
     )
-    cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
-    sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
+    cos = repeat(
+        cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
+    sin = repeat(
+        sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+    )
     return torch.cat(
-        [
+        [
+            x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+            x[..., ro_dim:],
+        ],
         dim=-1,
     )

@@ -68,7 +81,9 @@ class ApplyRotaryEmb(torch.autograd.Function):
         )

         if isinstance(seqlen_offsets, int):
-            ctx.save_for_backward(
+            ctx.save_for_backward(
+                cos, sin, cu_seqlens
+            )  # Can't save int with save_for_backward
             ctx.seqlen_offsets = seqlen_offsets
         else:
             ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
@@ -336,7 +351,9 @@ class ApplyRotaryEmbKV_(torch.autograd.Function):
             max_seqlen=max_seqlen,
         )
         if isinstance(seqlen_offsets, int):
-            ctx.save_for_backward(
+            ctx.save_for_backward(
+                cos, sin, cu_seqlens
+            )  # Can't save int with save_for_backward
             ctx.seqlen_offsets = seqlen_offsets
         else:
             ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets)
@@ -451,7 +468,8 @@ class RotaryEmbedding(torch.nn.Module):
         self.interleaved = interleaved
         self.scale_base = scale_base
         scale = (
-            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
+            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
+            / (1.4 * dim)
            if scale_base is not None
            else None
        )
@@ -477,7 +495,10 @@ class RotaryEmbedding(torch.nn.Module):
    def _compute_inv_freq(self, device=None):
        return 1.0 / (
            self.base
-            ** (
+            ** (
+                torch.arange(0, self.dim, 2, device=device, dtype=torch.float32)
+                / self.dim
+            )
        )

    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
@@ -516,10 +537,14 @@ class RotaryEmbedding(torch.nn.Module):
            self._sin_cached = torch.sin(freqs).to(dtype)
        else:
            power = (
-                torch.arange(
+                torch.arange(
+                    seqlen, dtype=self.scale.dtype, device=self.scale.device
+                )
                - seqlen // 2
            ) / self.scale_base
-            scale = self.scale.to(device=power.device) ** rearrange(
+            scale = self.scale.to(device=power.device) ** rearrange(
+                power, "s -> s 1"
+            )
            # We want the multiplication by scale to happen in fp32
            self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
            self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
@@ -550,7 +575,9 @@ class RotaryEmbedding(torch.nn.Module):
        if max_seqlen is not None:
            self._update_cos_sin_cache(max_seqlen, device=qkv.device, dtype=qkv.dtype)
        elif isinstance(seqlen_offset, int):
-            self._update_cos_sin_cache(
+            self._update_cos_sin_cache(
+                seqlen + seqlen_offset, device=qkv.device, dtype=qkv.dtype
+            )
        if kv is None:
            if self.scale is None:
                return apply_rotary_emb_qkv_(
@@ -606,4 +633,4 @@ class RotaryEmbedding(torch.nn.Module):
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )
-        return q, kv
+        return q, kv
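A self-contained sketch of the pure-PyTorch fallback path, `apply_rotary_emb_torch`, as shown in the diff above; the import path is a placeholder and the base of 10000 mirrors the usual RoPE default, which is an assumption here:

    import torch
    from rotary import apply_rotary_emb_torch  # placeholder import path

    batch, seqlen, nheads, headdim = 2, 16, 12, 64
    x = torch.randn(batch, seqlen, nheads, headdim)

    # Build a cos/sin cache the same way _compute_inv_freq / _update_cos_sin_cache do
    inv_freq = 1.0 / (10000 ** (torch.arange(0, headdim, 2, dtype=torch.float32) / headdim))
    freqs = torch.outer(torch.arange(seqlen, dtype=torch.float32), inv_freq)  # (seqlen, headdim // 2)
    x_rotated = apply_rotary_emb_torch(x, torch.cos(freqs), torch.sin(freqs))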
stochastic_depth.py
CHANGED
@@ -34,7 +34,7 @@

 import torch
 import torch.fx
-from torch import
+from torch import Tensor, nn


 def stochastic_depth(
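stochastic_depth.py only gains the `nn` import in this PR. Assuming the torchvision-style signature `stochastic_depth(input, p, mode, training)`, which this module is adapted from, usage looks like:

    import torch

    x = torch.randn(4, 16)
    out = stochastic_depth(x, p=0.1, mode="row", training=True)  # zeroes whole rows (residual branches) with probability 0.1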
tokenizer.json
DELETED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
DELETED
@@ -1,4 +0,0 @@
-{
-  "model_max_length": 8194,
-  "tokenizer_class": "XLMRobertaTokenizer"
-}
xlm_padding.py
CHANGED
@@ -18,7 +18,9 @@ class IndexFirstAxis(torch.autograd.Function):
         # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
         # return input[indices]
         return torch.gather(
-            rearrange(input, "b ... -> b (...)"),
+            rearrange(input, "b ... -> b (...)"),
+            0,
+            repeat(indices, "z -> z d", d=second_dim),
         ).reshape(-1, *other_shape)

     @staticmethod
@@ -34,7 +36,9 @@ class IndexFirstAxis(torch.autograd.Function):
         )
         # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
         # grad_input[indices] = grad_output
-        grad_input.scatter_(
+        grad_input.scatter_(
+            0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output
+        )
         return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


@@ -112,9 +116,15 @@ def unpad_input(hidden_states, attention_mask, adapter_mask=None):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(
+    cu_seqlens = F.pad(
+        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
+    )

-    cu_adapter_mask =
+    cu_adapter_mask = (
+        torch.repeat_interleave(adapter_mask, cu_seqlens[1:] - cu_seqlens[:-1])
+        if adapter_mask is not None
+        else None
+    )

     # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
     # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
@@ -184,14 +194,18 @@ def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_leng
     """
     length = attention_mask_in_length.sum(dim=-1)
     seqlen = attention_mask_in_length.size(-1)
-    attention_mask_2d = torch.arange(
-    real_indices_idx = torch.nonzero(
+    attention_mask_2d = torch.arange(
+        seqlen, device=length.device, dtype=length.dtype
+    ).expand(len(length), seqlen) < length.unsqueeze(1)
+    real_indices_idx = torch.nonzero(
+        attention_mask_in_length.flatten(), as_tuple=False
+    ).flatten()
     seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx]
     indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten()
     max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(
+    cu_seqlens = F.pad(
+        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
+    )
     # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
     # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
     # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
@@ -219,4 +233,4 @@ def pad_input(hidden_states, indices, batch, seqlen):
     # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype)
     # output[indices] = hidden_states
     output = index_put_first_axis(hidden_states, indices, batch * seqlen)
-    return rearrange(output, "(b s) ... -> b s ...", b=batch)
+    return rearrange(output, "(b s) ... -> b s ...", b=batch)
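A toy round trip through the `unpad_input`/`pad_input` pair changed above; the fifth return value is the new per-token adapter mask, which is `None` when no `adapter_mask` is passed, and the import path is a placeholder:

    import torch
    from xlm_padding import pad_input, unpad_input  # placeholder import path

    hidden = torch.randn(2, 4, 8)                   # (batch, seqlen, dim) with padding
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 0, 0, 0]])

    tokens, indices, cu_seqlens, max_seqlen, cu_adapter_mask = unpad_input(hidden, attention_mask)
    # tokens: (4, 8) holding only the real tokens; cu_seqlens: tensor([0, 3, 4], dtype=torch.int32)
    repadded = pad_input(tokens, indices, 2, 4)     # back to (2, 4, 8); padded positions are zeros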