small fix with torch.finfo

modeling_lsg_camembert.py  (+90 -121)  CHANGED
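The change replaces hard-coded masking constants with `torch.finfo(dtype).min`, the most negative value representable in the tensor's floating-point dtype, so the masking constant always matches the dtype actually in use. A minimal sketch (not part of the patch) of what that value is per dtype:

```python
import torch

# torch.finfo reports the numeric limits of a floating-point dtype.
# The hunks below use .min as the additive masking value.
print(torch.finfo(torch.float32).min)   # about -3.4e38
print(torch.finfo(torch.float16).min)   # -65504.0
print(torch.finfo(torch.bfloat16).min)  # about -3.39e38
```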
Old side of the diff (removed lines are marked "-"; a few changed lines are cut off at the point of change in this view):

@@ -1,5 +1,5 @@
   1     from logging import warn
   2  -  from transformers.models.
   3     import torch
   4     import torch.nn as nn
   5     from transformers.models.camembert.configuration_camembert import CamembertConfig

@@ -156,7 +156,7 @@ class BaseAttentionProduct(nn.Module):
 156     # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
 157     attention_scores = attention_scores + attention_mask
 158     del attention_mask
 159  -
 160     # Normalize the attention scores to probabilities.
 161     attention_probs = nn.Softmax(dim=-1)(attention_scores)
 162

@@ -198,7 +198,7 @@ class CausalAttentionProduct(nn.Module):
 198         diagonal=-1
 199     )
 200     causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
 201  -  attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
 202
 203     del attention_mask
 204

@@ -296,7 +296,7 @@ class LSGAttentionProduct(nn.Module):
 296     ).transpose(-1, -2)
 297     del sparse_mask
 298     del global_mask
 299  -
 300     # expect (..., t, d) shape
 301     # Compute attention
 302     context_layer = self.attention(

@@ -391,7 +391,7 @@ class LSGAttentionProduct(nn.Module):
 391     return x.reshape(*x.size()[:-2], n_blocks, -1, d)
 392
 393
 394  -  class LSGCamembertEmbeddings(RobertaEmbeddings):
 395
 396     def __init__(self, config):
 397         super().__init__(config)

@@ -447,7 +447,7 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
 447     return embeddings
 448
 449
 450  -  class LSGAttention(
 451
 452     def __init__(self, config):
 453
@@ -546,7 +546,8 @@ class LSGSelfAttention(BaseSelfAttention):
 546     keys = keys.sum(dim=-2) / (mask + 1e-6)
 547     values = values.sum(dim=-2) / (mask + 1e-6)
 548
 549  -  mask = (1. - mask.clamp(0, 1))
 550     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 551
 552     def get_sparse_tokens_with_stride(self, keys, values, mask):

@@ -611,7 +612,8 @@ class LSGSelfAttention(BaseSelfAttention):
 611     keys /= mask + 1e-8
 612     values /= mask + 1e-8
 613
 614  -  mask = (1. - mask.clamp(0, 1))
 615
 616     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 617

@@ -879,7 +881,7 @@ class LSGSelfAttention(BaseSelfAttention):
 879     return x.reshape(n, h, -1, chunk_size, d)
 880
 881
 882  -  class LSGCamembertLayer(RobertaLayer):
 883
 884     def __init__(self, config):
 885

@@ -891,7 +893,7 @@ class LSGCamembertLayer(RobertaLayer):
 891     self.crossattention = LSGAttention(config)
 892
 893
 894  -  class LSGCamembertEncoder(RobertaEncoder):
 895
 896     def __init__(self, config):
 897
@@ -899,8 +901,73 @@ class LSGCamembertEncoder(RobertaEncoder):
 899
 900     self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
 901
 902
 903  -  class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
 904     """
 905     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 906     models.
@@ -909,11 +976,11 @@ class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
 909     config_class = LSGCamembertConfig
 910
 911     def _set_gradient_checkpointing(self, module, value=False):
 912  -      if isinstance(module, (
 913             module.gradient_checkpointing = value
 914
 915
 916  -  class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 917     """
 918     This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
 919     documentation alongside usage examples.

@@ -926,19 +993,9 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 926
 927     LSGCamembertPreTrainedModel.__init__(self, config)
 928
 929  -  assert hasattr(config, "num_global_tokens")
 930  -  self.num_global_tokens = config.num_global_tokens
 931  -  self.pad_idx = config.pad_token_id
 932  -
 933  -  assert hasattr(config, "block_size") and hasattr(config, "adaptive")
 934  -  self.block_size = config.block_size
 935  -  self.adaptive = config.adaptive
 936  -  self.mask_first_token = config.mask_first_token
 937  -  self.pool_with_global = config.pool_with_global
 938  -
 939     self.embeddings = LSGCamembertEmbeddings(config)
 940     self.encoder = LSGCamembertEncoder(config)
 941  -  self.pooler =
 942
 943     if config.add_cross_attention:
 944         logger.warning(
@@ -948,94 +1005,6 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 948     # Initialize weights and apply final processing
 949     self.post_init()
 950
 951  -  def forward(
 952  -      self,
 953  -      input_ids=None,
 954  -      attention_mask=None,
 955  -      token_type_ids=None,
 956  -      position_ids=None,
 957  -      head_mask=None,
 958  -      inputs_embeds=None,
 959  -      encoder_hidden_states=None,
 960  -      encoder_attention_mask=None,
 961  -      past_key_values=None,
 962  -      use_cache=None,
 963  -      output_attentions=None,
 964  -      output_hidden_states=None,
 965  -      return_dict=None
 966  -  ):
 967  -
 968  -      output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 969  -      output_hidden_states = (
 970  -          output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 971  -      )
 972  -      return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 973  -
 974  -      inputs_ = input_ids if input_ids is not None else inputs_embeds
 975  -      n, t = inputs_.size()[:2]
 976  -
 977  -      if attention_mask is None:
 978  -          attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
 979  -      if self.mask_first_token:
 980  -          attention_mask[:,0] = 0
 981  -
 982  -      b = self.block_size * 2
 983  -      pad = t % self.block_size
 984  -
 985  -      # Check if t is multiple of block_size and pad
 986  -      if self.adaptive and t > b and pad > 0:
 987  -          pad_length = self.block_size - pad
 988  -          if input_ids is not None:
 989  -              input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=self.pad_idx)
 990  -          else:
 991  -              inputs_embeds = torch.nn.functional.pad(inputs_embeds.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
 992  -
 993  -          attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
 994  -
 995  -          if token_type_ids is not None:
 996  -              token_type_ids = torch.nn.functional.pad(token_type_ids, (0, pad_length), value=0)
 997  -          if position_ids is not None:
 998  -              position_ids = torch.nn.functional.pad(position_ids, (0, pad_length), value=0)
 999  -
1000  -      n, t_ = attention_mask.size()
1001  -
1002  -      encoder_outputs = super().forward(
1003  -          input_ids=input_ids,
1004  -          attention_mask=attention_mask,
1005  -          token_type_ids=token_type_ids,
1006  -          position_ids=position_ids,
1007  -          head_mask=head_mask,
1008  -          inputs_embeds=inputs_embeds,
1009  -          encoder_hidden_states=encoder_hidden_states,
1010  -          encoder_attention_mask=encoder_attention_mask,
1011  -          past_key_values=past_key_values,
1012  -          use_cache=use_cache,
1013  -          output_attentions=output_attentions,
1014  -          output_hidden_states=output_hidden_states,
1015  -          return_dict=return_dict
1016  -      )
1017  -
1018  -      sequence_output = encoder_outputs[0]
1019  -      if self.pool_with_global:
1020  -          sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
1021  -
1022  -      diff = t - t_
1023  -      n, _, d = sequence_output.size()
1024  -      sequence_output = sequence_output[..., self.num_global_tokens:, :]
1025  -
1026  -      # Adapt sequence to initial shape
1027  -      if diff < 0:
1028  -          sequence_output = sequence_output[:, :t]
1029  -
1030  -      pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1031  -
1032  -      if not return_dict:
1033  -          return (sequence_output, pooled_output) + encoder_outputs[1:]
1034  -
1035  -      encoder_outputs.last_hidden_state = sequence_output
1036  -      encoder_outputs.pooler_output = pooled_output
1037  -      return encoder_outputs
1038  -
1039     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1040
1041         # Do not rely on original triangular mask from BERT/RoBERTa for causalLM
@@ -1054,7 +1023,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
1054     return extended_attention_mask
1055
1056
1057  -  class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1058
1059     _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1060     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]

@@ -1068,7 +1037,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1068     logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1069
1070     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1071  -  self.lm_head =
1072
1073     # The LM head weights require special treatment only when they are tied with the word embeddings
1074     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

@@ -1077,7 +1046,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1077     self.post_init()
1078
1079
1080  -  class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1081     """
1082     This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1083     documentation alongside usage examples.

@@ -1098,7 +1067,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1098     )
1099
1100     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1101  -  self.lm_head =
1102
1103     # The LM head weights require special treatment only when they are tied with the word embeddings
1104     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

@@ -1107,7 +1076,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1107     self.post_init()
1108
1109
1110  -  class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel,
1111     """
1112     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1113     appropriate documentation alongside usage examples.

@@ -1123,13 +1092,13 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Roberta
1123     self.config = config
1124
1125     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1126  -  self.classifier =
1127
1128     # Initialize weights and apply final processing
1129     self.post_init()
1130
1131
1132  -  class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel,
1133     """
1134     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1135     appropriate documentation alongside usage examples.

@@ -1149,7 +1118,7 @@ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMulti
1149     self.post_init()
1150
1151
1152  -  class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel,
1153     """
1154     This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1155     appropriate documentation alongside usage examples.

@@ -1175,7 +1144,7 @@ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, RobertaFor
1175     self.post_init()
1176
1177
1178  -  class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel,
1179     """
1180     This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1181     appropriate documentation alongside usage examples.
New side of the diff (added lines are marked "+"):

   1     from logging import warn
   2  +  from transformers.models.camembert.modeling_camembert import *
   3     import torch
   4     import torch.nn as nn
   5     from transformers.models.camembert.configuration_camembert import CamembertConfig

 156     # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
 157     attention_scores = attention_scores + attention_mask
 158     del attention_mask
 159  +
 160     # Normalize the attention scores to probabilities.
 161     attention_probs = nn.Softmax(dim=-1)(attention_scores)
 162
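Lines 156-161 implement additive masking: the mask is added to the raw scores before the softmax, so masked keys end up with a probability close to zero. A minimal, self-contained sketch of the same pattern; the shapes, the `keep` convention, and the broadcasting are illustrative assumptions, not code from this file:

```python
import torch
import torch.nn as nn

# (batch, heads, queries, keys) attention scores plus an additive mask that is
# 0 for visible keys and finfo.min for padded ones.
scores = torch.randn(2, 12, 8, 8)
keep = torch.tensor([1, 1, 1, 1, 1, 1, 0, 0])                # 1 = real token, 0 = padding (assumed convention)
attention_mask = (1.0 - keep.float()) * torch.finfo(scores.dtype).min
attention_mask = attention_mask.view(1, 1, 1, -1)            # broadcast over batch, heads, queries

scores = scores + attention_mask                              # as in lines 156-157
probs = nn.Softmax(dim=-1)(scores)                            # padded keys get ~0 probability
```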
 198         diagonal=-1
 199     )
 200     causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
 201  +  attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
 202
 203     del attention_mask
 204
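The causal mask is already scaled by `torch.finfo(attention_scores.dtype).min` (line 200); the fix writes it into the bottom-right `causal_shape` block starting one column later and drops the first column of the mask (`causal_mask[:, 1:]`), so the first key column of that block is no longer overwritten. A hedged sketch of the underlying pattern, building a causal additive bias and adding it to a full score matrix (the LSG block indexing is deliberately omitted):

```python
import torch

def causal_bias(t, dtype):
    # Strictly-upper-triangular positions are "future" keys: give them the most
    # negative representable value so the softmax assigns them ~0 probability.
    future = torch.triu(torch.ones(t, t, dtype=torch.bool), diagonal=1)
    return future.to(dtype) * torch.finfo(dtype).min

scores = torch.randn(2, 12, 6, 6)
scores = scores + causal_bias(6, scores.dtype)
probs = torch.softmax(scores, dim=-1)  # row i attends only to keys 0..i
```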
 296     ).transpose(-1, -2)
 297     del sparse_mask
 298     del global_mask
 299  +
 300     # expect (..., t, d) shape
 301     # Compute attention
 302     context_layer = self.attention(

 391     return x.reshape(*x.size()[:-2], n_blocks, -1, d)
 392
 393
 394  +  class LSGCamembertEmbeddings(CamembertEmbeddings):
 395
 396     def __init__(self, config):
 397         super().__init__(config)

 447     return embeddings
 448
 449
 450  +  class LSGAttention(CamembertAttention):
 451
 452     def __init__(self, config):
 453

 546     keys = keys.sum(dim=-2) / (mask + 1e-6)
 547     values = values.sum(dim=-2) / (mask + 1e-6)
 548
 549  +  mask = (1. - mask.clamp(0, 1))
 550  +  mask *= torch.finfo(mask.dtype).min
 551     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 552
 553     def get_sparse_tokens_with_stride(self, keys, values, mask):
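Lines 546-551 average keys and values inside each block and then convert the block count mask into an additive mask: 0 where the block contains at least one real token, `torch.finfo(...).min` where it is entirely padding (the same two added lines appear again at 615-616). A hedged sketch of that pattern; the tensor names, shapes, and padding convention are illustrative assumptions:

```python
import torch

# Average keys/values inside each block, then build the additive block mask.
n, h, blocks, block_size, d = 2, 12, 4, 8, 64
keys = torch.randn(n, h, blocks, block_size, d)
values = torch.randn(n, h, blocks, block_size, d)
pad = torch.ones(n, h, blocks, block_size, 1)    # 1 = real token, 0 = padding (assumed)
pad[..., -1, -3:, :] = 0                          # mark the last 3 slots of the last block as padding

count = pad.sum(dim=-2)                           # tokens per block, shape (n, h, blocks, 1)
keys = (keys * pad).sum(dim=-2) / (count + 1e-6)  # mean over real tokens only
values = (values * pad).sum(dim=-2) / (count + 1e-6)

add_mask = (1. - count.clamp(0, 1))               # 0 if the block has any token, 1 if fully padded
add_mask *= torch.finfo(add_mask.dtype).min       # additive form, as in lines 549-550 / 615-616
```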
 612     keys /= mask + 1e-8
 613     values /= mask + 1e-8
 614
 615  +  mask = (1. - mask.clamp(0, 1))
 616  +  mask *= torch.finfo(mask.dtype).min
 617
 618     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 619
 881     return x.reshape(n, h, -1, chunk_size, d)
 882
 883
 884  +  class LSGCamembertLayer(CamembertLayer):
 885
 886     def __init__(self, config):
 887

 893         self.crossattention = LSGAttention(config)
 894
 895
 896  +  class LSGCamembertEncoder(CamembertEncoder):
 897
 898     def __init__(self, config):
 899
 901
 902         self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
 903
 904  +      assert hasattr(config, "num_global_tokens")
 905  +      self.num_global_tokens = config.num_global_tokens
 906  +      self.pad_idx = config.pad_token_id
 907  +
 908  +      assert hasattr(config, "block_size") and hasattr(config, "adaptive")
 909  +      self.block_size = config.block_size
 910  +      self.adaptive = config.adaptive
 911  +      self.mask_first_token = config.mask_first_token
 912  +      self.pool_with_global = config.pool_with_global
 913  +
 914  +  def forward(
 915  +      self,
 916  +      hidden_states: torch.Tensor,
 917  +      attention_mask: Optional[torch.FloatTensor] = None,
 918  +      head_mask: Optional[torch.FloatTensor] = None,
 919  +      encoder_hidden_states: Optional[torch.FloatTensor] = None,
 920  +      encoder_attention_mask: Optional[torch.FloatTensor] = None,
 921  +      past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
 922  +      use_cache: Optional[bool] = None,
 923  +      output_attentions: Optional[bool] = False,
 924  +      output_hidden_states: Optional[bool] = False,
 925  +      return_dict: Optional[bool] = True,
 926  +  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
 927  +
 928  +      mask_value = torch.finfo(attention_mask.dtype).min
 929  +      n, _, __, t = attention_mask.size()
 930  +
 931  +      if not (self.config.is_decoder and encoder_hidden_states is not None):
 932  +          b = self.block_size * 2
 933  +          pad = t % self.block_size
 934  +
 935  +          # Check if t is multiple of block_size and pad
 936  +          if self.adaptive and t > b and pad > 0:
 937  +              pad_length = self.block_size - pad
 938  +              hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
 939  +              attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=mask_value)
 940  +
 941  +          if self.mask_first_token:
 942  +              attention_mask[..., 0] = mask_value
 943  +
 944  +      encoder_outputs = super().forward(
 945  +          hidden_states=hidden_states,
 946  +          attention_mask=attention_mask,
 947  +          head_mask=head_mask,
 948  +          encoder_hidden_states=encoder_hidden_states,
 949  +          encoder_attention_mask=encoder_attention_mask,
 950  +          past_key_values=past_key_values,
 951  +          use_cache=use_cache,
 952  +          output_attentions=output_attentions,
 953  +          output_hidden_states=output_hidden_states,
 954  +          return_dict=return_dict
 955  +      )
 956  +
 957  +      sequence_output = encoder_outputs[0]
 958  +      if self.pool_with_global:
 959  +          sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 960  +
 961  +      # Adapt sequence to initial shape
 962  +      sequence_output = sequence_output[..., self.num_global_tokens: t + self.num_global_tokens, :]
 963
 964  +      if not return_dict:
 965  +          return (sequence_output, ) + encoder_outputs[1:]
 966  +
 967  +      encoder_outputs.last_hidden_state = sequence_output
 968  +      return encoder_outputs
 969  +
 970  +  class LSGCamembertPreTrainedModel(CamembertPreTrainedModel):
 971     """
 972     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 973     models.
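Lines 928-942 move the length adaptation into the encoder: the hidden states are padded up to a multiple of `block_size`, and the padded key positions are switched off by extending the 4D additive mask with the dtype's minimum. A simplified, hedged sketch of just that padding step; it skips the `adaptive` / `t > 2 * block_size` guard and uses made-up shapes:

```python
import torch
import torch.nn.functional as F

block_size = 128
hidden_states = torch.randn(2, 300, 768)        # (batch, t, d), t not a multiple of block_size
attention_mask = torch.zeros(2, 1, 1, 300)      # extended additive mask: 0 = visible

mask_value = torch.finfo(attention_mask.dtype).min
t = hidden_states.size(1)
pad_length = (block_size - t % block_size) % block_size
if pad_length > 0:
    # Pad the sequence dimension with zeros and the mask with the minimum value,
    # so the extra positions never receive attention.
    hidden_states = F.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
    attention_mask = F.pad(attention_mask, (0, pad_length), value=mask_value)

# hidden_states: (2, 384, 768); attention_mask: (2, 1, 1, 384)
```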
 976     config_class = LSGCamembertConfig
 977
 978     def _set_gradient_checkpointing(self, module, value=False):
 979  +      if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
 980             module.gradient_checkpointing = value
 981
 982
 983  +  class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
 984     """
 985     This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
 986     documentation alongside usage examples.
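`_set_gradient_checkpointing` now matches both `CamembertEncoder` and `LSGCamembertEncoder`, so the standard Transformers toggle reaches the LSG encoder. A hedged usage sketch; the checkpoint path is a placeholder and it assumes the model ships this modeling file as remote code:

```python
from transformers import AutoModel

# "path/to/lsg-camembert-checkpoint" is a placeholder, not a real repository name.
model = AutoModel.from_pretrained("path/to/lsg-camembert-checkpoint", trust_remote_code=True)
model.gradient_checkpointing_enable()  # routed through _set_gradient_checkpointing(value=True)
```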
 993
 994     LSGCamembertPreTrainedModel.__init__(self, config)
 995
 996     self.embeddings = LSGCamembertEmbeddings(config)
 997     self.encoder = LSGCamembertEncoder(config)
 998  +  self.pooler = CamembertPooler(config) if add_pooling_layer else None
 999
1000     if config.add_cross_attention:
1001         logger.warning(

1005     # Initialize weights and apply final processing
1006     self.post_init()
1007
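The model and encoder only read `num_global_tokens`, `pad_token_id`, `block_size`, `adaptive`, `mask_first_token` and `pool_with_global` off the config; the constructor of `LSGCamembertConfig` (referenced as `config_class` above) is not part of this diff, so the keyword arguments below are an assumption about its signature:

```python
# Hedged sketch: assumes LSGCamembertConfig accepts these attributes as keyword
# arguments; the code shown in the diff only requires them to exist on the config.
config = LSGCamembertConfig(
    block_size=128,
    adaptive=True,
    num_global_tokens=1,
    mask_first_token=False,
    pool_with_global=True,
)
model = LSGCamembertModel(config)
```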
1008     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1009
1010         # Do not rely on original triangular mask from BERT/RoBERTa for causalLM

1023     return extended_attention_mask
1024
1025
1026  +  class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
1027
1028     _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1029     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1037     logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1038
1039     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1040  +  self.lm_head = CamembertLMHead(config)
1041
1042     # The LM head weights require special treatment only when they are tied with the word embeddings
1043     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

1046     self.post_init()
1047
1048
1049  +  class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, CamembertForMaskedLM):
1050     """
1051     This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1052     documentation alongside usage examples.

1067     )
1068
1069     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1070  +  self.lm_head = CamembertLMHead(config)
1071
1072     # The LM head weights require special treatment only when they are tied with the word embeddings
1073     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

1076     self.post_init()
1077
1078
1079  +  class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, CamembertForSequenceClassification):
1080     """
1081     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1082     appropriate documentation alongside usage examples.
1092     self.config = config
1093
1094     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1095  +  self.classifier = CamembertClassificationHead(config)
1096
1097     # Initialize weights and apply final processing
1098     self.post_init()
1099
1100
1101  +  class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, CamembertForMultipleChoice):
1102     """
1103     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1104     appropriate documentation alongside usage examples.

1118     self.post_init()
1119
1120
1121  +  class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, CamembertForTokenClassification):
1122     """
1123     This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1124     appropriate documentation alongside usage examples.

1144     self.post_init()
1145
1146
1147  +  class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, CamembertForQuestionAnswering):
1148     """
1149     This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1150     appropriate documentation alongside usage examples.