small fix with torch.finfo

modeling_lsg_camembert.py  (+90 -121)  CHANGED
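The change replaces hard-coded masking constants with `torch.finfo(dtype).min`, the most negative value representable in the tensor's floating-point dtype, so the masking constant always matches the dtype actually in use. A minimal sketch (not part of the patch) of what that value is per dtype:

```python
import torch

# torch.finfo reports the numeric limits of a floating-point dtype.
# The hunks below use .min as the additive masking value.
print(torch.finfo(torch.float32).min)   # about -3.4e38
print(torch.finfo(torch.float16).min)   # -65504.0
print(torch.finfo(torch.bfloat16).min)  # about -3.39e38
```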
Old side of the diff (removed lines are marked "-"; a few changed lines are cut off at the point of change in this view):

@@ -1,5 +1,5 @@
   1     from logging import warn
   2  -  from transformers.models.
   3     import torch
   4     import torch.nn as nn
   5     from transformers.models.camembert.configuration_camembert import CamembertConfig

@@ -156,7 +156,7 @@ class BaseAttentionProduct(nn.Module):
 156     # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
 157     attention_scores = attention_scores + attention_mask
 158     del attention_mask
 159  -
 160     # Normalize the attention scores to probabilities.
 161     attention_probs = nn.Softmax(dim=-1)(attention_scores)
 162

@@ -198,7 +198,7 @@ class CausalAttentionProduct(nn.Module):
 198         diagonal=-1
 199     )
 200     causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
 201  -  attention_scores[..., -causal_shape[0]:, -causal_shape[1]:] = causal_mask
 202
 203     del attention_mask
 204

@@ -296,7 +296,7 @@ class LSGAttentionProduct(nn.Module):
 296     ).transpose(-1, -2)
 297     del sparse_mask
 298     del global_mask
 299  -
 300     # expect (..., t, d) shape
 301     # Compute attention
 302     context_layer = self.attention(

@@ -391,7 +391,7 @@ class LSGAttentionProduct(nn.Module):
 391     return x.reshape(*x.size()[:-2], n_blocks, -1, d)
 392
 393
 394  -  class LSGCamembertEmbeddings(RobertaEmbeddings):
 395
 396     def __init__(self, config):
 397         super().__init__(config)

@@ -447,7 +447,7 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
 447     return embeddings
 448
 449
 450  -  class LSGAttention(
 451
 452     def __init__(self, config):
 453
@@ -546,7 +546,8 @@ class LSGSelfAttention(BaseSelfAttention):
 546     keys = keys.sum(dim=-2) / (mask + 1e-6)
 547     values = values.sum(dim=-2) / (mask + 1e-6)
 548
 549  -  mask = (1. - mask.clamp(0, 1))
 550     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 551
 552     def get_sparse_tokens_with_stride(self, keys, values, mask):

@@ -611,7 +612,8 @@ class LSGSelfAttention(BaseSelfAttention):
 611     keys /= mask + 1e-8
 612     values /= mask + 1e-8
 613
 614  -  mask = (1. - mask.clamp(0, 1))
 615
 616     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 617

@@ -879,7 +881,7 @@ class LSGSelfAttention(BaseSelfAttention):
 879     return x.reshape(n, h, -1, chunk_size, d)
 880
 881
 882  -  class LSGCamembertLayer(RobertaLayer):
 883
 884     def __init__(self, config):
 885

@@ -891,7 +893,7 @@ class LSGCamembertLayer(RobertaLayer):
 891     self.crossattention = LSGAttention(config)
 892
 893
 894  -  class LSGCamembertEncoder(RobertaEncoder):
 895
 896     def __init__(self, config):
 897
@@ -899,8 +901,73 @@ class LSGCamembertEncoder(RobertaEncoder):
 899
 900     self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
 901
 902
 903  -  class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
 904     """
 905     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 906     models.
@@ -909,11 +976,11 @@ class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
 909     config_class = LSGCamembertConfig
 910
 911     def _set_gradient_checkpointing(self, module, value=False):
 912  -      if isinstance(module, (
 913             module.gradient_checkpointing = value
 914
 915
 916  -  class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 917     """
 918     This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
 919     documentation alongside usage examples.

@@ -926,19 +993,9 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 926
 927     LSGCamembertPreTrainedModel.__init__(self, config)
 928
 929  -  assert hasattr(config, "num_global_tokens")
 930  -  self.num_global_tokens = config.num_global_tokens
 931  -  self.pad_idx = config.pad_token_id
 932  -
 933  -  assert hasattr(config, "block_size") and hasattr(config, "adaptive")
 934  -  self.block_size = config.block_size
 935  -  self.adaptive = config.adaptive
 936  -  self.mask_first_token = config.mask_first_token
 937  -  self.pool_with_global = config.pool_with_global
 938  -
 939     self.embeddings = LSGCamembertEmbeddings(config)
 940     self.encoder = LSGCamembertEncoder(config)
 941  -  self.pooler =
 942
 943     if config.add_cross_attention:
 944         logger.warning(
@@ -948,94 +1005,6 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 948     # Initialize weights and apply final processing
 949     self.post_init()
 950
 951  -  def forward(
 952  -      self,
 953  -      input_ids=None,
 954  -      attention_mask=None,
 955  -      token_type_ids=None,
 956  -      position_ids=None,
 957  -      head_mask=None,
 958  -      inputs_embeds=None,
 959  -      encoder_hidden_states=None,
 960  -      encoder_attention_mask=None,
 961  -      past_key_values=None,
 962  -      use_cache=None,
 963  -      output_attentions=None,
 964  -      output_hidden_states=None,
 965  -      return_dict=None
 966  -  ):
 967  -
 968  -      output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 969  -      output_hidden_states = (
 970  -          output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 971  -      )
 972  -      return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 973  -
 974  -      inputs_ = input_ids if input_ids is not None else inputs_embeds
 975  -      n, t = inputs_.size()[:2]
 976  -
 977  -      if attention_mask is None:
 978  -          attention_mask = torch.ones(n, t, device=inputs_.device, dtype=inputs_.dtype)
 979  -      if self.mask_first_token:
 980  -          attention_mask[:,0] = 0
 981  -
 982  -      b = self.block_size * 2
 983  -      pad = t % self.block_size
 984  -
 985  -      # Check if t is multiple of block_size and pad
 986  -      if self.adaptive and t > b and pad > 0:
 987  -          pad_length = self.block_size - pad
 988  -          if input_ids is not None:
 989  -              input_ids = torch.nn.functional.pad(input_ids, (0, pad_length), value=self.pad_idx)
 990  -          else:
 991  -              inputs_embeds = torch.nn.functional.pad(inputs_embeds.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
 992  -
 993  -          attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=0)
 994  -
 995  -          if token_type_ids is not None:
 996  -              token_type_ids = torch.nn.functional.pad(token_type_ids, (0, pad_length), value=0)
 997  -          if position_ids is not None:
 998  -              position_ids = torch.nn.functional.pad(position_ids, (0, pad_length), value=0)
 999  -
1000  -      n, t_ = attention_mask.size()
1001  -
1002  -      encoder_outputs = super().forward(
1003  -          input_ids=input_ids,
1004  -          attention_mask=attention_mask,
1005  -          token_type_ids=token_type_ids,
1006  -          position_ids=position_ids,
1007  -          head_mask=head_mask,
1008  -          inputs_embeds=inputs_embeds,
1009  -          encoder_hidden_states=encoder_hidden_states,
1010  -          encoder_attention_mask=encoder_attention_mask,
1011  -          past_key_values=past_key_values,
1012  -          use_cache=use_cache,
1013  -          output_attentions=output_attentions,
1014  -          output_hidden_states=output_hidden_states,
1015  -          return_dict=return_dict
1016  -      )
1017  -
1018  -      sequence_output = encoder_outputs[0]
1019  -      if self.pool_with_global:
1020  -          sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
1021  -
1022  -      diff = t - t_
1023  -      n, _, d = sequence_output.size()
1024  -      sequence_output = sequence_output[..., self.num_global_tokens:, :]
1025  -
1026  -      # Adapt sequence to initial shape
1027  -      if diff < 0:
1028  -          sequence_output = sequence_output[:, :t]
1029  -
1030  -      pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1031  -
1032  -      if not return_dict:
1033  -          return (sequence_output, pooled_output) + encoder_outputs[1:]
1034  -
1035  -      encoder_outputs.last_hidden_state = sequence_output
1036  -      encoder_outputs.pooler_output = pooled_output
1037  -      return encoder_outputs
1038  -
1039     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1040
1041         # Do not rely on original triangular mask from BERT/RoBERTa for causalLM
@@ -1054,7 +1023,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
1054     return extended_attention_mask
1055
1056
1057  -  class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1058
1059     _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1060     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]

@@ -1068,7 +1037,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1068     logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1069
1070     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1071  -  self.lm_head =
1072
1073     # The LM head weights require special treatment only when they are tied with the word embeddings
1074     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

@@ -1077,7 +1046,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
1077     self.post_init()
1078
1079
1080  -  class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1081     """
1082     This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1083     documentation alongside usage examples.

@@ -1098,7 +1067,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1098     )
1099
1100     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1101  -  self.lm_head =
1102
1103     # The LM head weights require special treatment only when they are tied with the word embeddings
1104     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

@@ -1107,7 +1076,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
1107     self.post_init()
1108
1109
1110  -  class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel,
1111     """
1112     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1113     appropriate documentation alongside usage examples.

@@ -1123,13 +1092,13 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Roberta
1123     self.config = config
1124
1125     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1126  -  self.classifier =
1127
1128     # Initialize weights and apply final processing
1129     self.post_init()
1130
1131
1132  -  class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel,
1133     """
1134     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1135     appropriate documentation alongside usage examples.

@@ -1149,7 +1118,7 @@ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMulti
1149     self.post_init()
1150
1151
1152  -  class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel,
1153     """
1154     This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1155     appropriate documentation alongside usage examples.

@@ -1175,7 +1144,7 @@ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, RobertaFor
1175     self.post_init()
1176
1177
1178  -  class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel,
1179     """
1180     This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1181     appropriate documentation alongside usage examples.
New side of the diff (added lines are marked "+"):

   1     from logging import warn
   2  +  from transformers.models.camembert.modeling_camembert import *
   3     import torch
   4     import torch.nn as nn
   5     from transformers.models.camembert.configuration_camembert import CamembertConfig

 156     # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
 157     attention_scores = attention_scores + attention_mask
 158     del attention_mask
 159  +
 160     # Normalize the attention scores to probabilities.
 161     attention_probs = nn.Softmax(dim=-1)(attention_scores)
 162
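Lines 156-161 implement additive masking: the mask is added to the raw scores before the softmax, so masked keys end up with a probability close to zero. A minimal, self-contained sketch of the same pattern; the shapes, the `keep` convention, and the broadcasting are illustrative assumptions, not code from this file:

```python
import torch
import torch.nn as nn

# (batch, heads, queries, keys) attention scores plus an additive mask that is
# 0 for visible keys and finfo.min for padded ones.
scores = torch.randn(2, 12, 8, 8)
keep = torch.tensor([1, 1, 1, 1, 1, 1, 0, 0])                # 1 = real token, 0 = padding (assumed convention)
attention_mask = (1.0 - keep.float()) * torch.finfo(scores.dtype).min
attention_mask = attention_mask.view(1, 1, 1, -1)            # broadcast over batch, heads, queries

scores = scores + attention_mask                              # as in lines 156-157
probs = nn.Softmax(dim=-1)(scores)                            # padded keys get ~0 probability
```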
 198         diagonal=-1
 199     )
 200     causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
 201  +  attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
 202
 203     del attention_mask
 204
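The causal mask is already scaled by `torch.finfo(attention_scores.dtype).min` (line 200); the fix writes it into the bottom-right `causal_shape` block starting one column later and drops the first column of the mask (`causal_mask[:, 1:]`), so the first key column of that block is no longer overwritten. A hedged sketch of the underlying pattern, building a causal additive bias and adding it to a full score matrix (the LSG block indexing is deliberately omitted):

```python
import torch

def causal_bias(t, dtype):
    # Strictly-upper-triangular positions are "future" keys: give them the most
    # negative representable value so the softmax assigns them ~0 probability.
    future = torch.triu(torch.ones(t, t, dtype=torch.bool), diagonal=1)
    return future.to(dtype) * torch.finfo(dtype).min

scores = torch.randn(2, 12, 6, 6)
scores = scores + causal_bias(6, scores.dtype)
probs = torch.softmax(scores, dim=-1)  # row i attends only to keys 0..i
```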
 296     ).transpose(-1, -2)
 297     del sparse_mask
 298     del global_mask
 299  +
 300     # expect (..., t, d) shape
 301     # Compute attention
 302     context_layer = self.attention(

 391     return x.reshape(*x.size()[:-2], n_blocks, -1, d)
 392
 393
 394  +  class LSGCamembertEmbeddings(CamembertEmbeddings):
 395
 396     def __init__(self, config):
 397         super().__init__(config)

 447     return embeddings
 448
 449
 450  +  class LSGAttention(CamembertAttention):
 451
 452     def __init__(self, config):
 453

 546     keys = keys.sum(dim=-2) / (mask + 1e-6)
 547     values = values.sum(dim=-2) / (mask + 1e-6)
 548
 549  +  mask = (1. - mask.clamp(0, 1))
 550  +  mask *= torch.finfo(mask.dtype).min
 551     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.expand(-1, h, -1, -1).transpose(-1, -2)
 552
 553     def get_sparse_tokens_with_stride(self, keys, values, mask):
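Lines 546-551 average keys and values inside each block and then convert the block count mask into an additive mask: 0 where the block contains at least one real token, `torch.finfo(...).min` where it is entirely padding (the same two added lines appear again at 615-616). A hedged sketch of that pattern; the tensor names, shapes, and padding convention are illustrative assumptions:

```python
import torch

# Average keys/values inside each block, then build the additive block mask.
n, h, blocks, block_size, d = 2, 12, 4, 8, 64
keys = torch.randn(n, h, blocks, block_size, d)
values = torch.randn(n, h, blocks, block_size, d)
pad = torch.ones(n, h, blocks, block_size, 1)    # 1 = real token, 0 = padding (assumed)
pad[..., -1, -3:, :] = 0                          # mark the last 3 slots of the last block as padding

count = pad.sum(dim=-2)                           # tokens per block, shape (n, h, blocks, 1)
keys = (keys * pad).sum(dim=-2) / (count + 1e-6)  # mean over real tokens only
values = (values * pad).sum(dim=-2) / (count + 1e-6)

add_mask = (1. - count.clamp(0, 1))               # 0 if the block has any token, 1 if fully padded
add_mask *= torch.finfo(add_mask.dtype).min       # additive form, as in lines 549-550 / 615-616
```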
 612     keys /= mask + 1e-8
 613     values /= mask + 1e-8
 614
 615  +  mask = (1. - mask.clamp(0, 1))
 616  +  mask *= torch.finfo(mask.dtype).min
 617
 618     return keys.reshape(n, h, -1, d), values.reshape(n, h, -1, d), mask.transpose(-1, -2).reshape(n, h, 1, -1)
 619
 881     return x.reshape(n, h, -1, chunk_size, d)
 882
 883
 884  +  class LSGCamembertLayer(CamembertLayer):
 885
 886     def __init__(self, config):
 887

 893         self.crossattention = LSGAttention(config)
 894
 895
 896  +  class LSGCamembertEncoder(CamembertEncoder):
 897
 898     def __init__(self, config):
 899
 901
 902         self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
 903
 904  +      assert hasattr(config, "num_global_tokens")
 905  +      self.num_global_tokens = config.num_global_tokens
 906  +      self.pad_idx = config.pad_token_id
 907  +
 908  +      assert hasattr(config, "block_size") and hasattr(config, "adaptive")
 909  +      self.block_size = config.block_size
 910  +      self.adaptive = config.adaptive
 911  +      self.mask_first_token = config.mask_first_token
 912  +      self.pool_with_global = config.pool_with_global
 913  +
 914  +  def forward(
 915  +      self,
 916  +      hidden_states: torch.Tensor,
 917  +      attention_mask: Optional[torch.FloatTensor] = None,
 918  +      head_mask: Optional[torch.FloatTensor] = None,
 919  +      encoder_hidden_states: Optional[torch.FloatTensor] = None,
 920  +      encoder_attention_mask: Optional[torch.FloatTensor] = None,
 921  +      past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
 922  +      use_cache: Optional[bool] = None,
 923  +      output_attentions: Optional[bool] = False,
 924  +      output_hidden_states: Optional[bool] = False,
 925  +      return_dict: Optional[bool] = True,
 926  +  ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
 927  +
 928  +      mask_value = torch.finfo(attention_mask.dtype).min
 929  +      n, _, __, t = attention_mask.size()
 930  +
 931  +      if not (self.config.is_decoder and encoder_hidden_states is not None):
 932  +          b = self.block_size * 2
 933  +          pad = t % self.block_size
 934  +
 935  +          # Check if t is multiple of block_size and pad
 936  +          if self.adaptive and t > b and pad > 0:
 937  +              pad_length = self.block_size - pad
 938  +              hidden_states = torch.nn.functional.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
 939  +              attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_length), value=mask_value)
 940  +
 941  +          if self.mask_first_token:
 942  +              attention_mask[..., 0] = mask_value
 943  +
 944  +      encoder_outputs = super().forward(
 945  +          hidden_states=hidden_states,
 946  +          attention_mask=attention_mask,
 947  +          head_mask=head_mask,
 948  +          encoder_hidden_states=encoder_hidden_states,
 949  +          encoder_attention_mask=encoder_attention_mask,
 950  +          past_key_values=past_key_values,
 951  +          use_cache=use_cache,
 952  +          output_attentions=output_attentions,
 953  +          output_hidden_states=output_hidden_states,
 954  +          return_dict=return_dict
 955  +      )
 956  +
 957  +      sequence_output = encoder_outputs[0]
 958  +      if self.pool_with_global:
 959  +          sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 960  +
 961  +      # Adapt sequence to initial shape
 962  +      sequence_output = sequence_output[..., self.num_global_tokens: t + self.num_global_tokens, :]
 963
 964  +      if not return_dict:
 965  +          return (sequence_output, ) + encoder_outputs[1:]
 966  +
 967  +      encoder_outputs.last_hidden_state = sequence_output
 968  +      return encoder_outputs
 969  +
 970  +  class LSGCamembertPreTrainedModel(CamembertPreTrainedModel):
 971     """
 972     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 973     models.
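Lines 928-942 move the length adaptation into the encoder: the hidden states are padded up to a multiple of `block_size`, and the padded key positions are switched off by extending the 4D additive mask with the dtype's minimum. A simplified, hedged sketch of just that padding step; it skips the `adaptive` / `t > 2 * block_size` guard and uses made-up shapes:

```python
import torch
import torch.nn.functional as F

block_size = 128
hidden_states = torch.randn(2, 300, 768)        # (batch, t, d), t not a multiple of block_size
attention_mask = torch.zeros(2, 1, 1, 300)      # extended additive mask: 0 = visible

mask_value = torch.finfo(attention_mask.dtype).min
t = hidden_states.size(1)
pad_length = (block_size - t % block_size) % block_size
if pad_length > 0:
    # Pad the sequence dimension with zeros and the mask with the minimum value,
    # so the extra positions never receive attention.
    hidden_states = F.pad(hidden_states.transpose(-1, -2), (0, pad_length), value=0.).transpose(-1, -2)
    attention_mask = F.pad(attention_mask, (0, pad_length), value=mask_value)

# hidden_states: (2, 384, 768); attention_mask: (2, 1, 1, 384)
```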
 976     config_class = LSGCamembertConfig
 977
 978     def _set_gradient_checkpointing(self, module, value=False):
 979  +      if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
 980             module.gradient_checkpointing = value
 981
 982
 983  +  class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
 984     """
 985     This class overrides :class:`~transformers.CamembertModel`. Please check the superclass for the appropriate
 986     documentation alongside usage examples.
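`_set_gradient_checkpointing` now matches both `CamembertEncoder` and `LSGCamembertEncoder`, so the standard Transformers toggle reaches the LSG encoder. A hedged usage sketch; the checkpoint path is a placeholder and it assumes the model ships this modeling file as remote code:

```python
from transformers import AutoModel

# "path/to/lsg-camembert-checkpoint" is a placeholder, not a real repository name.
model = AutoModel.from_pretrained("path/to/lsg-camembert-checkpoint", trust_remote_code=True)
model.gradient_checkpointing_enable()  # routed through _set_gradient_checkpointing(value=True)
```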
 993
 994     LSGCamembertPreTrainedModel.__init__(self, config)
 995
 996     self.embeddings = LSGCamembertEmbeddings(config)
 997     self.encoder = LSGCamembertEncoder(config)
 998  +  self.pooler = CamembertPooler(config) if add_pooling_layer else None
 999
1000     if config.add_cross_attention:
1001         logger.warning(

1005     # Initialize weights and apply final processing
1006     self.post_init()
1007
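The model and encoder only read `num_global_tokens`, `pad_token_id`, `block_size`, `adaptive`, `mask_first_token` and `pool_with_global` off the config; the constructor of `LSGCamembertConfig` (referenced as `config_class` above) is not part of this diff, so the keyword arguments below are an assumption about its signature:

```python
# Hedged sketch: assumes LSGCamembertConfig accepts these attributes as keyword
# arguments; the code shown in the diff only requires them to exist on the config.
config = LSGCamembertConfig(
    block_size=128,
    adaptive=True,
    num_global_tokens=1,
    mask_first_token=False,
    pool_with_global=True,
)
model = LSGCamembertModel(config)
```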
1008     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
1009
1010         # Do not rely on original triangular mask from BERT/RoBERTa for causalLM

1023     return extended_attention_mask
1024
1025
1026  +  class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
1027
1028     _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1029     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
1037     logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
1038
1039     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1040  +  self.lm_head = CamembertLMHead(config)
1041
1042     # The LM head weights require special treatment only when they are tied with the word embeddings
1043     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

1046     self.post_init()
1047
1048
1049  +  class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, CamembertForMaskedLM):
1050     """
1051     This class overrides :class:`~transformers.CamembertForMaskedLM`. Please check the superclass for the appropriate
1052     documentation alongside usage examples.

1067     )
1068
1069     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1070  +  self.lm_head = CamembertLMHead(config)
1071
1072     # The LM head weights require special treatment only when they are tied with the word embeddings
1073     self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])

1076     self.post_init()
1077
1078
1079  +  class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, CamembertForSequenceClassification):
1080     """
1081     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
1082     appropriate documentation alongside usage examples.
1092     self.config = config
1093
1094     self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
1095  +  self.classifier = CamembertClassificationHead(config)
1096
1097     # Initialize weights and apply final processing
1098     self.post_init()
1099
1100
1101  +  class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, CamembertForMultipleChoice):
1102     """
1103     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
1104     appropriate documentation alongside usage examples.

1118     self.post_init()
1119
1120
1121  +  class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, CamembertForTokenClassification):
1122     """
1123     This class overrides :class:`~transformers.CamembertForTokenClassification`. Please check the superclass for the
1124     appropriate documentation alongside usage examples.

1144     self.post_init()
1145
1146
1147  +  class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, CamembertForQuestionAnswering):
1148     """
1149     This class overrides :class:`~transformers.CamembertForQuestionAnswering`. Please check the superclass for the
1150     appropriate documentation alongside usage examples.