small fix
modeling_lsg_camembert.py (+27 -58) CHANGED
@@ -55,7 +55,8 @@ class LSGCamembertConfig(CamembertConfig):

         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'],
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None

         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGCamembertConfig(CamembertConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512

@@ -79,6 +80,16 @@ class LSGCamembertConfig(CamembertConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"

+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+

 class BaseSelfAttention(nn.Module):

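Both new guards downgrade an invalid combination to a warning and silently correct the config instead of raising. A minimal sketch of the expected behaviour, assuming the names used in the checks above (sparsity_type, mask_first_token, pool_with_global) are also constructor keyword arguments, which this diff does not show:

# Hedged sketch, not part of the diff: constructor kwargs are assumed from the checks above.
config = LSGCamembertConfig(
    sparsity_type="fancy",     # unsupported value -> warning, reset to None
    mask_first_token=True,
    pool_with_global=False,    # incompatible with mask_first_token -> forced back to True
)
assert config.sparsity_type is None
assert config.pool_with_global is True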
@@ -436,39 +447,13 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
         return embeddings


-class LSGCamembertSelfOutput(RobertaSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(RobertaAttention):

     def __init__(self, config):

-
+        super().__init__(config)

         self.self = LSGSelfAttention(config)
-        self.output = LSGCamembertSelfOutput(config)
-        self.pruned_heads = set()
-
-
-class LSGCamembertIntermediate(RobertaIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertOutput(RobertaOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertPooler(RobertaPooler):
-
-    def __init__(self, config):
-        super().__init__(config)


 class LSGSelfAttention(BaseSelfAttention):
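The deleted wrappers (LSGCamembertSelfOutput, LSGCamembertIntermediate, LSGCamembertOutput, LSGCamembertPooler) only forwarded to super().__init__, so LSGAttention can now call super().__init__(config), let RobertaAttention build the stock submodules, and replace only the self-attention part. A hypothetical sketch of that pattern, with invented names rather than the library classes:

# Hypothetical illustration of the refactor pattern; names are made up.
import torch.nn as nn

class ParentAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = nn.Linear(config.hidden_size, config.hidden_size)    # stock submodules
        self.output = nn.Linear(config.hidden_size, config.hidden_size)
        self.pruned_heads = set()

class ChildAttention(ParentAttention):
    def __init__(self, config):
        super().__init__(config)    # parent wires everything first
        self.self = nn.Linear(config.hidden_size, config.hidden_size)    # then swap in the custom piece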
@@ -898,29 +883,21 @@ class LSGCamembertLayer(RobertaLayer):

     def __init__(self, config):

-
+        super().__init__(config)

-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGCamembertIntermediate(config)
-        self.output = LSGCamembertOutput(config)


 class LSGCamembertEncoder(RobertaEncoder):

     def __init__(self, config):

-
+        super().__init__(config)

-        self.config = config
         self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False


 class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +922,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
     config_class = LSGCamembertConfig


-    def __init__(self, config, add_pooling_layer=
+    def __init__(self, config, add_pooling_layer=True):

         LSGCamembertPreTrainedModel.__init__(self, config)

@@ -961,7 +938,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):

         self.embeddings = LSGCamembertEmbeddings(config)
         self.encoder = LSGCamembertEncoder(config)
-        self.pooler =
+        self.pooler = RobertaPooler(config) if add_pooling_layer else None

         if config.add_cross_attention:
             logger.warning(
@@ -988,6 +965,12 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
         return_dict=None
     ):

+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]

@@ -1085,7 +1068,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
             logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")

         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head =
+        self.lm_head = RobertaLMHead(config)

         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1115,7 +1098,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
             )

         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head =
+        self.lm_head = RobertaLMHead(config)

         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1124,13 +1107,6 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
         self.post_init()


-class LSGCamembertLMHead(RobertaLMHead):
-    """LSG Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, RobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
@@ -1147,19 +1123,12 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, RobertaForSequenceClassification):
         self.config = config

         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.classifier =
+        self.classifier = RobertaClassificationHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGCamembertClassificationHead(RobertaClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the
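For context, a hedged usage sketch of loading a checkpoint that ships this modeling file via the Hub's remote-code mechanism; the repository id below is illustrative and not taken from this commit:

from transformers import AutoModelForMaskedLM, AutoTokenizer

repo_id = "ccdv/lsg-camembert-base"  # assumed repo id, replace with the repository hosting this file
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Paris est la capitale de la <mask>.", return_tensors="pt")
outputs = model(**inputs)  # return_dict now falls back to config.use_return_dict
print(outputs.logits.shape)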