ccdv committed
Commit 073319f
1 Parent(s): 9808534
Files changed (1)
  1. modeling_lsg_camembert.py +27 -58
modeling_lsg_camembert.py CHANGED
@@ -55,7 +55,8 @@ class LSGCamembertConfig(CamembertConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                    setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGCamembertConfig(CamembertConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -79,6 +80,16 @@ class LSGCamembertConfig(CamembertConfig):
             assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
             assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
 
 class BaseSelfAttention(nn.Module):
 
@@ -436,39 +447,13 @@ class LSGCamembertEmbeddings(RobertaEmbeddings):
         return embeddings
 
 
-class LSGCamembertSelfOutput(RobertaSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(RobertaAttention):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
         self.self = LSGSelfAttention(config)
-        self.output = LSGCamembertSelfOutput(config)
-        self.pruned_heads = set()
-
-
-class LSGCamembertIntermediate(RobertaIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertOutput(RobertaOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGCamembertPooler(RobertaPooler):
-
-    def __init__(self, config):
-        super().__init__(config)
 
 
 class LSGSelfAttention(BaseSelfAttention):
@@ -898,29 +883,21 @@ class LSGCamembertLayer(RobertaLayer):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGCamembertIntermediate(config)
-        self.output = LSGCamembertOutput(config)
 
 
 class LSGCamembertEncoder(RobertaEncoder):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.config = config
         self.layer = nn.ModuleList([LSGCamembertLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
 
 
 class LSGCamembertPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +922,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
     config_class = LSGCamembertConfig
 
 
-    def __init__(self, config, add_pooling_layer=False):
+    def __init__(self, config, add_pooling_layer=True):
 
         LSGCamembertPreTrainedModel.__init__(self, config)
 
@@ -961,7 +938,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
 
         self.embeddings = LSGCamembertEmbeddings(config)
         self.encoder = LSGCamembertEncoder(config)
-        self.pooler = LSGCamembertPooler(config) if add_pooling_layer else None
+        self.pooler = RobertaPooler(config) if add_pooling_layer else None
 
         if config.add_cross_attention:
             logger.warning(
@@ -988,6 +965,12 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, RobertaModel):
         return_dict=None
         ):
 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]
 
@@ -1085,7 +1068,7 @@ class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, RobertaForCausalLM):
             logger.warning("If you want to use `LSGCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head = LSGCamembertLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1115,7 +1098,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
             )
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.lm_head = LSGCamembertLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1124,13 +1107,6 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, RobertaForMaskedLM):
         self.post_init()
 
 
-class LSGCamembertLMHead(RobertaLMHead):
-    """LSG Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, RobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.CamembertForSequenceClassification`. Please check the superclass for the
@@ -1147,19 +1123,12 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, Roberta
         self.config = config
 
         self.roberta = LSGCamembertModel(config, add_pooling_layer=False)
-        self.classifier = LSGCamembertClassificationHead(config)
+        self.classifier = RobertaClassificationHead(config)
 
         # Initialize weights and apply final processing
         self.post_init()
 
 
-class LSGCamembertClassificationHead(RobertaClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, RobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.CamembertForMultipleChoice`. Please check the superclass for the