arjunanand13 committed on
Commit
54129b6
1 Parent(s): f6a677e

Create configuration_florence2.py

Files changed (1)
  1. configuration_florence2.py +303 -0
configuration_florence2.py ADDED
@@ -0,0 +1,303 @@
""" Florence-2 configuration"""

import warnings
from typing import Optional

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
            The patch size used by each stage.
        patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
            The patch stride used by each stage.
        patch_padding (`List[int]`, *optional*, defaults to `[3, 1, 1, 1]`):
            The patch padding used by each stage.
        patch_prenorm (`List[bool]`, *optional*, defaults to `[False, True, True, True]`):
            Whether to apply layer normalization before the patch embedding layer of each stage.
        enable_checkpoint (`bool`, *optional*, defaults to `False`):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
            The embedding dimension of each stage.
        num_heads (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of attention heads in each stage.
        num_groups (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of groups in each stage.
        depths (`List[int]`, *optional*, defaults to `[1, 1, 9, 1]`):
            The number of blocks in each stage.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to `["spatial_avg_pool", "temporal_avg_pool"]`):
            The source of the image features.
    Example:
    ```python
    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()
    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=[7, 3, 3, 3],
        patch_stride=[4, 2, 2, 2],
        patch_padding=[3, 1, 1, 1],
        patch_prenorm=[False, True, True, True],
        enable_checkpoint=False,
        dim_embed=[256, 512, 1024, 2048],
        num_heads=[8, 16, 32, 64],
        num_groups=[8, 16, 32, 64],
        depths=[1, 1, 9, 1],
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
        **kwargs,
    ):
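        # Note: the list-valued arguments are per-stage settings of the DaViT-style vision tower,
        # with one entry per stage (four stages by default).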
        self.drop_path_rate = drop_path_rate
        self.patch_size = patch_size
        self.patch_stride = patch_stride
        self.patch_padding = patch_padding
        self.patch_prenorm = patch_prenorm
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = dim_embed
        self.num_heads = num_heads
        self.num_groups = num_groups
        self.depths = depths
        self.window_size = window_size
        self.projection_dim = projection_dim
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = image_feature_source

        super().__init__(**kwargs)


class Florence2LanguageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether to scale the embeddings by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.
    Example:
    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()
    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
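    # attribute_map makes the generic `num_attention_heads` / `hidden_size` attributes resolve to the
    # BART-style `encoder_attention_heads` / `d_model` values defined below.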
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
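        # `num_hidden_layers` is the generic attribute name that parts of the Transformers library expect;
        # mirror the encoder depth under that name.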
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for BART CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )


class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
    Florence-2 model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vision_config (`Florence2VisionConfig`, *optional*):
            Custom vision config or dict.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`].
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.
    Example:
    ```python
    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
    >>> # Initializing a CLIP-like vision config
    >>> vision_config = CLIPVisionConfig()
    >>> # Initializing a BART config
    >>> text_config = BartConfig()
    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config(vision_config, text_config)
    >>> # Initializing a model from the Florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim
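        # vision_config / text_config may arrive as plain dicts (e.g. when read back from a saved
        # config.json); wrap them into config objects so the rest of the code can use attribute access.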
        if vision_config is not None:
            vision_config = PretrainedConfig(**vision_config)
        self.vision_config = vision_config
        self.vocab_size = self.vocab_size

        self.text_config = text_config
        if text_config is not None:
            self.text_config = Florence2LanguageConfig(**text_config)

        super().__init__(**kwargs)
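A minimal usage sketch (not part of the commit) of how these configuration classes are typically consumed once the file ships with a model repository. The repository id below is only illustrative, and `trust_remote_code=True` is needed because the classes live in the repo rather than in `transformers` itself:

```python
from transformers import AutoConfig

# Illustrative repo id; any repository that ships this configuration_florence2.py works the same way.
config = AutoConfig.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

print(config.model_type)               # "florence2"
print(config.vision_config.dim_embed)  # per-stage embedding dims of the vision tower
print(config.text_config.d_model)      # hidden size of the BART-style language model
```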