# thaicapgen-clip-gpt2 / configuration_clipcap.py
# (Hugging Face Hub metadata — author: Natthaphon, commit message: "Added model", commit: e137a83)
from transformers import PretrainedConfig, AutoConfig
# Default decoder configuration: a standard 12-layer GPT-2 LM head model with
# cross-attention enabled so it can attend to the CLIP encoder's outputs.
# Kept at module level so it is built once and never used as a (mutable)
# default argument.
_DEFAULT_GPT2_DECODER_CONFIG = {
    '_name_or_path': '',
    'activation_function': 'gelu_new',
    'add_cross_attention': True,
    'architectures': ['GPT2LMHeadModel'],
    'attn_pdrop': 0.1,
    'bad_words_ids': None,
    'begin_suppress_tokens': None,
    'bos_token_id': 50256,
    'chunk_size_feed_forward': 0,
    'cross_attention_hidden_size': None,
    'decoder_start_token_id': None,
    'diversity_penalty': 0.0,
    'do_sample': False,
    'early_stopping': False,
    'embd_pdrop': 0.1,
    'encoder_no_repeat_ngram_size': 0,
    'eos_token_id': 50256,
    'exponential_decay_length_penalty': None,
    'finetuning_task': None,
    'forced_bos_token_id': None,
    'forced_eos_token_id': None,
    'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
    'initializer_range': 0.02,
    'is_decoder': True,
    'is_encoder_decoder': False,
    'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
    'layer_norm_epsilon': 1e-05,
    'length_penalty': 1.0,
    'max_length': 20,
    'min_length': 0,
    'model_type': 'gpt2',
    'n_ctx': 1024,
    'n_embd': 768,
    'n_head': 12,
    'n_inner': None,
    'n_layer': 12,
    'n_positions': 1024,
    'no_repeat_ngram_size': 0,
    'num_beam_groups': 1,
    'num_beams': 1,
    'num_return_sequences': 1,
    'output_attentions': False,
    'output_hidden_states': False,
    'output_scores': False,
    'pad_token_id': None,
    'prefix': None,
    'problem_type': None,
    'pruned_heads': {},
    'remove_invalid_values': False,
    'reorder_and_upcast_attn': False,
    'repetition_penalty': 1.0,
    'resid_pdrop': 0.1,
    'return_dict': True,
    'return_dict_in_generate': False,
    'scale_attn_by_inverse_layer_idx': False,
    'scale_attn_weights': True,
    'sep_token_id': None,
    'summary_activation': None,
    'summary_first_dropout': 0.1,
    'summary_proj_to_labels': True,
    'summary_type': 'cls_index',
    'summary_use_proj': True,
    'suppress_tokens': None,
    'task_specific_params': {'text-generation': {'do_sample': True,
                                                 'max_length': 50}},
    'temperature': 1.0,
    'tf_legacy_loss': False,
    'tie_encoder_decoder': False,
    'tie_word_embeddings': True,
    'tokenizer_class': None,
    'top_k': 50,
    'top_p': 1.0,
    'torch_dtype': None,
    'torchscript': False,
    'typical_p': 1.0,
    'use_bfloat16': False,
    'use_cache': True,
    'vocab_size': 50257,
}


class CLIPEncoderDecoderConfig(PretrainedConfig):
    """Configuration for a CLIP-encoder / GPT-2-decoder captioning model.

    Stores the decoder configuration (instantiated via ``AutoConfig.for_model``)
    and marks the composite model as encoder-decoder so that
    ``transformers`` generation utilities treat it accordingly.
    """

    model_type = "clip-encoder-decoder"

    def __init__(self, decoder=None, **kwargs):
        """
        Args:
            decoder (`dict`, *optional*):
                Keyword arguments used to build the decoder config via
                `AutoConfig.for_model` (must include a `model_type` key).
                Defaults to a 12-layer GPT-2 config with cross-attention
                enabled. Previously this default was a mutable default
                argument; using `None` here is backward-compatible.
            kwargs:
                Forwarded to `PretrainedConfig.__init__`.
        """
        super().__init__(**kwargs)
        if decoder is None:
            # Copy so callers can never observe mutations of the shared default.
            decoder = dict(_DEFAULT_GPT2_DECODER_CONFIG)
        self.decoder = AutoConfig.for_model(**decoder)
        self.is_encoder_decoder = True

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`CLIPEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.
        Returns:
            [`CLIPEncoderDecoderConfig`]: An instance of a configuration object
        """
        # The decoder must cross-attend to the encoder's hidden states,
        # so force decoder mode and cross-attention on before composing.
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True
        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)