{ "d_model": 1024, "decoder_attention_heads": 16, "decoder_ffn_dim": 2730, "decoder_layers": 12, "decoder_start_token_id": 16384, "encoder_attention_heads": 16, "encoder_ffn_dim": 2730, "encoder_layers": 12, "encoder_vocab_size": 50264, "image_length": 256, "image_vocab_size": 16384, "max_text_length": 64 }