num_audio_tokens: 626
num_text_tokens: 21178
gpt_config:
  hidden_size: 768
  intermediate_size: 3072
  num_attention_heads: 12
  num_hidden_layers: 20
  use_cache: False
  max_position_embeddings: 4096
  # attn_implementation: flash_attention_2
  spk_emb_dim: 192
  spk_KL: False
  num_audio_tokens: 626
  num_text_tokens: null
  num_vq: 4
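The keys inside `gpt_config` line up with the keyword arguments of a Hugging Face decoder config, so the block can be fed almost directly into one. Below is a minimal sketch of how such a file could be consumed, assuming PyYAML and `transformers` are available; the file name `gpt.yaml` and the mapping onto `LlamaConfig` are illustrative assumptions, not something the config itself specifies. Note that `num_text_tokens` is `null` inside `gpt_config`, so a consumer would fall back to the top-level value (21178), and that uncommenting `attn_implementation: flash_attention_2` requires the `flash-attn` package to be installed.

```python
# Sketch: load the YAML above and build a decoder config from gpt_config.
# Assumptions: PyYAML + transformers installed; "gpt.yaml" is a hypothetical
# file name; LlamaConfig is an illustrative target, chosen because its
# parameters match the keys in gpt_config.
import yaml
from transformers import LlamaConfig

with open("gpt.yaml") as f:
    cfg = yaml.safe_load(f)

gpt_cfg = cfg["gpt_config"]

# Transformer-shape keys map one-to-one onto LlamaConfig kwargs.
llama_config = LlamaConfig(
    hidden_size=gpt_cfg["hidden_size"],                          # 768
    intermediate_size=gpt_cfg["intermediate_size"],              # 3072
    num_attention_heads=gpt_cfg["num_attention_heads"],          # 12
    num_hidden_layers=gpt_cfg["num_hidden_layers"],              # 20
    use_cache=gpt_cfg["use_cache"],                              # False
    max_position_embeddings=gpt_cfg["max_position_embeddings"],  # 4096
)

# num_text_tokens is null in gpt_config; fall back to the top-level value.
num_text_tokens = gpt_cfg["num_text_tokens"] or cfg["num_text_tokens"]  # 21178
num_audio_tokens = gpt_cfg["num_audio_tokens"]  # 626: audio-token vocab size
num_vq = gpt_cfg["num_vq"]                      # 4: parallel VQ codebooks
```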