{
  "_name_or_path": "suno/bark",
  "architectures": [
    "BarkModel"
  ],
  "coarse_acoustics_config": {
    "architectures": [
      "BarkCoarseModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 12096,
    "model_type": "coarse_acoustics",
    "num_heads": 16,
    "num_layers": 24,
    "output_vocab_size": 12096,
    "torch_dtype": "float32"
  },
  "codec_config": {
    "_name_or_path": "facebook/encodec_24khz",
    "architectures": [
      "EncodecModel"
    ],
    "model_type": "encodec",
    "torch_dtype": "float32"
  },
  "fine_acoustics_config": {
    "architectures": [
      "BarkFineModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 1056,
    "model_type": "fine_acoustics",
    "num_heads": 16,
    "num_layers": 24,
    "output_vocab_size": 1056,
    "torch_dtype": "float32"
  },
  "initializer_range": 0.02,
  "model_type": "bark",
  "semantic_config": {
    "architectures": [
      "BarkSemanticModel"
    ],
    "bias": false,
    "hidden_size": 1024,
    "input_vocab_size": 129600,
    "model_type": "semantic",
    "num_heads": 16,
    "num_layers": 24,
    "torch_dtype": "float32"
  },
  "torch_dtype": "float32",
  "transformers_version": "4.38.2"
}