---
license: mit
datasets:
- amaai-lab/DisfluencySpeech
language:
- en
pipeline_tag: text-to-speech
---

# Usage

```python
|
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
import IPython.display as ipd

# Download the FastSpeech2-style checkpoint, its config, and the fairseq task
# from the Hugging Face Hub. The HiFi-GAN vocoder is selected via overrides.
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "amaai-lab/DisfluencySpeech_BenchmarkB",
    arg_overrides={"vocoder": "hifigan", "fp16": False, "spec-bwd-max-iter": 32},
)
model = models[0]

# Propagate the task's data config (e.g. audio/feature settings) into cfg
# before building the generator.
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator(models, cfg)

text = "Well, that's really funny, isn't it? What a strange world we live in."

# Tokenize the input text and move the input tensors to the GPU.
# NOTE: this example requires a CUDA device; drop the .cuda() calls
# (and model.cuda() below) to run on CPU.
sample = TTSHubInterface.get_model_input(task, text)
sample['net_input']['src_tokens'] = sample['net_input']['src_tokens'].cuda()
sample['net_input']['src_lengths'] = sample['net_input']['src_lengths'].cuda()

# Synthesize: returns the waveform tensor and its sample rate.
wav, rate = TTSHubInterface.get_prediction(task, model.cuda(), generator, sample)

# Play the result inline in a notebook (tensor must be back on CPU).
ipd.Audio(wav.cpu(), rate=rate)
|
```