"""
Copyright 2022 Balacoon
TTS interactive demo
"""
import logging
from typing import cast
import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files
# Global TTS engine shared by the UI callbacks defined in `main`.
# Stays `None` until a model is picked in the "Model" dropdown;
# `set_model` then replaces it with a `TTS` instance built from
# the downloaded model file.
tts = None
def main():
    """
    Builds the Gradio UI of the demo and launches it.

    The page consists of a text box, a model dropdown (filled with
    the `.addon` files found in the `balacoon/tts` hub repo),
    a speaker dropdown, a "Generate" button and an audio player.
    Selecting a model re-initializes the global `tts` object,
    pressing "Generate" synthesizes the typed text with it.
    """
    logging.basicConfig(level=logging.INFO)

    # hub repository that hosts the models
    repo_id = "balacoon/tts"
    # longer inputs are truncated to this number of characters
    max_text_length = 1024
    # sampling rate handed to the audio widget together with samples.
    # NOTE(review): hard-coded; presumably matches the output rate
    # of the models - confirm against balacoon_tts
    sampling_rate = 24000

    with gr.Blocks() as demo:
        gr.Markdown(
            """
Balacoon🦝 Text-to-Speech
1. Write an utterance to generate,
2. Select the model to synthesize with
3. Select speaker (only for multi-speaker models)
4. Hit "Generate" and listen to the result!
When you select model for the first time,
it will take a little time to download it.
You can learn more about models available
[here](https://huggingface.co/balacoon/tts),
visit [Balacoon website](https://balacoon.com/) for more info.
"""
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # every `.addon` file in the repo is offered as a model
                repo_files = list_repo_files(repo_id=repo_id)
                model_files = [x for x in repo_files if x.endswith(".addon")]
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                # choices are filled in once a model is selected
                speaker = gr.Dropdown(label="Speaker", choices=[])

            def set_model(model_name_str: str):
                """
                gets value from `model_name`, downloads the model file,
                re-initializes the global `tts` object with it, gets
                the list of speakers the model reports and sets them
                to `speaker`. The last speaker is pre-selected; when
                the model reports no speakers, the dropdown is hidden
                and its value is reset to an empty string.
                """
                global tts
                model_path = hf_hub_download(
                    repo_id=repo_id, filename=model_name_str
                )
                tts = TTS(model_path)
                speakers = tts.get_speakers()
                has_speakers = bool(speakers)
                return gr.Dropdown.update(
                    choices=speakers,
                    value=speakers[-1] if has_speakers else "",
                    visible=has_speakers,
                )

            model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, speaker_str: str = ""):
            """
            gets utterance to synthesize from `text` Textbox
            and speaker name from `speaker` dropdown list.
            speaker name might be empty for single-speaker models.
            Synthesizes the waveform and updates `audio` with it.
            Returns None (nothing to play) when the text is empty
            or when no model has been selected yet.
            """
            if not text_str:
                logging.info("text is not provided")
                return None
            engine = tts
            if engine is None:
                # "Generate" was pressed before any model was selected,
                # so there is nothing to synthesize with
                logging.info("model is not selected")
                return None
            # an unset dropdown may hand over None instead of a string
            speaker_name = speaker_str or ""
            # slicing is a no-op for texts within the limit
            samples = cast(TTS, engine).synthesize(
                text_str[:max_text_length], speaker_name
            )
            return gr.Audio.update(value=(sampling_rate, samples))

        generate.click(synthesize_audio, inputs=[text, speaker], outputs=audio)

    demo.launch()
# start the demo only when the file is executed as a script
if __name__ == "__main__":
    main()