import torch
torch.manual_seed(160923)  # fix torch's global RNG seed at import time so voice sampling is reproducible across runs


import gradio as gr
import torch.cuda
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm
from Utility.utils import load_json_from_path


class TTSWebUI:
    """Gradio web frontend for controllable multilingual text-to-speech.

    Wires a ``ControllableInterface`` (the actual TTS backend) into a
    ``gr.Interface`` and launches the app immediately — instantiating this
    class is therefore a blocking call with side effects (a model-file
    download via ``hf_hub_download`` and a running web server).
    """

    def __init__(self,
                 gpu_id="cpu",
                 title="Controllable Text-to-Speech for over 7000 Languages",
                 article="The biggest thank you to Hugging Face🤗 for sponsoring the GPU for this space! <br> To get the code, models, additional features, and more information, check out our toolkit: https://github.com/DigitalPhonetics/IMS-Toucan <br>",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=10  # be careful with this, if you want too many, it might lead to an endless loop
                 ):
        """Build the Gradio interface and launch it.

        Args:
            gpu_id: device handed to the backend ("cpu" or a CUDA device id).
            title: headline shown at the top of the web UI.
            article: HTML blurb rendered with the interface description.
            tts_model_path: optional checkpoint path override for the TTS model.
            vocoder_model_path: optional checkpoint path override for the vocoder.
            embedding_gan_path: optional checkpoint path override for the embedding GAN.
            available_artificial_voices: upper bound of the voice-seed slider.
        """
        # Map ISO codes to human-readable names so the dropdown can show
        # entries like "English (eng)"; read() later parses the code back out.
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]

        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices,
                                                     tts_model_path=tts_model_path,
                                                     vocoder_model_path=vocoder_model_path,
                                                     embedding_gan_path=embedding_gan_path)
        self.iface = gr.Interface(fn=self.read,
                                  inputs=[gr.Textbox(lines=2,
                                                     placeholder="write what you want the synthesis to read here...",
                                                     value="What I cannot create, I do not understand.",
                                                     label="Text input"),
                                          gr.Dropdown(text_selection,
                                                      type="value",
                                                      value='English (eng)',
                                                      label="Select the Language of the Text (type on your keyboard to find it quickly)"),
                                          gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                                          gr.Slider(minimum=0.7, maximum=1.3, step=0.1, value=1.0, label="Faster - Slower"),
                                          gr.Slider(minimum=0, maximum=available_artificial_voices, step=1, value=5, label="Random Seed for the artificial Voice"),
                                          gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Gender of artificial Voice"),
                                          gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                          ],
                                  outputs=[gr.Audio(type="numpy", label="Speech"),
                                           gr.Image(label="Visualization")],
                                  title=title,
                                  allow_flagging="never",
                                  description=article,
                                  theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange"))
        self.iface.launch()

    @staticmethod
    def _extract_iso_code(language_label):
        """Return the ISO code from a dropdown label shaped like 'English (eng)'."""
        return language_label.split(" ")[-1].split("(")[1].split(")")[0]

    def read(self,
             prompt,
             language,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             reference_audio,
             ):
        """Gradio callback: synthesize `prompt` and return audio plus a plot.

        Args:
            prompt: text to be read.
            language: dropdown value shaped like "English (eng)".
            prosody_creativity: sampling temperature-like control from the UI slider.
            duration_scaling_factor: speaking-rate control (>1.0 is slower).
            voice_seed: seed selecting an artificial voice.
            emb1: gender control value from the UI slider.
            reference_audio: optional filepath of a voice to clone, or None.

        Returns:
            ((sample_rate, pcm_waveform), figure) for the Audio and Image outputs.
        """
        # The backend takes the bare ISO code twice (language and accent);
        # parse it out of the label once instead of repeating the expression.
        iso_code = self._extract_iso_code(language)
        # The trailing positional constants are fixed neutral control values
        # (scales of 1.0, offsets of 0.0) plus a -24. constant.
        # NOTE(review): their parameter names are not visible from this file —
        # confirm against ControllableInterface.read before changing any of them.
        sr, wav, fig = self.controllable_ui.read(prompt,
                                                 reference_audio,
                                                 iso_code,
                                                 iso_code,
                                                 voice_seed,
                                                 prosody_creativity,
                                                 duration_scaling_factor,
                                                 1.,
                                                 1.0,
                                                 1.0,
                                                 emb1,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 0.,
                                                 -24.)
        # float2pcm converts the float waveform to PCM for Gradio's numpy audio output.
        return (sr, float2pcm(wav)), fig


if __name__ == '__main__':
    # Prefer GPU inference when a CUDA device is present; otherwise fall back to CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    TTSWebUI(gpu_id=device)