import IPython
from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import wave
import espeakng 
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os
import numpy as np
from gradio_client import Client, handle_file


def synth_mms(text:str, model:str):
    '''
    Use the Huggingface inference pipeline to synthesize text.
    (Can be replaced by the Inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Waveform as a numpy array and its sampling rate.
    '''
    #inference = InferenceApi(repo_id=f"facebook/{model}", 
    #                         token=API_TOKEN)
    #mms_tts = inference(inputs=text, 
    #                    raw_response=True)._content

    if model is not None:
        # device=-1 runs on CPU; set a GPU index (e.g. 0) to use CUDA.
        # Pass token=os.environ['TOKEN'] if the model requires authentication.
        pipe = pipeline("text-to-speech", model=model, device=-1)
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None
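

# Illustrative usage sketch (not part of the original module): "facebook/mms-tts-eng" is an
# assumed example repo id following the mms-tts-LAN naming convention described above; the
# pipeline call needs the full Huggingface repo id to resolve the checkpoint.
def _example_synth_mms():
    audio, rate = synth_mms("Hello, this is a test.", "facebook/mms-tts-eng")
    # The pipeline may return a (1, n_samples) array, so squeeze it before writing to disk.
    wavfile.write("mms_example.wav", rate, np.squeeze(audio))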



def synth_coqui(text:str, model:str):
    '''
    Use the Coqui TTS library (local inference) to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Waveform as a numpy array and its sampling rate.

    IMPORTANT: The current implementation assumes a 22050 Hz sampling rate; verify this when adding a new model.
    '''
    if model is not None:
        # Get device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        
        # Init TTS
        tts = TTS(model, progress_bar=False).to(device)

        # Infer
        wav = tts.tts(text=text) # is_multi_speaker=False
        
        return np.array(wav), 22050
    else:
        return None
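

# Illustrative usage sketch: "tts_models/en/ljspeech/tacotron2-DDC" is one example of a Coqui
# model code (any model listed by `tts --list_models` should work the same way); its output
# matches the 22050 Hz rate assumed above.
def _example_synth_coqui():
    audio, rate = synth_coqui("Hello, this is a test.", "tts_models/en/ljspeech/tacotron2-DDC")
    wavfile.write("coqui_example.wav", rate, audio.astype(np.float32))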


def synth_espeakng(text:str, model:str):
    '''
    Use ESpeak-NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (eSpeak NG voice name)
    Returns:
        Waveform as a numpy array and its sampling rate.
    '''
    if model is not None:
        
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text]) 
        #esng = espeakng.Speaker()
        #esng.voice = model
        #esng.say(text, export_path="test.wav")

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")
        
        return wav, sampling_rate
    else:
        return None
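

# Illustrative usage sketch: "en" is assumed to be an installed eSpeak NG voice;
# `espeak-ng --voices` lists the voices available on the system.
def _example_synth_espeakng():
    audio, rate = synth_espeakng("Hello, this is a test.", "en")
    wavfile.write("espeakng_example.wav", rate, audio)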
        
def synth_africanvoices(text:str, model:str):
    '''
    Use Festival Lite (flite) with an African Voices .flitevox voice to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Name of the .flitevox voice file (without the extension)
    Returns:
        Waveform as a numpy array and its sampling rate.
    '''
    if model is not None:
        
        subprocess.run(['flite', '-voice', f'{model}.flitevox', '-t', text, '-o', 'test.wav']) 

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")

        return wav, sampling_rate
    else:
        return None
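

# Illustrative usage sketch: assumes an African Voices voice file (e.g. a hypothetical
# "hau.flitevox") has been downloaded to the working directory and that flite is on the PATH.
def _example_synth_africanvoices():
    audio, rate = synth_africanvoices("Hello, this is a test.", "hau")
    wavfile.write("africanvoices_example.wav", rate, audio)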

def synth_toucan(text:str, model:str):
    '''
    Use Toucan to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (language code)
    Returns:
        Waveform as a numpy array and its sampling rate.

    NOTES:
        (1) This wrapper does not expose the full range of options available through the API.
        (2) The API should allow generating female voices, but that does not appear to work at the moment.
        (3) Synthesis runs remotely on a Huggingface Gradio Space via its API.
    '''
    client = Client("Flux9665/MassivelyMultilingualTTS")
    result = client.predict(
        prompt=text,
        language=model,
        reference_audio=handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
        voice_seed=123,
        prosody_creativity=0.1,
        duration_scaling_factor=1,
        emb1=0,
        #emb2=0,
        api_name="/predict"
    )
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate
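

# Illustrative usage sketch: "eng" is an assumed example of the language selector value the
# Space expects; check the Flux9665/MassivelyMultilingualTTS UI for the exact codes.
def _example_synth_toucan():
    audio, rate = synth_toucan("Hello, this is a test.", "eng")
    wavfile.write("toucan_example.wav", rate, audio)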

def synth_piper(text:str, model:str):
    '''
    Use Piper (via the k2-fsa/text-to-speech Huggingface Space) to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Pair of (language, repo_id) identifying the model on the Space
    Returns:
        Waveform as a numpy array and its sampling rate.

    NOTES: (1) Synthesis runs remotely on a Huggingface Gradio Space via its API.
    '''
    if model is not None:
        client = Client("k2-fsa/text-to-speech")
        result = client.predict(
            language=model[0],
            repo_id=model[1],
            text=text,
            sid="0",
            speed=1,
            api_name="/process"
        )
        sampling_rate, wav = wavfile.read(result[0])
        return wav, sampling_rate
    else:
        return None
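

# Illustrative usage sketch: the (language, repo_id) pair is an assumption about how the
# k2-fsa/text-to-speech Space lists its Piper models; check the Space UI for the exact values.
def _example_synth_piper():
    audio, rate = synth_piper("Hello, this is a test.",
                              ("English", "csukuangfj/vits-piper-en_US-amy-low"))
    wavfile.write("piper_example.wav", rate, audio)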