import os
import subprocess

import numpy as np
import torch
from gradio_client import Client, handle_file
from scipy.io import wavfile
from transformers import pipeline
from TTS.api import TTS

# Kept only for the commented-out alternative implementations below:
#from huggingface_hub.inference_api import InferenceApi
#import espeakng
def synth_mms(text: str, model: str):
    '''
    Use the Hugging Face `transformers` pipeline to synthesize text.
    (Could be replaced by the Inference API, but that requires a stored API token.)
    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Numpy audio array and sampling rate.
    '''
    #inference = InferenceApi(repo_id=f"facebook/{model}",
    #                         token=API_TOKEN)
    #mms_tts = inference(inputs=text,
    #                    raw_response=True)._content
    if model is not None:
        # device=-1 forces CPU; change if it should use GPU (token=os.environ['TOKEN'] if needed)
        pipe = pipeline("text-to-speech", model=model, device=-1)
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None
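
# Example usage (a sketch; the checkpoint id is an assumption -- MMS checkpoints
# live under the "facebook" org on the Hub, e.g. facebook/mms-tts-eng for English):
#   audio, sr = synth_mms("Hello world!", "facebook/mms-tts-eng")
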
def synth_coqui(text: str, model: str):
    '''
    Use the Coqui TTS Python API to synthesize text locally.
    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Numpy audio array and sampling rate.
    IMPORTANT: The current implementation assumes a 22050 Hz sampling rate;
    verify this when adding a new model.
    '''
    if model is not None:
        # Pick a device
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Initialize the model
        tts = TTS(model, progress_bar=False).to(device)
        # Run inference
        wav = tts.tts(text=text)  # is_multi_speaker=False
        return np.array(wav), 22050
    else:
        return None
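
# Example usage (a sketch; the model name is an assumption -- any id reported by
# `TTS().list_models()` should work):
#   wav, sr = synth_coqui("Hello world!", "tts_models/en/ljspeech/vits")
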
def synth_espeakng(text: str, model: str):
    '''
    Use eSpeak NG to synthesize text.
    Inputs:
        text: Text to synthesize
        model: Voice code
    Returns:
        Numpy audio array and sampling rate.
    '''
    if model is not None:
        # Render to a temporary wav file, read it back, then clean up.
        # Note: '-w' and the filename must be separate argv items.
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text])
        #esng = espeakng.Speaker()
        #esng.voice = model
        #esng.say(text, export_path="test.wav")
        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")
        return wav, sampling_rate
    else:
        return None
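
# Example usage ("en" is a standard eSpeak NG voice code; run `espeak-ng --voices`
# for the full list):
#   wav, sr = synth_espeakng("Hello world!", "en")
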
def synth_africanvoices(text: str, model: str):
    '''
    Use flite with an African Voices voice file to synthesize text.
    Inputs:
        text: Text to synthesize
        model: Voice code (basename of a .flitevox file in the working directory)
    Returns:
        Numpy audio array and sampling rate.
    '''
    if model is not None:
        # Render to a temporary wav file, read it back, then clean up.
        # Note: each flag and its value must be separate argv items, with no extra quoting.
        subprocess.run(['flite', '-voice', f'{model}.flitevox', '-t', text, '-o', 'test.wav'])
        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")
        return wav, sampling_rate
    else:
        return None
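
# Example usage (a sketch; the voice name is hypothetical -- pass the basename of a
# downloaded African Voices .flitevox file, which must be in the working directory):
#   wav, sr = synth_africanvoices("Hello world!", "cmu_yor_awb")  # hypothetical voice file
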
def synth_toucan(text: str, model: str):
    '''
    Use Toucan to synthesize text.
    Inputs:
        text: Text to synthesize
        model: Language code
    Returns:
        Numpy audio array and sampling rate.
    NOTES: (1) This wrapper does not expose the full range of options the API offers.
    (2) The API should allow generating female voices, but that does not seem to be
    working at the moment. (3) This computes via the API of a Hugging Face Gradio Space.
    '''
    client = Client("Flux9665/MassivelyMultilingualTTS")
    result = client.predict(
        prompt=text,
        language=model,
        reference_audio=handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
        voice_seed=123,
        prosody_creativity=0.1,
        duration_scaling_factor=1,
        emb1=0,
        #emb2=0,
        api_name="/predict"
    )
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate
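
# Example usage (a sketch; "eng" assumes the Space takes ISO 639-3 language codes --
# check the dropdown values in the Flux9665/MassivelyMultilingualTTS Space):
#   wav, sr = synth_toucan("Hello world!", "eng")
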
def synth_piper(text: str, model):
    '''
    Use Piper (via the k2-fsa text-to-speech Space) to synthesize text.
    Inputs:
        text: Text to synthesize
        model: (language, repo_id) pair identifying the model
    Returns:
        Numpy audio array and sampling rate.
    NOTES: (1) This computes via the API of a Hugging Face Gradio Space.
    '''
    if model is not None:
        client = Client("k2-fsa/text-to-speech")
        result = client.predict(
            language=model[0],
            repo_id=model[1],
            text=text,
            sid="0",
            speed=1,
            api_name="/process"
        )
        sampling_rate, wav = wavfile.read(result[0])
        return wav, sampling_rate
    else:
        return None
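
# Example usage (a sketch; the (language, repo_id) pair is an assumption based on
# the dropdowns in the k2-fsa/text-to-speech Space):
#   wav, sr = synth_piper("Hello world!", ("English", "csukuangfj/vits-piper-en_US-amy-low"))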