import IPython
from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import wave
import espeakng
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os
import numpy as np
from gradio_client import Client, handle_file

def synth_mms(text:str, model:str):
    '''
    Use the Huggingface inference pipeline to synthesize text.
    (Can be replaced by the inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Streaming numpy array and sampling rate.
    '''
    if model is not None:
        # Run the text-to-speech pipeline on CPU (device=-1).
        pipe = pipeline("text-to-speech", model=model, device=-1)
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None
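
# Example usage (sketch): the checkpoint id below is an assumption; any MMS
# checkpoint of the form facebook/mms-tts-<lang> on the Hub should work the same way.
#   audio, rate = synth_mms("Hello world.", "facebook/mms-tts-eng")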
def synth_coqui(text:str, model:str):
    '''
    Use the Coqui TTS API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Streaming wav and sampling rate.

    IMPORTANT: The current implementation assumes a 22050 Hz sampling rate; this should be verified when adding a new model.
    '''
    if model is not None:
        # Use the GPU if one is available, otherwise fall back to the CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tts = TTS(model, progress_bar=False).to(device)
        wav = tts.tts(text=text)
        return np.array(wav), 22050
    else:
        return None
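
# Example usage (sketch): the model id below is an assumption; any model id shown
# by the `tts --list_models` command can be passed.
#   wav, rate = synth_coqui("Hello world.", "tts_models/en/ljspeech/vits")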
def synth_espeakng(text:str, model:str):
    '''
    Use eSpeak NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (eSpeak NG voice name)
    Returns:
        Streaming wav and sampling rate.
    '''
    if model is not None:
        # Write the synthesized audio to a temporary wav file, read it back, then delete it.
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text])

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove('test.wav')

        return wav, sampling_rate
    else:
        return None
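
# Example usage (sketch): the voice code below is an assumption; any voice listed
# by `espeak-ng --voices` can be passed.
#   wav, rate = synth_espeakng("Hello world.", "en-us")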
def synth_africanvoices(text:str, model:str):
    '''
    Use Flite (African Voices) to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code (name of a .flitevox voice file)
    Returns:
        Streaming wav and sampling rate.
    '''
    if model is not None:
        # Write the synthesized audio to a temporary wav file, read it back, then delete it.
        subprocess.run(['flite', '-voice', f'{model}.flitevox', '-t', text, '-o', 'test.wav'])

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove('test.wav')

        return wav, sampling_rate
    else:
        return None
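
# Example usage (sketch): assumes a voice file <model>.flitevox exists in the
# working directory; the voice name below is purely illustrative.
#   wav, rate = synth_africanvoices("Hello world.", "some_voice")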
def synth_toucan(text:str, model:str):
    '''
    Use Toucan to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Streaming wav and sampling rate.

    NOTES:
        (1) This wrapper does not expose the full range of options available through the API.
        (2) The API should allow generating female voices; however, this does not seem to be working at the moment.
        (3) This uses a Huggingface Gradio Space to compute via the API.
    '''
    client = Client("Flux9665/MassivelyMultilingualTTS")
    result = client.predict(
        prompt=text,
        language=model,
        reference_audio=handle_file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
        voice_seed=123,
        prosody_creativity=0.1,
        duration_scaling_factor=1,
        emb1=0,
        api_name="/predict"
    )
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate
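
# Example usage (sketch): the language code below is an assumption; pass whatever
# code the Space's language selector expects for the target language.
#   wav, rate = synth_toucan("Hello world.", "eng")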
def synth_piper(text:str, model:str):
    '''
    Use Piper (via the k2-fsa text-to-speech Space) to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code, a (language, repo_id) pair
    Returns:
        Streaming wav and sampling rate.

    NOTES: (1) This uses a Huggingface Gradio Space to compute via the API.
    '''
    if model is not None:
        client = Client("k2-fsa/text-to-speech")
        result = client.predict(
            language=model[0],
            repo_id=model[1],
            text=text,
            sid="0",
            speed=1,
            api_name="/process"
        )
        sampling_rate, wav = wavfile.read(result[0])
        return wav, sampling_rate
    else:
        return None
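
# Example usage (sketch): the language and repo id below are assumptions; they must
# match one of the choices exposed by the k2-fsa/text-to-speech Space.
#   wav, rate = synth_piper("Hello world.", ("English", "csukuangfj/vits-piper-en_US-amy-low"))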