# tts_mockingbird/src/synthesize.py
import os
import subprocess

import torch
from scipy.io import wavfile
from transformers import pipeline
from TTS.api import TTS

# Only needed for the commented-out Inference API route below:
# from huggingface_hub.inference_api import InferenceApi


def synth_mms(text:str, model:str):
    '''
    Use the Hugging Face inference pipeline to synthesize text.
    (Can be replaced by the Inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Hugging Face model id of the form facebook/mms-tts-LAN
               (passed directly to the pipeline)
    Returns:
        NumPy waveform array and sampling rate, or None if no model is given.
    '''
#inference = InferenceApi(repo_id=f"facebook/{model}",
# token=API_TOKEN)
#mms_tts = inference(inputs=text,
# raw_response=True)._content
if model is not None:
        pipe = pipeline("text-to-speech", model=model, device=-1)  # device=-1 runs on CPU; use device=0 for the first GPU
        mms_tts = pipe(text)  # dict with "audio" (NumPy array) and "sampling_rate"
        return mms_tts['audio'], mms_tts['sampling_rate']
else:
return None
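

# A minimal usage sketch, assuming the public MMS checkpoint
# "facebook/mms-tts-eng" (illustrative only, not part of this module):
#
#   audio, rate = synth_mms("Hello world", "facebook/mms-tts-eng")
#   IPython.display.Audio(audio, rate=rate)  # e.g. to play it back in a notebook
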
def synth_coqui(text:str, model:str):
    '''
    Use the Coqui TTS Python API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Coqui model name
    Returns:
        NumPy waveform array and sampling rate, or None if no model is given.
    '''
if model is not None:
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
        # Init TTS, synthesize to a temporary file, then read it back as a NumPy array
        tts = TTS(model, progress_bar=False).to(device)
        tts.tts_to_file(text=text, file_path="test.wav")
        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")
        #wav = tts.tts(text=text)  # alternative: keep the waveform in memory
return wav, sampling_rate
else:
return None
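

# A minimal usage sketch, assuming the stock Coqui model
# "tts_models/en/ljspeech/tacotron2-DDC" (illustrative only):
#
#   wav, rate = synth_coqui("Hello world", "tts_models/en/ljspeech/tacotron2-DDC")
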
def synth_espeakng(text:str, model:str):
    '''
    Use eSpeak NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: eSpeak NG voice code, e.g. "en"
    Returns:
        NumPy waveform array and sampling rate, or None if no model is given.
    '''
if model is not None:
        # Pass "-w" and the output path as separate argv items so espeak-ng
        # writes to "test.wav"; check=True raises if synthesis fails
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text], check=True)
        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")
return wav, sampling_rate
else:
return None
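

# A minimal usage sketch, assuming the "en" voice shipped with eSpeak NG
# (illustrative only):
#
#   wav, rate = synth_espeakng("Hello world", "en")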