Spaces:
Sleeping
Sleeping
import IPython | |
from huggingface_hub.inference_api import InferenceApi | |
import torch | |
from TTS.api import TTS | |
import wave | |
from espeakng import ESpeakNG | |
import subprocess | |
from scipy.io import wavfile | |
from transformers import pipeline | |
import os | |
def synth_mms(text:str, model:str):
    '''
    Synthesize text with a Huggingface "text-to-speech" inference pipeline.

    The pipeline is built locally for every call. The hosted inference API
    could be used instead, but that requires a stored API token.

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        The 'audio' and 'sampling_rate' entries of the pipeline output,
        or None when no model is given.
    '''
    # Guard clause: nothing to synthesize without a model.
    if model is None:
        return None

    # device=-1 keeps inference on the CPU; change it to use a GPU.
    synthesizer = pipeline("text-to-speech", model=model, device=-1)
    result = synthesizer(text)
    audio = result['audio']
    rate = result['sampling_rate']
    return audio, rate
def synth_coqui(text:str, model:str):
    '''
    Use Coqui TTS to synthesize text.

    The audio is rendered to a wav file inside a private temporary directory
    and read back, so concurrent calls (or other synthesizers writing to the
    working directory) cannot overwrite each other's output, and the file is
    removed even when reading it fails.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Wav samples and sampling rate, or None when no model is given.
    '''
    import tempfile

    if model is None:
        return None

    # Get device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Init TTS
    tts = TTS(model, progress_bar=False).to(device)

    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "coqui.wav")
        tts.tts_to_file(text=text, file_path=wav_path, is_multi_speaker=False)
        # wavfile.read loads the samples into memory (no mmap), so the
        # directory can be deleted as soon as the read returns.
        sampling_rate, wav = wavfile.read(wav_path)

    return wav, sampling_rate
def synth_espeakng(text:str, model:str):
    '''
    Use ESpeak-NG to synthesize text.

    Runs the espeak-ng command line tool, has it write a wav file into a
    private temporary directory, and reads that file back.

    Inputs:
        text: Text to synthesize
        model: Model code, passed to espeak-ng as the voice (-v)
    Returns:
        Wav samples and sampling rate, or None when no model is given.
    Raises:
        subprocess.CalledProcessError: espeak-ng exited with a non-zero status.
    '''
    import tempfile

    if model is None:
        return None

    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "espeakng.wav")
        # '-w' and the path must be separate argv entries: a single
        # "-w test.wav" element makes the option value " test.wav" (with a
        # leading space), which is not the file that is read back.
        # '--' ends option parsing so text starting with '-' is still spoken.
        command = ['espeak-ng', f'-v{model}', '-w', wav_path, '--', text]
        # check=True surfaces a failed synthesis immediately instead of
        # failing later on a missing or stale wav file.
        subprocess.run(command, check=True)
        # wavfile.read loads the samples into memory (no mmap), so the
        # directory can be deleted as soon as the read returns.
        sampling_rate, wav = wavfile.read(wav_path)

    return wav, sampling_rate