from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os


def synth_mms(text: str, model: str):
    '''
    Use the Hugging Face inference pipeline to synthesize text.
    (Can be replaced by the Inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Streaming numpy array and sampling rate.
    '''
    #inference = InferenceApi(repo_id=f"facebook/{model}",
    #                         token=API_TOKEN)
    #mms_tts = inference(inputs=text,
    #                    raw_response=True)._content

    if model is not None:
        pipe = pipeline("text-to-speech", model=model, device=-1)  # Change device if it should use GPU
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None


def synth_coqui(text: str, model: str):
    '''
    Use the Coqui TTS API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Streaming WAV and sampling rate.
    '''
    if model is not None:
        # Get device
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Init TTS
        tts = TTS(model, progress_bar=False).to(device)

        # Synthesize to a temporary file, read it back, then clean up
        tts.tts_to_file(text=text, file_path="test.wav")
        sampling_rate, wav = wavfile.read("test.wav")
        os.remove("test.wav")

        #wav = tts.tts(text=text)

        return wav, sampling_rate
    else:
        return None


def synth_espeakng(text: str, model: str):
    '''
    Use eSpeak NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Voice code
    Returns:
        Streaming WAV and sampling rate.
    '''
    if model is not None:
        # Write synthesized speech to a temporary file, read it back, then clean up
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text])
        sampling_rate, wav = wavfile.read("test.wav")
        os.remove("test.wav")

        #wav = tts.tts(text=text)

        return wav, sampling_rate
    else:
        return None
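

# Usage sketch (assumption: the model identifiers below are illustrative picks,
# e.g. the "facebook/mms-tts-eng" MMS checkpoint, the Coqui
# "tts_models/en/ljspeech/tacotron2-DDC" model, and the eSpeak NG "en" voice;
# they are not dictated by this module, and each backend must be installed).
if __name__ == "__main__":
    import numpy as np

    # Hugging Face MMS: pass the full hub id, since synth_mms hands `model` to the pipeline unchanged
    mms_out = synth_mms("Hello world.", "facebook/mms-tts-eng")
    if mms_out is not None:
        audio, sr = mms_out
        # The pipeline may return a (1, n) array; squeeze to mono before writing
        wavfile.write("mms_example.wav", sr, np.squeeze(audio))

    # Coqui TTS: a single-speaker English model
    coqui_out = synth_coqui("Hello world.", "tts_models/en/ljspeech/tacotron2-DDC")
    if coqui_out is not None:
        audio, sr = coqui_out
        wavfile.write("coqui_example.wav", sr, audio)

    # eSpeak NG: requires the espeak-ng binary on PATH
    espeak_out = synth_espeakng("Hello world.", "en")
    if espeak_out is not None:
        audio, sr = espeak_out
        wavfile.write("espeak_example.wav", sr, audio)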