demo_language_moore / src /text_to_speech.py
khof312's picture
Update requirements.
571e8a3
raw
history blame
768 Bytes
import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    '''
    Synthesize speech for the given text with a Facebook MMS-TTS model.

    Parameters
    ----------
    s : str
        The written text.
    iso3 : str
        The ISO-3 code of the text's language; it selects the
        ``facebook/mms-tts-{iso3}`` checkpoint.

    Returns
    -------
    synth : numpy.ndarray
        The synthesized waveform for ``s`` (the first item of the model's
        output batch), converted from a torch tensor to a NumPy array.
    '''
    # Load synthesizer. NOTE: tokenizer and model are loaded on every call.
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference: gradients are not needed, and disabling them lets the
    # resulting tensor be converted with .numpy() directly.
    with torch.no_grad():
        outputs = model(**inputs)

    synth = outputs.waveform[0]
    return synth.numpy()