|
|
|
from functools import lru_cache

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
|
|
|
@lru_cache(maxsize=4)
def _load_mms_tts(iso3: str):
    """Load (once per language) the MMS-TTS tokenizer and model for *iso3*."""
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)
    return tokenizer, model


def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    """
    Synthesize speech for the given text with a Facebook MMS-TTS model.

    The checkpoint ``facebook/mms-tts-{iso3}`` is loaded on first use of a
    language and kept in a small in-memory cache, so repeated calls for the
    same language do not re-instantiate the tokenizer and model.

    Parameters
    ----------
    s : str
        The written text.
    iso3 : str
        The ISO-3 code of the text's language; it selects the checkpoint.

    Returns
    -------
    synth : numpy.ndarray
        The synthesized waveform of the (single) input text. The sampling
        rate is not returned here.
        NOTE(review): presumably available as ``model.config.sampling_rate``
        for these checkpoints -- confirm before writing the audio to disk.
    """
    tokenizer, model = _load_mms_tts(iso3)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference only: no gradients are needed, and a tensor that tracks
    # gradients could not be converted with ``.numpy()`` below.
    with torch.no_grad():
        outputs = model(**inputs)

    # The model output is batched; one input text means index 0 is the
    # only waveform.
    synth = outputs.waveform[0]

    return synth.numpy()