|
import os
import pathlib
import time
from collections import namedtuple
from typing import List

import numpy as np
import torch
from TTS.api import TTS

# Accept the Coqui model license non-interactively so the XTTS download does not prompt.
os.environ["COQUI_TOS_AGREED"] = "1"

# The XTTS pipeline is loaded lazily on the first synthesis request.
tts_pipeline = None

Voice = namedtuple("Voice", ["name", "neutral", "angry", "speed"])

# Directory containing this file; reference wavs are resolved relative to it.
file_full_path = pathlib.Path(os.path.realpath(__file__)).parent
|
voices = [
    Voice(
        "Fast",
        neutral="empty",
        angry=None,
        speed=1.0,
    ),
    Voice(
        "Attenborough",
        neutral=f"{file_full_path}/audio/attenborough/neutral.wav",
        angry=None,
        speed=1.2,
    ),
    Voice(
        "Rick",
        neutral=f"{file_full_path}/audio/rick/neutral.wav",
        angry=None,
        speed=1.2,
    ),
    Voice(
        "Freeman",
        neutral=f"{file_full_path}/audio/freeman/neutral.wav",
        angry=f"{file_full_path}/audio/freeman/angry.wav",
        speed=1.1,
    ),
    Voice(
        "Walken",
        neutral=f"{file_full_path}/audio/walken/neutral.wav",
        angry=None,
        speed=1.1,
    ),
    Voice(
        "Darth Vader",
        neutral=f"{file_full_path}/audio/darth/neutral.wav",
        angry=None,
        speed=1.15,
    ),
]
|
|
def load_tts_pipeline():
    """Load the multilingual XTTS v2 model onto the GPU if available, else the CPU."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tts_pipeline = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return tts_pipeline
|
|
def compute_speaker_embedding(voice_path: str, config, pipeline, cache):
    """Compute (and cache) the GPT conditioning latents and speaker embedding for a reference wav."""
    if voice_path not in cache:
        cache[voice_path] = pipeline.synthesizer.tts_model.get_conditioning_latents(
            audio_path=voice_path,
            gpt_cond_len=config.gpt_cond_len,
            gpt_cond_chunk_len=config.gpt_cond_chunk_len,
            max_ref_length=config.max_ref_len,
            sound_norm_refs=config.sound_norm_refs,
        )
    return cache[voice_path]
|
|
# Build the dropdown labels, one entry per available (voice, emotion) pair.
voice_options = []
for voice in voices:
    if voice.neutral:
        voice_options.append(f"{voice.name} - Neutral")
    if voice.angry:
        voice_options.append(f"{voice.name} - Angry")
|
|
def voice_from_text(voice):
    """Map a dropdown label back to the reference wav path for that voice and emotion."""
    for v in voices:
        if voice == f"{v.name} - Neutral":
            return v.neutral
        if voice == f"{v.name} - Angry":
            return v.angry
    raise ValueError(f"Voice {voice} not found.")
|
|
def speed_from_text(voice):
    """Map a dropdown label to the playback speed configured for that voice."""
    for v in voices:
        if voice == f"{v.name} - Neutral":
            return v.speed
        if voice == f"{v.name} - Angry":
            return v.speed
    raise ValueError(f"Voice {voice} not found.")
|
|
def tts_xtts(
    self,
    text: str = "",
    language_name: str = "",
    reference_wav=None,
    gpt_cond_latent=None,
    speaker_embedding=None,
    split_sentences: bool = True,
    **kwargs,
) -> List[int]:
    """🐸 TTS magic. Run all the models and generate speech.

    Args:
        text (str): input text.
        language_name (str, optional): language id for multi-language models. Defaults to "".
        reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
        gpt_cond_latent ([type], optional): precomputed GPT conditioning latents for the speaker. Defaults to None.
        speaker_embedding ([type], optional): precomputed speaker embedding. Defaults to None.
        split_sentences (bool, optional): split the input text into sentences. Defaults to True.
        **kwargs: additional arguments to pass to the TTS model.

    Returns:
        List[int]: synthesized waveform samples, with a short silence appended after each sentence.
    """
    start_time = time.time()
    use_gl = self.vocoder_model is None
    wavs = []

    if not text and not reference_wav:
        raise ValueError(
            "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
        )

    if text:
        sens = [text]
        if split_sentences:
            print(" > Text split into sentences.")
            sens = self.split_into_sentences(text)
        print(sens)

    if not reference_wav:
        for sen in sens:
            outputs = self.tts_model.inference(
                sen,
                language_name,
                gpt_cond_latent,
                speaker_embedding,
                # GPT sampling parameters
                temperature=0.75,
                length_penalty=1.0,
                repetition_penalty=10.0,
                top_k=50,
                top_p=0.85,
                do_sample=True,
                **kwargs,
            )
            waveform = outputs["wav"]
            if (
                torch.is_tensor(waveform)
                and waveform.device != torch.device("cpu")
                and not use_gl
            ):
                waveform = waveform.cpu()
            if not use_gl:
                waveform = waveform.numpy()
            waveform = waveform.squeeze()

            wavs += list(waveform)
            wavs += [0] * 10000  # short silence between sentences

    process_time = time.time() - start_time
    audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
    print(f" > Processing time: {process_time}")
    print(f" > Real-time factor: {process_time / audio_time}")
    return wavs
|
|
def tts_gradio(text, voice, cache):
    global tts_pipeline
    if not tts_pipeline:
        tts_pipeline = load_tts_pipeline()

    voice_path = voice_from_text(voice)
    speed = speed_from_text(voice)
    (gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(
        voice_path, tts_pipeline.synthesizer.tts_config, tts_pipeline, cache
    )
    out = tts_xtts(
        tts_pipeline.synthesizer,
        text,
        language_name="en",
        speaker=None,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        speed=speed,
    )
    return (22050, np.array(out)), dict(text=text, voice=voice)
|
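# ---------------------------------------------------------------------------
# Optional demo wiring (a minimal sketch, not part of the original module).
# It assumes the `gradio` package is installed and is only one possible way to
# drive tts_gradio; the audio tuple returned by tts_gradio already matches
# gr.Audio's (sample_rate, ndarray) output format, and the metadata dict maps
# onto a gr.JSON component.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import gradio as gr

    latent_cache = {}  # memoises speaker latents between calls
    demo = gr.Interface(
        fn=lambda text, voice: tts_gradio(text, voice, latent_cache),
        inputs=[gr.Textbox(label="Text"), gr.Dropdown(voice_options, label="Voice")],
        outputs=[gr.Audio(label="Speech"), gr.JSON(label="Metadata")],
    )
    demo.launch()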