|
import sys |
|
import io, os, stat |
|
import subprocess |
|
import random |
|
from zipfile import ZipFile |
|
import uuid |
|
import time |
|
import torch |
|
import torchaudio |
|
import langid |
|
import base64 |
|
import csv |
|
from io import StringIO |
|
import datetime |
|
import re |
|
from scipy.io.wavfile import write |
|
from pydub import AudioSegment |
|
from TTS.api import TTS |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
from TTS.utils.generic_utils import get_user_data_dir |
|
from huggingface_hub import HfApi |
|
# Pre-accept Coqui's terms of service via env var so loading/downloading the
# XTTS model does not stop to prompt for interactive agreement.
os.environ["COQUI_TOS_AGREED"] = "1"

# Hugging Face repo id for XTTS — presumably consumed with HfApi/ModelManager
# elsewhere; it is not referenced in this chunk. TODO confirm against callers.
repo_id = "coqui/xtts"

from TTS.utils.manage import ModelManager
|
|
|
class xtts_model:
    """Wrapper around Coqui XTTS v2 for streaming, voice-cloned text-to-speech.

    Loads the model once in ``__init__`` and exposes :meth:`infer`, a generator
    that yields CPU audio chunks as they are synthesized.
    """

    def __init__(self, model_path, device="cuda:1"):
        """Load the XTTS checkpoint from *model_path* onto *device* in eval mode.

        Args:
            model_path: Directory containing ``config.json`` and the model
                checkpoint files.
            device: Torch device string. Defaults to ``"cuda:1"``, preserving
                the previously hard-coded placement.
        """
        config = XttsConfig()
        config.load_json(os.path.join(model_path, "config.json"))
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config, checkpoint_dir=model_path, eval=True, use_deepspeed=False
        )
        self.model.to(device).eval()

    def infer(self, prompt, voice, load_sr=20000, language="en"):
        """Stream synthesized speech for *prompt* cloned from the *voice* sample.

        Args:
            prompt: Text to synthesize.
            voice: Path (or paths) to reference audio used for voice cloning —
                forwarded to ``get_conditioning_latents(audio_path=...)``.
            load_sr: Sample rate at which the reference audio is loaded.
            language: XTTS language code. Defaults to ``"en"``, preserving the
                previously hard-coded value.

        Yields:
            Audio chunks (torch tensors) moved to CPU as they are produced.
        """
        gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
            audio_path=voice,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
            load_sr=load_sr,
        )
        try:
            for chunk in self.model.inference_stream(
                prompt,
                language,
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                repetition_penalty=5.0,
                temperature=0.75,
            ):
                yield chunk.to("cpu")
        finally:
            # Run cleanup even when the caller abandons the generator early
            # (GeneratorExit); previously these lines only executed if the
            # stream was fully consumed, leaking the conditioning tensors.
            del gpt_cond_latent, speaker_embedding
            torch.cuda.empty_cache()