import os

import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Coqui requires explicit agreement to its terms of service before the
# XTTS checkpoint can be used.
os.environ["COQUI_TOS_AGREED"] = "1"


class xtts_model:
    """Wrapper around an XTTS v2 checkpoint
    (tts_models/multilingual/multi-dataset/xtts_v2) with streaming inference."""

    def __init__(self, model_path, device="cuda:1"):
        # model_path must contain config.json plus the checkpoint files.
        config = XttsConfig()
        config.load_json(os.path.join(model_path, "config.json"))
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config,
            checkpoint_dir=model_path,
            eval=True,
            use_deepspeed=False,
        )
        self.model.to(device).eval()

    def infer(self, prompt, voice, language="en", load_sr=22050):
        # Build the speaker conditioning from the reference clip. XTTS
        # resamples the reference to load_sr; the model expects 22050 Hz.
        gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
            audio_path=voice,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
            load_sr=load_sr,
        )
        # Yield audio chunks as they are generated instead of waiting for
        # the full utterance.
        for out in self.model.inference_stream(
            prompt,
            language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        ):
            yield out.to("cpu")
        # Free the conditioning tensors once streaming finishes.
        del gpt_cond_latent, speaker_embedding
        torch.cuda.empty_cache()
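
# Minimal usage sketch (not part of the original script): the checkpoint
# directory and reference-clip paths below are placeholders. XTTS v2
# emits mono audio at 24 kHz, so the concatenated stream is saved at
# that rate.
if __name__ == "__main__":
    tts = xtts_model("/path/to/xtts_v2")  # directory holding config.json + weights
    chunks = [chunk for chunk in tts.infer("Hello from XTTS.", voice="speaker_ref.wav")]
    wav = torch.cat(chunks, dim=0)           # 1-D waveform on CPU
    torchaudio.save("output.wav", wav.unsqueeze(0), 24000)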