import os import random import gradio as gr import numpy as np import torch import torchaudio from huggingface_hub import snapshot_download from play_voice_inference.utils.voice_tokenizer import VoiceBpeTokenizer from play_voice_inference.models.play_voice import LanguageIdentifiers, SpeakerAttributes, SpeechAttributes, load_play_voice from play_voice_inference.utils.play_voice_sampler import PlayVoiceSampler from play_voice_inference.utils.pv_diff_sampler import PlayVoiceDiffusionDecoderSampler torch.set_grad_enabled(False) device = torch.device('cuda') HF_TOKEN = os.environ['HF_TOKEN'] print("Loading models...") tokenizer = VoiceBpeTokenizer() MODEL_DIR = snapshot_download('PlayHT/play-voice-v0-multi', token=HF_TOKEN) PV_AR_PT = MODEL_DIR + '/pv-v1-ar.pth' play_voice = load_play_voice(PV_AR_PT, device) sampler = PlayVoiceSampler(play_voice).to(device) NUM_DIFFUSION_STEPS: int = 150 DIFFUSION_PT = MODEL_DIR + '/pv-v1-diff-xf.pth' DIFFUSION_VOCODER_PT = MODEL_DIR + '/pv-v1-diff-bigvgan.pt' vocoder = PlayVoiceDiffusionDecoderSampler.from_path( DIFFUSION_PT, DIFFUSION_VOCODER_PT, steps=NUM_DIFFUSION_STEPS, silent=True, use_fp16=True, device=device ) print("Preparing voices...") VOICES_DIR = snapshot_download('PlayHT/play-voice-voices', repo_type='dataset', token=HF_TOKEN) def load_audio(path: str, sr=24000): audio, orig_sr = torchaudio.load(path) if orig_sr != sr: audio = torchaudio.transforms.Resample(orig_sr, sr)(audio) return audio def make_pcm(audio: torch.Tensor): # Must convert to 16-bit PCM for gradio # remove batch dim if any # if len(audio.shape) > 2: # audio = audio[0] # audio = audio.transpose(0, 1) # gradio expects [samples, channels] and throws very unhelpful errors if it's wrong gen_np = audio.squeeze().cpu().numpy() i = np.iinfo("int16") abs_max = 2 ** (i.bits - 1) offset = i.min + abs_max gen_np = (gen_np * abs_max + offset).clip(i.min, i.max).astype("int16") return gen_np initial_voices = [] for item in os.listdir(VOICES_DIR): if item.endswith(".wav"): name = os.path.splitext(item)[0] initial_voices.append({"name": name, "audio": load_audio(os.path.join(VOICES_DIR, item))}) initial_voices.sort(key=lambda x: x["name"]) print(f"Found {len(initial_voices)} initial voices") def get_voice_labels(voices: list[dict]): labels = [] for voice in voices: labels.append(voice["name"]) return labels with gr.Blocks(analytics_enabled=False, title="Play Voice", mode="tts") as iface: local_voices = gr.State(initial_voices) def get_selected_voice_by_label(voices, label: str): labels = get_voice_labels(voices) for i, voice_label in enumerate(labels): if voice_label == label: return voices[i] raise Exception("Voice not found: " + label) def make_voice_dropdown(voices): choices = get_voice_labels(voices) return gr.Dropdown( choices=choices, value=choices[-1] if len(choices) > 0 else None, label="Voice", ) def make_enum_dropdown(enum, label, default=None, allow_none=False): choices = [e.name for e in enum] if allow_none: choices.append("none") return gr.Dropdown( choices=choices, value=default, label=label, ) def get_enum_value(enum, value): if value == "none": return None return enum[value] gr.Markdown("# Play Voice\n") with gr.Tab("TTS"): speak_text = gr.Textbox(lines=2, placeholder="What would you like to say?", label="Text") speak_voice = make_voice_dropdown(initial_voices) with gr.Accordion("Settings", open=False): speaker_attributes = make_enum_dropdown( SpeakerAttributes, "Speaker Attributes", "full_sentence", allow_none=True ) speech_attributes = make_enum_dropdown(SpeechAttributes, "Speech Attributes", "none", allow_none=True) language = make_enum_dropdown(LanguageIdentifiers, "Language", "none", allow_none=True) temperature = gr.Slider(minimum=0, maximum=2.0, value=0.3, label="Temperature") repetition_penalty = gr.Slider(minimum=1.0, maximum=10.0, value=1.8, label="Repetition Penalty") filter_thresh = gr.Slider(minimum=0.1, maximum=1.0, value=0.75, label="Top-p Threshold") voice_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.4, label="Voice Guidance") style_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.1, label="Style Guidance") text_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.6, label="Text Guidance") speak_submit = gr.Button("Speak!") speak_result = gr.Audio(label="Result", interactive=False) ref_voice = gr.Audio(label="Reference Voice", interactive=False) @torch.no_grad() def handle_speak( text, voices, voice_name, voice_guidance, speaker_attributes, speech_attributes, language, temperature, repetition_penalty, top_p, style_guidance, text_guidance, ): if text.strip() == "": text = "I am PlayVoice, the voice of the future. Feed me your words and I will speak them, hahahaha!" voice = get_selected_voice_by_label(voices, voice_name) seed = random.randint(0, 2**32 - 1) print(f"Voice: {voice['name']} Text: {text}") voice_emb = sampler.get_voice_embedding(voice["audio"]) text_tokens = [] text_tokens.append(torch.tensor(tokenizer.encode(text), dtype=torch.int, device=device)) text_tokens = torch.nn.utils.rnn.pad_sequence(text_tokens, batch_first=True, padding_value=0) torch.manual_seed(seed) sample_result = sampler.sample_batched( text_tokens=text_tokens, text_guidance=text_guidance, voice_emb=voice_emb, voice_guidance=voice_guidance, speaker_attributes=get_enum_value(SpeakerAttributes, speaker_attributes), speech_attributes=get_enum_value(SpeechAttributes, speech_attributes), language_identifier=get_enum_value(LanguageIdentifiers, language), style_guidance=float(style_guidance), temperature=float(temperature), repetition_penalty=float(repetition_penalty), top_p=float(top_p), ) latents = sample_result["latents"] audio = vocoder.sample(text_tokens, latents, ref_wav=voice["audio"]) audio = make_pcm(audio) return { speak_result: (vocoder.OUTPUT_FREQUENCY, audio), ref_voice: (22050, make_pcm(voice["audio"])), } speak_submit.click( handle_speak, inputs=[ speak_text, local_voices, speak_voice, voice_guidance, speaker_attributes, speech_attributes, language, temperature, repetition_penalty, filter_thresh, style_guidance, text_guidance, ], outputs=[ speak_result, ref_voice, ], ) with gr.Tab("Clone Voice"): new_voice_name = gr.Textbox(value="cloned-voice", label="Voice Name") new_voice_audio = gr.Audio(label="Voice Audio (20s min, ideally 30s, anything longer will be truncated)", sources=["upload", "microphone"], ) new_voice_submit = gr.Button("Create!") new_voice_result = gr.Label("") def on_new_voice_submit(voices, name, raw_audio): assert raw_audio is not None, "Must provide audio" sr = raw_audio[0] torch_audio = torch.from_numpy(raw_audio[1]).float() / 32768.0 if torch_audio.ndim == 1: torch_audio = torch_audio.unsqueeze(0) else: torch_audio = torch_audio.transpose(0, 1).mean(dim=0, keepdim=True) if sr != 24000: if sr < 16000: raise Exception( "Garbage in, garbage out. Please provide audio with a sample rate of at least 16kHz, ideally 24kHz." ) torch_audio = torchaudio.transforms.Resample(sr, 24000)(torch_audio) # trim to 30s if torch_audio.shape[1] > 24000 * 30: torch_audio = torch_audio[:, : 24000 * 30] # add to local voices voices.append({"name": name, "audio": torch_audio}) return { speak_voice: make_voice_dropdown(voices), new_voice_result: f"Created voice {name}", } new_voice_submit.click( on_new_voice_submit, inputs = [ local_voices, new_voice_name, new_voice_audio ], outputs=[ speak_voice, new_voice_result ] ) iface.launch(show_error=True, share=False)