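# Gradio demo for PlayHT's Play Voice TTS: an autoregressive sampler turns text
# and a reference-voice embedding into speech latents, and a diffusion decoder
# plus BigVGAN vocoder turns those latents into audio.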
import os
import random

import gradio as gr
import numpy as np
import torch
import torchaudio
from huggingface_hub import snapshot_download

from play_voice_inference.utils.voice_tokenizer import VoiceBpeTokenizer
from play_voice_inference.models.play_voice import LanguageIdentifiers, SpeakerAttributes, SpeechAttributes, load_play_voice
from play_voice_inference.utils.play_voice_sampler import PlayVoiceSampler
from play_voice_inference.utils.pv_diff_sampler import PlayVoiceDiffusionDecoderSampler

torch.set_grad_enabled(False)
device = torch.device('cuda')
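
# HF_TOKEN is required by the snapshot_download calls below (e.g. set it as a
# Space secret); startup fails fast with a KeyError if it is missing.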
HF_TOKEN = os.environ['HF_TOKEN']

print("Loading models...")
tokenizer = VoiceBpeTokenizer()
MODEL_DIR = snapshot_download('PlayHT/play-voice-v0-multi', token=HF_TOKEN)

PV_AR_PT = MODEL_DIR + '/pv-v1-ar.pth'
play_voice = load_play_voice(PV_AR_PT, device)
sampler = PlayVoiceSampler(play_voice).to(device)

NUM_DIFFUSION_STEPS: int = 150
DIFFUSION_PT = MODEL_DIR + '/pv-v1-diff-xf.pth'
DIFFUSION_VOCODER_PT = MODEL_DIR + '/pv-v1-diff-bigvgan.pt'
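
# The diffusion decoder refines the AR model's latents into a waveform; more
# steps generally trade speed for quality.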
vocoder = PlayVoiceDiffusionDecoderSampler.from_path(
    DIFFUSION_PT,
    DIFFUSION_VOCODER_PT,
    steps=NUM_DIFFUSION_STEPS,
    silent=True,
    use_fp16=True,
    device=device,
)

print("Preparing voices...")
VOICES_DIR = snapshot_download('PlayHT/play-voice-voices', repo_type='dataset', token=HF_TOKEN)
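
# Load a reference wav and resample it to the 24 kHz rate the pipeline uses
# everywhere else.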
def load_audio(path: str, sr=24000):
    audio, orig_sr = torchaudio.load(path)
    if orig_sr != sr:
        audio = torchaudio.transforms.Resample(orig_sr, sr)(audio)
    return audio
def make_pcm(audio: torch.Tensor):
    # Convert float audio in [-1, 1] to 16-bit PCM, which gradio expects.
    # Squeeze away any batch/channel dims so mono comes out as a 1-D array;
    # gradio throws very unhelpful errors when the shape is wrong.
    gen_np = audio.squeeze().cpu().numpy()
    i = np.iinfo("int16")
    abs_max = 2 ** (i.bits - 1)  # 32768
    offset = i.min + abs_max  # 0 for int16; kept for generality
    # e.g. 0.5 -> 16384, while +/-1.0 lands at the int16 limits
    gen_np = (gen_np * abs_max + offset).clip(i.min, i.max).astype("int16")
    return gen_np
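
# Seed the voice list with every .wav in the voices dataset.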
initial_voices = []
for item in os.listdir(VOICES_DIR):
    if item.endswith(".wav"):
        name = os.path.splitext(item)[0]
        initial_voices.append({"name": name, "audio": load_audio(os.path.join(VOICES_DIR, item))})
initial_voices.sort(key=lambda x: x["name"])
print(f"Found {len(initial_voices)} initial voices")
def get_voice_labels(voices: list[dict]):
    return [voice["name"] for voice in voices]
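
# Build the gradio UI: a TTS tab plus a Clone Voice tab that appends new
# reference voices to the shared per-session voice list.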
with gr.Blocks(analytics_enabled=False, title="Play Voice", mode="tts") as iface:
    local_voices = gr.State(initial_voices)

    def get_selected_voice_by_label(voices, label: str):
        for i, voice_label in enumerate(get_voice_labels(voices)):
            if voice_label == label:
                return voices[i]
        raise ValueError(f"Voice not found: {label}")
    def make_voice_dropdown(voices):
        choices = get_voice_labels(voices)
        return gr.Dropdown(
            choices=choices,
            value=choices[-1] if len(choices) > 0 else None,
            label="Voice",
        )

    def make_enum_dropdown(enum, label, default=None, allow_none=False):
        choices = [e.name for e in enum]
        if allow_none:
            choices.append("none")
        return gr.Dropdown(
            choices=choices,
            value=default,
            label=label,
        )

    def get_enum_value(enum, value):
        if value == "none":
            return None
        return enum[value]
    gr.Markdown("# Play Voice (pretrained)\n")
    with gr.Tab("TTS"):
        speak_text = gr.Textbox(lines=2, placeholder="What would you like to say?", label="Text")
        speak_voice = make_voice_dropdown(initial_voices)
        with gr.Accordion("Settings", open=False):
            speaker_attributes = make_enum_dropdown(
                SpeakerAttributes, "Speaker Attributes", "full_sentence", allow_none=True
            )
            speech_attributes = make_enum_dropdown(SpeechAttributes, "Speech Attributes", "none", allow_none=True)
            language = make_enum_dropdown(LanguageIdentifiers, "Language", "none", allow_none=True)
            temperature = gr.Slider(minimum=0, maximum=2.0, value=0.3, label="Temperature")
            repetition_penalty = gr.Slider(minimum=1.0, maximum=10.0, value=1.8, label="Repetition Penalty")
            filter_thresh = gr.Slider(minimum=0.1, maximum=1.0, value=0.75, label="Top-p Threshold")
            voice_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.4, label="Voice Guidance")
            style_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.1, label="Style Guidance")
            text_guidance = gr.Slider(minimum=0.0, maximum=6.0, value=0.6, label="Text Guidance")
        speak_submit = gr.Button("Speak!")
        speak_result = gr.Audio(label="Result", interactive=False)
        ref_voice = gr.Audio(label="Reference Voice", interactive=False)
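
        # Tokenize the text, sample latents with the AR model under the chosen
        # guidance settings, then decode them to audio with the diffusion vocoder.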
        def handle_speak(
            text,
            voices,
            voice_name,
            voice_guidance,
            speaker_attributes,
            speech_attributes,
            language,
            temperature,
            repetition_penalty,
            top_p,
            style_guidance,
            text_guidance,
        ):
            if text.strip() == "":
                text = "I am PlayVoice, the voice of the future. Feed me your words and I will speak them, hahahaha!"
            voice = get_selected_voice_by_label(voices, voice_name)
            seed = random.randint(0, 2**32 - 1)
            print(f"Voice: {voice['name']} Text: {text}")
            voice_emb = sampler.get_voice_embedding(voice["audio"])
            text_tokens = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(tokenizer.encode(text), dtype=torch.int, device=device)],
                batch_first=True,
                padding_value=0,
            )
            torch.manual_seed(seed)
            sample_result = sampler.sample_batched(
                text_tokens=text_tokens,
                text_guidance=text_guidance,
                voice_emb=voice_emb,
                voice_guidance=voice_guidance,
                speaker_attributes=get_enum_value(SpeakerAttributes, speaker_attributes),
                speech_attributes=get_enum_value(SpeechAttributes, speech_attributes),
                language_identifier=get_enum_value(LanguageIdentifiers, language),
                style_guidance=float(style_guidance),
                temperature=float(temperature),
                repetition_penalty=float(repetition_penalty),
                top_p=float(top_p),
            )
            latents = sample_result["latents"]
            audio = vocoder.sample(text_tokens, latents, ref_wav=voice["audio"])
            audio = make_pcm(audio)
            return {
                speak_result: (vocoder.OUTPUT_FREQUENCY, audio),
                # reference audio was loaded/resampled at 24 kHz above, not 22.05 kHz
                ref_voice: (24000, make_pcm(voice["audio"])),
            }
        speak_submit.click(
            handle_speak,
            inputs=[
                speak_text,
                local_voices,
                speak_voice,
                voice_guidance,
                speaker_attributes,
                speech_attributes,
                language,
                temperature,
                repetition_penalty,
                filter_thresh,
                style_guidance,
                text_guidance,
            ],
            outputs=[
                speak_result,
                ref_voice,
            ],
        )
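
    # The Clone Voice tab appends a new reference voice to the shared state so
    # it shows up in the TTS tab's dropdown.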
with gr.Tab("Clone Voice"): | |
new_voice_name = gr.Textbox(value="cloned-voice", label="Voice Name") | |
new_voice_audio = gr.Audio(label="Voice Audio (20s min, ideally 30s, anything longer will be truncated)", | |
sources=["upload", "microphone"], | |
) | |
new_voice_submit = gr.Button("Create!") | |
new_voice_result = gr.Label("") | |
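
        # Normalize the uploaded clip: int16 -> float32 in [-1, 1], downmix to
        # mono, resample to 24 kHz, and keep at most the first 30 seconds.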
        def on_new_voice_submit(voices, name, raw_audio):
            assert raw_audio is not None, "Must provide audio"
            sr = raw_audio[0]
            # gradio delivers (sample_rate, int16 ndarray); scale to [-1, 1]
            torch_audio = torch.from_numpy(raw_audio[1]).float() / 32768.0
            if torch_audio.ndim == 1:
                torch_audio = torch_audio.unsqueeze(0)  # [samples] -> [1, samples]
            else:
                # [samples, channels] -> [channels, samples], then downmix to mono
                torch_audio = torch_audio.transpose(0, 1).mean(dim=0, keepdim=True)
            if sr != 24000:
                if sr < 16000:
                    raise Exception(
                        "Garbage in, garbage out. Please provide audio with a sample rate of at least 16kHz, ideally 24kHz."
                    )
                torch_audio = torchaudio.transforms.Resample(sr, 24000)(torch_audio)
            # trim to 30s
            if torch_audio.shape[1] > 24000 * 30:
                torch_audio = torch_audio[:, : 24000 * 30]
            # add to local voices and select it in the TTS dropdown
            voices.append({"name": name, "audio": torch_audio})
            return {
                speak_voice: make_voice_dropdown(voices),
                new_voice_result: f"Created voice {name}",
            }
        new_voice_submit.click(
            on_new_voice_submit,
            inputs=[
                local_voices,
                new_voice_name,
                new_voice_audio,
            ],
            outputs=[
                speak_voice,
                new_voice_result,
            ],
        )
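
# show_error=True surfaces handler exceptions in the browser instead of only
# in the Space logs.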
iface.launch(show_error=True, share=False)