File size: 1,504 Bytes
4b1870b 1828e31 4b1870b a926d6a 4b1870b baa70b6 4b1870b 1828e31 4b1870b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import os
import torch
import gradio as gr
# Run on GPU when available; all three models and inference share this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Vocoder
# Converts model-produced spectrograms to waveforms (BigVSAN via torch.hub).
# NOTE(review): torch.hub.load downloads the repo on first run — needs network.
vocoder = torch.hub.load(repo_or_dir='ex3ndr/supervoice-vocoder', model='bigvsan')
vocoder.to(device)
vocoder.eval()
# GPT Model
# Text front-end (phonemizer) consumed by the main voicebox model below.
gpt = torch.hub.load(repo_or_dir='ex3ndr/supervoice-gpt', model='phonemizer')
gpt.to(device)
gpt.eval()
# Main Model
# End-to-end synthesizer; composes the gpt phonemizer and the vocoder.
model = torch.hub.load(repo_or_dir='ex3ndr/supervoice-voicebox', model='phonemizer', gpt=gpt, vocoder=vocoder)
model.to(device)
model.eval()
# Markdown description shown at the top of the gradio UI (currently empty).
description = f'''
'''
def synthesise(text, voice):
    """Synthesize speech for *text* using the selected voice preset.

    Args:
        text: Input text to synthesize.
        voice: Voice preset name (one of the dropdown choices, e.g. "voice_1").

    Returns:
        A ``(sample_rate, waveform)`` tuple as expected by ``gr.Audio`` with
        ``type="numpy"``. Sample rate is hard-coded to 24000 Hz — presumably
        the vocoder's output rate; TODO confirm against the bigvsan config.
    """
    # inference_mode: skip autograd graph construction — the models are all
    # in eval() and this is a pure serving path, so gradients are never used.
    with torch.inference_mode():
        output = model.synthesize(text, voice=voice, steps=8, alpha=0.1)
    waveform = output['wav']
    # Move off the GPU and drop batch/channel dims for gradio's numpy format.
    return (24000, waveform.cpu().squeeze().numpy())
if __name__ == "__main__":
    # Build the individual UI components first, then assemble the interface.
    text_box = gr.Text(label='Text:', lines=5, max_lines=10)
    voice_picker = gr.Dropdown(
        label="voice",
        choices=("voice_1", "voice_2", "voice_3"),
        value="voice_1",
    )
    audio_out = gr.Audio(
        label="Audio:",
        autoplay=False,
        streaming=False,
        type="numpy",
    )

    # Wire the synthesis function to the UI; flagging is disabled.
    demo = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[text_box, voice_picker],
        outputs=[audio_out],
        allow_flagging='never',
        cache_examples=True,
        title='Voicebox demo',
        examples=[],
    )
    # Queue requests so at most 4 run concurrently, with a backlog cap of 20.
    demo.queue(max_size=20, default_concurrency_limit=4)
    # Bind to all interfaces so the demo is reachable from outside the host.
    demo.launch(share=False, server_name="0.0.0.0")
|