import torch
import gradio as gr

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocoder: converts the model's spectrogram output into a waveform.
vocoder = torch.hub.load(repo_or_dir='ex3ndr/supervoice-vocoder', model='bigvsan')
vocoder.to(device)
vocoder.eval()

# GPT model: phonemizes the input text for the main model.
gpt = torch.hub.load(repo_or_dir='ex3ndr/supervoice-gpt', model='phonemizer')
gpt.to(device)
gpt.eval()

# Main model: the Voicebox synthesizer, wired to the phonemizer and vocoder.
model = torch.hub.load(repo_or_dir='ex3ndr/supervoice-voicebox', model='phonemizer', gpt=gpt, vocoder=vocoder)
model.to(device)
model.eval()

description = '''
Synthesize speech from text with the Supervoice Voicebox model.
'''


def synthesise(text, voice):
    # Inference only: run synthesis without tracking gradients.
    with torch.inference_mode():
        output = model.synthesize(text, voice=voice, steps=8, alpha=0.1)
    waveform = output['wav']
    # Gradio expects (sample_rate, numpy array); the model outputs 24 kHz audio.
    return (24000, waveform.cpu().squeeze().numpy())


if __name__ == "__main__":
    i = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[
            gr.Text(label='Text:', lines=5, max_lines=10),
            gr.Dropdown(label="Voice:", choices=["voice_1", "voice_2", "voice_3"], value="voice_1"),
        ],
        outputs=[
            gr.Audio(
                label="Audio:",
                autoplay=False,
                streaming=False,
                type="numpy",
            ),
        ],
        allow_flagging='never',
        cache_examples=False,  # no examples are provided, so there is nothing to cache
        title='Voicebox demo',
        examples=[],
    )
    i.queue(max_size=20, default_concurrency_limit=4)
    i.launch(share=False, server_name="0.0.0.0")