File size: 1,504 Bytes
4b1870b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1828e31
4b1870b
 
 
 
 
a926d6a
4b1870b
 
 
 
 
 
 
baa70b6
4b1870b
 
 
 
 
 
 
 
 
 
 
 
1828e31
4b1870b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os

import torch
import gradio as gr

# NOTE(review): `os` is imported but never used in this file — confirm before removing.

# Use the GPU when one is available; all three models are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocoder: converts the model's acoustic output into a waveform.
vocoder = torch.hub.load(repo_or_dir='ex3ndr/supervoice-vocoder', model='bigvsan')
vocoder.to(device)
vocoder.eval()

# GPT Model: phonemizer used as a text front-end for the main model.
gpt = torch.hub.load(repo_or_dir='ex3ndr/supervoice-gpt', model='phonemizer')
gpt.to(device)
gpt.eval()

# Main Model: the voicebox synthesizer, wired to the GPT phonemizer and vocoder above.
# NOTE(review): model='phonemizer' here matches the GPT load and may be a copy-paste
# slip — confirm the intended hub entrypoint for supervoice-voicebox.
model = torch.hub.load(repo_or_dir='ex3ndr/supervoice-voicebox', model='phonemizer', gpt=gpt, vocoder=vocoder)
model.to(device)
model.eval()



# Markdown shown above the Gradio interface.
# NOTE(review): this is an f-string with no placeholders and only whitespace content —
# presumably a stub to be filled in later.
description = f'''
 
'''

def synthesise(text, voice):
    """Synthesize speech for *text* with the selected voice preset.

    Returns a ``(sample_rate, samples)`` tuple for a ``gr.Audio`` output with
    ``type="numpy"``; the model is assumed to emit 24 kHz audio — confirm.
    """
    result = model.synthesize(text, voice=voice, steps=8, alpha=0.1)
    samples = result['wav'].cpu().squeeze().numpy()
    return (24000, samples)

if __name__ == "__main__":
    # Assemble the Gradio UI: free text plus a voice preset in, audio out.
    text_box = gr.Text(label='Text:', lines=5, max_lines=10)
    voice_picker = gr.Dropdown(label="voice", choices=("voice_1", "voice_2", "voice_3"), value="voice_1")
    audio_out = gr.Audio(label="Audio:", autoplay=False, streaming=False, type="numpy")

    demo = gr.Interface(
        fn=synthesise,
        description=description,
        inputs=[text_box, voice_picker],
        outputs=[audio_out],
        allow_flagging='never',
        cache_examples=True,
        title='Voicebox demo',
        examples=[],
    )
    # Queue requests so at most 4 syntheses run concurrently, holding up to 20 waiting.
    demo.queue(max_size=20, default_concurrency_limit=4)
    # Bind to all interfaces so the demo is reachable from outside the container/host.
    demo.launch(share=False, server_name="0.0.0.0")