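"""Gradio demo: Japanese text-to-speech with a trained VITS-style model.

Loads a SynthesizerTrn checkpoint and serves a minimal web UI that turns
Japanese text into a WAV file. The config and checkpoint paths below
("./configs/jp_base.json", "./logs/jp_base") are project-specific.
"""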
import tempfile

import torch
import scipy.io.wavfile as wavfile
import gradio as gr

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols

def get_text(text, hps):
    """Convert raw text to the LongTensor of symbol IDs the model expects."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave a blank token (ID 0) between symbols, matching training
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def text_to_speech(text):
    """Synthesize `text` and return the path of a temporary WAV file."""
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)  # add batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        # noise_scale / noise_scale_w control sampling variability;
        # length_scale > 1 slows the speech down slightly
        audio = net_g.infer(
            x_tst, x_tst_lengths,
            noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2,
        )[0][0, 0].data.float().numpy()

    # delete=False keeps the file on disk so Gradio can serve it after the
    # handle is closed; the temporary files are not cleaned up here
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wavfile.write(f.name, hps.data.sampling_rate, audio)
        audio_file = f.name
    return audio_file
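# Example (hypothetical input; any Japanese string the configured text
# cleaners accept should work):
#   wav_path = text_to_speech("こんにちは")
#   -> returns a path like /tmp/tmpXXXXXX.wav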

# Load the trained model (config and checkpoint paths are project-specific)
hps = utils.get_hparams_from_file("./configs/jp_base.json")
hps.model_dir = './logs/jp_base'
pretrained_model = f'{hps.model_dir}/model.pth'

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()  # inference mode (disables dropout etc.)
_ = utils.load_checkpoint(pretrained_model, net_g, None)

# Define the user interface using Gradio. gr.Textbox / gr.Audio replace
# the deprecated gr.inputs / gr.outputs namespaces removed in Gradio 3.x+.
ui = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label='Enter Text Here'),
    outputs=gr.Audio(label='Speech', type='filepath'),
    title='Text-to-Speech for Japanese Demo',
    description='Generate speech from Japanese text using a text-to-speech model.'
)


# Run the interface (starts a local web server)
if __name__ == '__main__':
    ui.launch()
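# To try the demo (assuming this file is saved as app.py):
#   python app.py
# Gradio prints a local URL (http://127.0.0.1:7860 by default) to open
# in a browser.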