Spaces:

quyanh
/

vits-japanese

Runtime error

File size: 2,438 Bytes

d1ceaed

import gradio as gr
import base64
import torch
import io
import scipy.io.wavfile as wavfile
from PIL import Image
import numpy as np

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

import subprocess
import os
import tempfile

def get_text(text, hps):
    """Convert raw text into a ``torch.LongTensor`` of symbol ids.

    The text is passed to ``text_to_sequence`` together with the cleaners
    listed in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is
    truthy, the sequence is additionally run through
    ``commons.intersperse`` with ``0`` as the interspersed item.

    Args:
        text: The input string to convert.
        hps: Hyper-parameter object exposing ``data.text_cleaners`` and
            ``data.add_blank``.

    Returns:
        The resulting id sequence wrapped in a ``torch.LongTensor``.
    """
    cleaners = hps.data.text_cleaners
    sequence = text_to_sequence(text, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)


def text_to_speech(text, *, noise_scale=.667, noise_scale_w=0.8, length_scale=1.2):
    """Synthesize ``text`` with the module-level model and save it as a WAV file.

    The text is converted to a tensor with ``get_text`` (using the
    module-level ``hps``), fed to ``net_g.infer`` as a batch of one, and the
    ``[0][0, 0]`` slice of the result is written to a freshly created
    temporary ``.wav`` file at ``hps.data.sampling_rate``.

    Args:
        text: The text to synthesize.
        noise_scale: Forwarded to ``net_g.infer`` as ``noise_scale``.
        noise_scale_w: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.

    Returns:
        The path of the written WAV file. The file is created with
        ``delete=False`` and is not removed here, so cleaning it up is left
        to the caller.
    """
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)  # add a leading batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        output = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        # ``.cpu()`` is a no-op for CPU tensors and keeps ``.numpy()`` valid
        # should the model ever be moved to another device.
        audio = output[0][0, 0].data.cpu().float().numpy()

    # Only reserve a unique path here and close the handle before writing:
    # re-opening a still-open NamedTemporaryFile by name fails on Windows and
    # would otherwise leave two handles on the same file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        audio_file = f.name
    wavfile.write(audio_file, hps.data.sampling_rate, audio)

    # Return the audio file path
    return audio_file

# Load the trained model: read the hyper-parameters, build the synthesizer in
# eval mode, then restore its weights from the checkpoint file.
hps = utils.get_hparams_from_file("./configs/jp_base.json")
hps.model_dir = './logs/jp_base'
pretrained_model = f'{hps.model_dir}/model.pth'

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

if not os.path.isfile(pretrained_model):
    # The checkpoint is not on disk yet: run the startup shell script, which
    # is presumably responsible for fetching it (verify against startup.sh).
    exit_code = subprocess.call('./startup.sh', shell=True)
    # Fail with an actionable message instead of an opaque error from inside
    # load_checkpoint when the script did not produce the checkpoint.
    if not os.path.isfile(pretrained_model):
        raise FileNotFoundError(
            f"Checkpoint {pretrained_model!r} is still missing after running "
            f"./startup.sh (exit code {exit_code})."
        )

_ = utils.load_checkpoint(pretrained_model, net_g, None)

# Callback handed to the Gradio interface to turn text into speech
def generate_speech(text):
    """Generate speech for ``text`` and return the result.

    Thin wrapper that forwards ``text`` to ``text_to_speech`` and returns
    whatever that function returns, unchanged.
    """
    return text_to_speech(text)

# Define the input/output components for the text-to-speech model.
# They are taken from the top-level ``gr`` namespace: the legacy
# ``gr.inputs`` / ``gr.outputs`` modules are deprecated in Gradio 3.x and
# absent from later releases, where accessing them raises AttributeError.
text_input = gr.Textbox(label='Enter Text Here')
# type='filepath' matches generate_speech, which is wired in below as ``fn``.
output_audio = gr.Audio(label='Speech', type='filepath')

# Define the user interface using Gradio
ui = gr.Interface(
    fn=generate_speech,
    inputs=text_input,
    outputs=output_audio,
    title='Text-to-Speech Demo',
    description='Generate speech from text using a text-to-speech model.'
)


# Run the interface
ui.launch(share=True)