vits-japanese / app.py
quyanh's picture
initial commit
d1ceaed
raw
history blame
2.44 kB
import gradio as gr
import base64
import torch
import io
import scipy.io.wavfile as wavfile
from PIL import Image
import numpy as np
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import subprocess
import os
import tempfile
def get_text(text, hps):
    """Convert *text* into a ``torch.LongTensor`` ready for the synthesizer.

    The text is passed to ``text_to_sequence`` together with the cleaners
    listed in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is
    truthy, the resulting sequence is additionally run through
    ``commons.intersperse`` with ``0`` as the inserted value.

    Args:
        text: The input string to convert.
        hps: Hyper-parameter object; only ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read here.

    Returns:
        A ``torch.LongTensor`` built from the (optionally interspersed)
        sequence.
    """
    cleaners = hps.data.text_cleaners
    sequence = text_to_sequence(text, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
def text_to_speech(text, *, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.2):
    """Synthesize *text* with the module-level model and save it as a WAV file.

    Uses the module-level ``hps`` (hyper-parameters) and ``net_g`` (model).

    Args:
        text: The input string to synthesize.
        noise_scale: Forwarded unchanged to ``net_g.infer``.
        noise_scale_w: Forwarded unchanged to ``net_g.infer``.
        length_scale: Forwarded unchanged to ``net_g.infer``.
            The defaults of these three keyword-only arguments are the
            values that used to be hard-coded, so ``text_to_speech(text)``
            behaves as before.

    Returns:
        The path of a newly created temporary ``.wav`` file. The file is
        created with ``delete=False`` and is therefore not removed
        automatically; cleaning it up is left to the caller.
    """
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        # Add a leading batch dimension of size 1 and pass the length along.
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        outputs = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        # ``detach()`` replaces the legacy ``.data`` accessor; ``cpu()`` is a
        # no-op for CPU tensors and keeps ``numpy()`` valid otherwise.
        audio = outputs[0][0, 0].detach().cpu().float().numpy()

    # Reserve a unique path first and close the handle, then let scipy open
    # the path itself, so the file is never open twice at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        audio_file = f.name
    wavfile.write(audio_file, hps.data.sampling_rate, audio)

    # Return the audio file path
    return audio_file
# Load the trained model: read the hyper-parameters, build the synthesizer in
# eval mode, then load the checkpoint into it.
hps = utils.get_hparams_from_file("./configs/jp_base.json")
hps.model_dir = './logs/jp_base'
pretrained_model = f'{hps.model_dir}/model.pth'
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()
if not os.path.isfile(pretrained_model):
    # The checkpoint is missing: run the startup shell script.
    # NOTE(review): presumably startup.sh fetches the checkpoint - confirm.
    startup_status = subprocess.call('./startup.sh', shell=True)
    if not os.path.isfile(pretrained_model):
        # Fail here with the script's exit status instead of letting the
        # checkpoint loader fail later on a missing file.
        raise FileNotFoundError(
            f"Checkpoint {pretrained_model!r} is still missing after running "
            f"./startup.sh (exit status {startup_status})"
        )
_ = utils.load_checkpoint(pretrained_model, net_g, None)
# Callback used by the Gradio interface to generate speech from text
def generate_speech(text):
    """Return whatever ``text_to_speech`` produces for *text*.

    Args:
        text: The string entered by the user.

    Returns:
        The value returned by ``text_to_speech(text)``, passed through
        unchanged.
    """
    return text_to_speech(text)
# Define the input and output components for the text-to-speech model.
# The top-level component classes are used instead of the deprecated
# ``gr.inputs`` / ``gr.outputs`` namespaces, which newer Gradio releases
# no longer provide.
text_input = gr.Textbox(label='Enter Text Here')
output_audio = gr.Audio(label='Speech', type='filepath')
# Define the user interface using Gradio: one text box in, one audio
# component out, with generate_speech as the callback.
ui = gr.Interface(
    fn=generate_speech,
    inputs=text_input,
    outputs=output_audio,
    title='Text-to-Speech Demo',
    description='Generate speech from text using a text-to-speech model.'
)
# Run the interface
ui.launch(share=True)