Spaces:

quyanh
/

vits-japanese

Runtime error

App Files Files Community

vits-japanese / app.py

quyanh

initial commit

d1ceaed over 1 year ago

raw

history blame

2.44 kB

	import gradio as gr
	import base64
	import torch
	import io
	import scipy.io.wavfile as wavfile
	from PIL import Image
	import numpy as np

	import commons
	import utils
	from models import SynthesizerTrn
	from text.symbols import symbols
	from text import text_to_sequence

	import subprocess
	import os
	import tempfile

	def get_text(text, hps):
	    """Convert raw text into a tensor of symbol ids.

	    Args:
	        text: The input string to convert.
	        hps: Hyper-parameter object. ``hps.data.text_cleaners`` is passed to
	            ``text_to_sequence`` and ``hps.data.add_blank`` decides whether
	            ``commons.intersperse`` is applied.

	    Returns:
	        A ``torch.LongTensor`` built from the resulting id sequence.
	    """
	    token_ids = text_to_sequence(text, hps.data.text_cleaners)
	    if hps.data.add_blank:
	        # presumably inserts the id 0 between neighbouring symbols --
	        # confirm against commons.intersperse
	        token_ids = commons.intersperse(token_ids, 0)
	    return torch.LongTensor(token_ids)


	def text_to_speech(text):
	    """Synthesize speech for ``text`` and return the path of a WAV file.

	    Uses the module-level ``hps`` (hyper-parameters) and ``net_g``
	    (synthesizer) objects.

	    Args:
	        text: The text to synthesize.

	    Returns:
	        Path of a newly created temporary ``.wav`` file. The file is created
	        with ``delete=False`` and is not removed here, so cleanup is left to
	        whoever consumes the path.
	    """
	    stn_tst = get_text(text, hps)
	    with torch.no_grad():
	        x_tst = stn_tst.unsqueeze(0)  # add a leading dimension of size 1
	        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
	        outputs = net_g.infer(
	            x_tst,
	            x_tst_lengths,
	            noise_scale=.667,
	            noise_scale_w=0.8,
	            length_scale=1.2,
	        )
	        audio = outputs[0][0, 0].data.float().numpy()

	    # Reserve a unique path and close the handle *before* writing: reopening
	    # a NamedTemporaryFile by name while it is still open is
	    # platform-dependent (it fails on Windows).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	        audio_file = f.name
	    wavfile.write(audio_file, hps.data.sampling_rate, audio)
	    return audio_file

	# Load the trained model
	hps = utils.get_hparams_from_file("./configs/jp_base.json")
	hps.model_dir = './logs/jp_base'
	pretrained_model = f'{hps.model_dir}/model.pth'

	# Build the synthesizer from the configuration and switch it to eval mode.
	net_g = SynthesizerTrn(
	    len(symbols),
	    hps.data.filter_length // 2 + 1,
	    hps.train.segment_size // hps.data.hop_length,
	    **hps.model)
	_ = net_g.eval()

	if not os.path.isfile(pretrained_model):
	    # The checkpoint is absent, so run the startup script first
	    # (presumably it fetches the checkpoint -- confirm against startup.sh).
	    startup_status = subprocess.call('./startup.sh', shell=True)
	    if not os.path.isfile(pretrained_model):
	        # Fail with an explicit message instead of letting the checkpoint
	        # loader trip over a missing file with a less helpful error.
	        raise FileNotFoundError(
	            f"checkpoint {pretrained_model!r} is still missing after running "
	            f"./startup.sh (exit status {startup_status})"
	        )

	# Restore the weights into the synthesizer; no optimizer is passed.
	_ = utils.load_checkpoint(pretrained_model, net_g, None)

	# Callback used by the Gradio interface to turn text into speech
	def generate_speech(text):
	    """Return whatever ``text_to_speech`` produces for ``text``.

	    Args:
	        text: The text entered by the user.

	    Returns:
	        The value returned by ``text_to_speech(text)``, passed through as-is.
	    """
	    return text_to_speech(text)

	# Define the input and output components for the text-to-speech model.
	# The top-level component classes are used because the ``gr.inputs`` /
	# ``gr.outputs`` namespaces are deprecated in Gradio 3 and removed in
	# Gradio 4, where referencing them raises AttributeError at startup.
	text_input = gr.Textbox(label='Enter Text Here')
	# type='filepath' matches generate_speech, which returns a path to a WAV file.
	output_audio = gr.Audio(label='Speech', type='filepath')

	# Define the user interface using Gradio
	ui = gr.Interface(
	    fn=generate_speech,
	    inputs=text_input,
	    outputs=output_audio,
	    title='Text-to-Speech Demo',
	    description='Generate speech from text using a text-to-speech model.'
	)


	# Run the interface
	ui.launch(share=True)