Spaces:

quyanh
/

vits-japanese

Runtime error

App Files Files Community

vits-japanese / app.py

quyanh

describe model

6265d22 over 1 year ago

raw

history blame

2.24 kB

	import base64
	import torch
	import io
	import tempfile
	import scipy.io.wavfile as wavfile
	import commons
	import utils

	import gradio as gr
	import numpy as np

	from PIL import Image
	from models import SynthesizerTrn
	from text.symbols import symbols
	from text import text_to_sequence

	def get_text(text, hps):
	    """Turn ``text`` into a ``torch.LongTensor`` for the synthesizer.

	    The text is passed to ``text_to_sequence`` together with the cleaners
	    listed in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is
	    truthy, the resulting sequence is additionally run through
	    ``commons.intersperse`` with ``0`` as the interspersed item.

	    Args:
	        text: Input text string.
	        hps: Hyper-parameter object exposing ``data.text_cleaners`` and
	            ``data.add_blank``.

	    Returns:
	        A ``torch.LongTensor`` built from the (optionally interspersed)
	        sequence.
	    """
	    sequence = text_to_sequence(text, hps.data.text_cleaners)
	    if hps.data.add_blank:
	        sequence = commons.intersperse(sequence, 0)
	    return torch.LongTensor(sequence)


	def text_to_speech(text, *, noise_scale=.667, noise_scale_w=0.8, length_scale=1.2):
	    """Synthesize ``text`` with the module-level model and save it as a WAV file.

	    The text is encoded with ``get_text`` using the module-level ``hps``,
	    run through ``net_g.infer`` with gradients disabled, and the resulting
	    array is written to a newly created temporary ``.wav`` file at
	    ``hps.data.sampling_rate``.

	    Args:
	        text: Input text string.
	        noise_scale: Forwarded to ``net_g.infer`` as ``noise_scale``.
	        noise_scale_w: Forwarded to ``net_g.infer`` as ``noise_scale_w``.
	        length_scale: Forwarded to ``net_g.infer`` as ``length_scale``.

	    Returns:
	        Path of the temporary WAV file. The file is created with
	        ``delete=False``, so it is not removed automatically; cleanup is
	        left to the caller.
	    """
	    stn_tst = get_text(text, hps)
	    with torch.no_grad():
	        # Add a leading dimension of size 1 and pass the sequence length
	        # alongside it, as ``infer`` is called with both.
	        x_tst = stn_tst.unsqueeze(0)
	        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
	        inferred = net_g.infer(
	            x_tst,
	            x_tst_lengths,
	            noise_scale=noise_scale,
	            noise_scale_w=noise_scale_w,
	            length_scale=length_scale,
	        )
	        # Take the first element of the result and index [0, 0];
	        # presumably this selects the single waveform - confirm against
	        # SynthesizerTrn.infer.
	        audio = inferred[0][0, 0].data.float().numpy()

	    # Only reserve a unique file name here and close the handle before
	    # writing: re-opening a NamedTemporaryFile by name while it is still
	    # open is not portable (it fails on Windows).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        audio_file = tmp.name
	    wavfile.write(audio_file, hps.data.sampling_rate, audio)
	    return audio_file

	# Read the hyper-parameters and derive the checkpoint location from them.
	hps = utils.get_hparams_from_file("./configs/jp_base.json")
	hps.model_dir = './logs/jp_base'
	pretrained_model = f'{hps.model_dir}/model.pth'

	# Positional arguments for the synthesizer, computed from the symbol table
	# and the hyper-parameters. NOTE(review): the names below are descriptive
	# guesses - confirm their meaning against SynthesizerTrn's signature.
	symbol_count = len(symbols)
	half_filter_plus_one = hps.data.filter_length // 2 + 1
	segment_in_hops = hps.train.segment_size // hps.data.hop_length

	net_g = SynthesizerTrn(
	    symbol_count,
	    half_filter_plus_one,
	    segment_in_hops,
	    **hps.model,
	)
	# Switch to eval mode first, then load the checkpoint into the network
	# (None is passed in the third position of load_checkpoint).
	_ = net_g.eval()
	_ = utils.load_checkpoint(pretrained_model, net_g, None)

	# Callback handed to the Gradio interface as ``fn``.
	def generate_speech(text):
	    """Generate speech for ``text`` by delegating to ``text_to_speech``.

	    Args:
	        text: Input text string.

	    Returns:
	        Whatever ``text_to_speech`` returns for ``text`` (the path of the
	        audio file it wrote), passed through unchanged.
	    """
	    return text_to_speech(text)

	# Input and output components for the demo. The top-level ``gr.Textbox`` and
	# ``gr.Audio`` components are used instead of the legacy ``gr.inputs`` /
	# ``gr.outputs`` namespaces, which are deprecated in Gradio 3.x and removed
	# in Gradio 4.
	text_input = gr.Textbox(label='Enter Text Here')
	# ``type='filepath'`` matches ``generate_speech``, which returns a file path.
	output_audio = gr.Audio(label='Speech', type='filepath')

	# Wire the callback and the components into a single-function interface.
	ui = gr.Interface(
	    fn=generate_speech,
	    inputs=text_input,
	    outputs=output_audio,
	    title='Text-to-Speech for Japanese Demo',
	    description='Generate speech from japanese text using a text-to-speech model.',
	)


	# Start the web app.
	ui.launch()