import os
import subprocess
import tempfile

import gradio as gr
import scipy.io.wavfile as wavfile
import torch

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols


def get_text(text, hps):
    """Convert raw text into the tensor of symbol IDs the model expects."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank tokens between symbols, matching the
        # add_blank setting used at training time.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def text_to_speech(text):
    """Synthesize speech for `text` and return the path to a temporary WAV file."""
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            noise_scale=0.667,
            noise_scale_w=0.8,
            length_scale=1.2,
        )[0][0, 0].data.float().numpy()

    # Write the waveform to a temporary file; delete=False keeps the
    # file on disk so Gradio can serve it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wavfile.write(f.name, hps.data.sampling_rate, audio)
        audio_file = f.name

    # Return the audio file path
    return audio_file


# Load the trained model
hps = utils.get_hparams_from_file("./configs/jp_base.json")
hps.model_dir = "./logs/jp_base"
pretrained_model = f"{hps.model_dir}/model.pth"

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
_ = net_g.eval()

if os.path.isfile(pretrained_model):
    _ = utils.load_checkpoint(pretrained_model, net_g, None)
else:
    # Checkpoint is missing: run the startup script (expected to produce
    # the checkpoint file), then load it.
    subprocess.call("./startup.sh", shell=True)
    _ = utils.load_checkpoint(pretrained_model, net_g, None)


def generate_speech(text):
    """Gradio callback: synthesize `text` and return the WAV file path."""
    return text_to_speech(text)


# Define the interface for the text-to-speech model. gr.Textbox and
# gr.Audio replace the gr.inputs/gr.outputs namespaces, which were
# deprecated in Gradio 3 and removed in Gradio 4.
text_input = gr.Textbox(label="Enter Text Here")
output_audio = gr.Audio(label="Speech", type="filepath")

ui = gr.Interface(
    fn=generate_speech,
    inputs=text_input,
    outputs=output_audio,
    title="Text-to-Speech Demo",
    description="Generate speech from text using a text-to-speech model.",
)

# Run the interface with a public share link
ui.launch(share=True)