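# Gradio demo: Walloon text-to-speech based on VITS, driven by IPA phoneme input.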
import gradio as gr
import os
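# Build the monotonic alignment search Cython extension required by VITS.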
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
#from text.symbols import symbols
from text.symbols_pho import symbols_pho
#from text import text_to_sequence
from scipy.io.wavfile import write
from text import cleaners
symbols = symbols_pho # change this later
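# Lookup tables between phoneme symbols and integer IDs.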
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
    clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
    return sequence
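# Run the text through each named cleaner function from text.cleaners.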
def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
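# Convert input text to a LongTensor of symbol IDs, optionally interspersed with blank tokens.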
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
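# Instantiate the VITS synthesizer and load checkpoint weights for inference.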
def load_model(model_path, hps):
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    _ = net_g.eval()
    _ = utils.load_checkpoint(model_path, net_g, None)
    return net_g
hps = utils.get_hparams_from_file("configs/vctk_base.json")
# Define a dictionary to store the model paths for each tab
model_paths = {
    "Phonemes_finetuned": "wallon_pho/G_277000.pth"
}
# Load the initial model
net_g = load_model(model_paths["Phonemes_finetuned"], hps)
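# Synthesize speech: reload the checkpoint mapped to the selected tab, then run VITS inference.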
def tts(text, speaker_id, tab_name):
    global net_g
    net_g = load_model(model_paths[tab_name], hps)
    sid = torch.LongTensor([speaker_id])  # speaker identity
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                            length_scale=1)[0][0, 0].data.float().numpy()
    return "Success", (hps.data.sampling_rate, audio)
def create_tab(tab_name):
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")
        tts_input1 = gr.TextArea(label="Text in Walloon as IPA phonemes", value="")
        tts_input2 = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        tts_submit = gr.Button("Generate", variant="primary")
        tts_output1 = gr.Textbox(label="Message")
        tts_output2 = gr.Audio(label="Output")
        tts_submit.click(lambda text, speaker_id: tts(text, speaker_id, tab_name),
                         [tts_input1, tts_input2], [tts_output1, tts_output2])
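# Assemble the Gradio Blocks interface.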
app = gr.Blocks()
with app:
    gr.Markdown(
        """
# First Text-to-Speech (TTS) for Walloon
Based on VITS (https://github.com/jaywalnut310/vits).
Write the text in phonemes or graphemes depending on the model.
For faster inference, it is recommended to use short sentences.
The quality of the results varies between the male and female voices due to the limited data available for the female voice in this language.
For better results with the male voice, use the models fully trained on Walloon.
For better results with the female voice, use the models trained on French and fine-tuned on Walloon.
To try the version trained on graphemes, follow the link below:
https://huggingface.co/spaces/Pipe1213/VITS_Walloon_Graphemes
## Hint: Some sample texts are available at the bottom of the page.
"""
    )
    with gr.Tabs():
        create_tab("Phonemes_finetuned")
        create_tab("Phonemes(not working yet!!!)")  # no entry in model_paths yet, so generating from this tab raises a KeyError
    gr.Markdown(
        """
### Examples
| Input Text | Speaker |
|------------|---------|
| li biːç ɛ l sɔlja ɛstẽ ki s maʁɡajẽ pɔ sawɛ kiː ski , dɛ døː , ɛstøː l py fwaʁ . m ɛ̃ s koː la , la k i vɛjɛ õ tsminɔː k aʁivef pjim pjam , d ɛ̃ õ bja nuː tsoː paltɔ . | Female |
| ɛl m õ ʁɛspõdu , duvẽ ɔːʁẽ n pøː d õ tsapja . | Male |
| dɔ koː , dz a dvu tswɛzi ɛn oːt mɛstiː , dz ast apʁ ɛ̃ a mõne dɛz avjõ . | Female |
"""
    )
app.launch()