"""Gradio demo serving several VITS text-to-speech checkpoints for Walloon."""

import json
import math
import os

# Build the monotonic_align extension in place before the project modules are
# imported below. The exit status of the command is not checked.
os.system('cd monotonic_align && python setup.py build_ext --inplace && cd ..')

import gradio as gr
import torch
from scipy.io.wavfile import write
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text import cleaners
from text.symbols import symbols as symbols_default  # import symbols graphemes
from fr_wa_graphemes.symbols import symbols as symbols_ft  # import symbols finetuned
from wa_ft_graphemes_v2.symbols import symbols as symbols_ft_v2
from wa_graphemes_v2.symbols import symbols as symbols_v2

# Maps the model name (also used as the tab title) to its generator
# checkpoint and the symbol table used to encode input text for it.
model_configs = {
    "Graphemes_finetuned": {
        "path": "fr_wa_graphemes/G_80000.pth",
        "symbols": symbols_ft,
    },
    "Graphemes": {
        "path": "wa_graphemes/G_258000.pth",
        "symbols": symbols_default,
    },
    "Graphemes_v2": {
        "path": "wa_graphemes_v2/G_112000.pth",
        "symbols": symbols_v2,
    },
    "Graphemes_finetuned_v2": {
        "path": "wa_ft_graphemes_v2/G_207000.pth",
        "symbols": symbols_ft_v2,
    },
}

# Global variables: state of the currently loaded model.
net_g = None
symbols = []
_symbol_to_id = {}
_id_to_symbol = {}
# Key of `model_configs` whose checkpoint is currently held in `net_g`.
_loaded_model_name = None

INTRO_MARKDOWN = """
# First Text to Speech (TTS) for Walloon
Based on VITS (https://github.com/jaywalnut310/vits).

## How to use:
Write the text in graphemes. For faster inference, it is recommended to use short sentences.

The quality of the results varies between male and female voice due to the limited data for female voice on this language.
For better results with male voice, use the models fully trained on Walloon.
For better results with female voice, use the models trained on french and fine-tuned on Walloon.

To try the version trained in phonemes follow the link below:
https://huggingface.co/spaces/Pipe1213/VITS_Walloon_Phonemes

## Hint: Some sample texts are available at the bottom of the web site.
"""

EXAMPLES_MARKDOWN = """
## Examples

| Input Text | Speaker |
|------------|---------|
| Portant, c' est l' seu ki n' doereut nén fé rire di lu, a mi idêye. | Female |
| Li bijhe et l’ solea estént ki s’ margayént po sawè kî çki, des deus, esteut l’ pus foirt. Mins ç’ côp la, la k’ i veyèt on tchminåd k' arivéve pyim piam, dins on bea noû tchôd paltot. | Male |
| Ci fourit co l' bedot les cåzes ca, a on moumint, li Ptit Prince mi dmanda yåk, come onk k' est so dotance, tot d' on côp | Female |
| Li Ptit Prince, da Antoenne di Sint-Spuri, ratourné e walon pa Lorint Enchel | Female |
"""


def text_to_sequence(text, cleaner_names):
    """Clean `text` and convert each resulting character to its symbol id.

    Uses the symbol table of the currently loaded model (`_symbol_to_id`).

    Raises:
        KeyError: if a cleaned character is not in the active symbol table.
    """
    return [_symbol_to_id[symbol] for symbol in _clean_text(text, cleaner_names)]


def _clean_text(text, cleaner_names):
    """Apply the named functions from `text.cleaners` to `text`, in order.

    Raises:
        Exception: if `cleaners` has no attribute called `name`.
    """
    for name in cleaner_names:
        # A default is required here: without it getattr raises
        # AttributeError and the "Unknown cleaner" error is never reached.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def get_text(text, hps):
    """Encode `text` as a LongTensor of symbol ids for the loaded model.

    When `hps.data.add_blank` is set, the sequence is passed through
    `commons.intersperse` with id 0 before being converted to a tensor.
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


def load_model_and_symbols(tab_name):
    """Make the model registered under `tab_name` the active one.

    Rebuilds the symbol lookup tables and loads the checkpoint into a new
    `SynthesizerTrn`. Nothing is done when that model is already loaded, so
    repeated requests on one tab do not re-read the checkpoint from disk.

    Raises:
        KeyError: if `tab_name` is not a key of `model_configs`.
    """
    global net_g, symbols, _symbol_to_id, _id_to_symbol, _loaded_model_name

    if tab_name == _loaded_model_name and net_g is not None:
        return

    model_config = model_configs[tab_name]
    new_symbols = model_config["symbols"]

    model = SynthesizerTrn(
        len(new_symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    model.eval()
    utils.load_checkpoint(model_config["path"], model, None)

    # Publish the new state only after loading succeeded, so a failed load
    # never leaves a symbol table paired with the wrong network.
    symbols = new_symbols
    _symbol_to_id = {s: i for i, s in enumerate(new_symbols)}
    _id_to_symbol = {i: s for i, s in enumerate(new_symbols)}
    net_g = model
    _loaded_model_name = tab_name


def tts(text, speaker_id, tab_name):
    """Synthesize `text` with the model registered under `tab_name`.

    Args:
        text: input text to synthesize.
        speaker_id: integer speaker index passed to the model as `sid`.
        tab_name: key of `model_configs` selecting the model.

    Returns:
        A `("Success", (sampling_rate, audio))` tuple, where `audio` is a
        NumPy float array.
    """
    load_model_and_symbols(tab_name)
    sid = torch.LongTensor([speaker_id])  # speaker identity
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        outputs = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1)
        audio = outputs[0][0, 0].data.float().numpy()
    return "Success", (hps.data.sampling_rate, audio)


def create_tab(tab_name):
    """Add a tab with text/speaker inputs that runs `tts` for `tab_name`."""
    with gr.TabItem(tab_name):
        gr.Markdown(f"### {tab_name} TTS Model")
        tts_input1 = gr.TextArea(label="Text in Walloon", value="")
        # type="index" makes the handler receive the position of the choice
        # (Male -> 0, Female -> 1) rather than its label.
        tts_input2 = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        tts_submit = gr.Button("Generate", variant="primary")
        tts_output1 = gr.Textbox(label="Message")
        tts_output2 = gr.Audio(label="Output")
        tts_submit.click(
            lambda text, speaker_id: tts(text, speaker_id, tab_name),
            [tts_input1, tts_input2],
            [tts_output1, tts_output2])


def tts_comparison(text, speaker_id):
    """Synthesize `text` with all four models.

    Returns:
        The `(sampling_rate, audio)` pairs in the order Graphemes,
        Graphemes_finetuned, Graphemes_v2, Graphemes_finetuned_v2.
    """
    model_order = (
        "Graphemes",
        "Graphemes_finetuned",
        "Graphemes_v2",
        "Graphemes_finetuned_v2",
    )
    return tuple(tts(text, speaker_id, name)[1] for name in model_order)


def create_comparison_tab():
    """Add the tab that plays the same text through every model."""
    with gr.TabItem("Compare Models"):
        gr.Markdown("### Compare TTS Models")
        tts_input = gr.TextArea(label="Text in Walloon", value="")
        tts_speaker = gr.Dropdown(label="Speaker", choices=["Male", "Female"], type="index", value="Male")
        tts_submit = gr.Button("Generate", variant="primary")
        out_graphemes = gr.Audio(label="Graphemes Output")
        out_v2 = gr.Audio(label="Graphemes v2 Output")
        out_finetuned = gr.Audio(label="Graphemes Finetuned Output")
        out_finetuned_v2 = gr.Audio(label="Graphemes Finetuned v2 Output")
        # The outputs must follow the order returned by tts_comparison, which
        # differs from the on-screen order above; otherwise the finetuned and
        # v2 clips end up under each other's label.
        tts_submit.click(
            tts_comparison,
            [tts_input, tts_speaker],
            [out_graphemes, out_finetuned, out_v2, out_finetuned_v2])


hps = utils.get_hparams_from_file("configs/vctk_base.json")

app = gr.Blocks()
with app:
    gr.Markdown(INTRO_MARKDOWN)
    with gr.Tabs():
        create_tab("Graphemes")
        create_tab("Graphemes_finetuned")
        create_tab("Graphemes_v2")
        create_tab("Graphemes_finetuned_v2")
        create_comparison_tab()
    gr.Markdown(EXAMPLES_MARKDOWN)

app.launch()