import functools
import subprocess
from pathlib import Path

from espnet2.bin.tts_inference import Text2Speech
import gradio as gr
import soundfile as sf

# Fetch the pretrained model repository that generate_audio() reads from.
# Argument lists are used instead of shell strings so nothing is parsed by a
# shell, and stdout is discarded to mirror the original check_output() calls,
# whose captured output was never used.
subprocess.run(["git", "lfs", "install"], check=True, stdout=subprocess.DEVNULL)

# `git clone` fails when the target directory already exists, which made every
# restart in the same working directory crash; only clone when it is missing.
if not Path("lingala_vits_tts").is_dir():
    subprocess.run(
        ["git", "clone", "https://huggingface.co/DigitalUmuganda/lingala_vits_tts"],
        check=True,
        stdout=subprocess.DEVNULL,
    )

@functools.lru_cache(maxsize=1)
def _load_text2speech():
    """Build the Text2Speech model once and reuse it for later requests.

    The model is still created lazily on the first call (as before), but the
    instance is cached so subsequent requests skip re-reading the config and
    checkpoint from disk.
    """
    return Text2Speech(
        train_config="lingala_vits_tts/config.yaml",
        model_file="lingala_vits_tts/train.total_count.best.pth",
    )


def generate_audio(text):
    """Synthesize speech for *text* and return the path of the written WAV file.

    Args:
        text: Input text handed to the Text2Speech model.

    Returns:
        The string ``"outfile.wav"``. The file is overwritten on every call.
    """
    text2speech = _load_text2speech()
    # The model output is a mapping; only its "wav" entry is used here.
    # NOTE(review): `.numpy()` presumably requires a CPU tensor - confirm if
    # the model is ever moved to another device.
    wav = text2speech(text)["wav"]
    # `text2speech.fs` is passed as the sample rate; samples are stored as
    # 16-bit PCM.
    sf.write("outfile.wav", wav.numpy(), text2speech.fs, "PCM_16")
    return "outfile.wav"


# Gradio front end: a single text box feeds generate_audio(), and the returned
# file path is rendered as an audio output.
text_input = gr.inputs.Textbox(label="Entrez le text")
audio_output = gr.outputs.Audio(type="filepath", label="Output")

interface_options = {
    "fn": generate_audio,
    "inputs": [text_input],
    "outputs": audio_output,
    "title": "Lingala TTS",
    "description": "Logiciel capable de creer de son a partir de texte en Lingala",
    "layout": "vertical",
    # Flagging is switched off; the options are still passed through so the
    # configuration matches what was given originally.
    "allow_flagging": False,
    "flagging_options": ['erreur', 'mauvaise-qualite', 'mauvaise-prononciation'],
}
iface = gr.Interface(**interface_options)

# Start the web app without requesting a share link.
iface.launch(share=False)