# Spaces: Running
import gradio as gr | |
import librosa | |
from asr import transcribe | |
from tts import synthesize, TTS_EXAMPLES | |
# Per-task language tables, keyed by task name ("asr", "tts", "lid").
# Each table maps the first field of a line in data/<task>/all_langs.tsv
# (the language code) to the remainder of that line (the language name).
ALL_LANGUAGES = {}

for task in ["asr", "tts", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    # Decode explicitly as UTF-8 so language names are read the same way
    # regardless of the platform's locale default.
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            # Remove the trailing newline (it would otherwise become part of
            # every language name and of every UI label built from it) and
            # skip blank lines, which cannot be unpacked into two fields.
            line = line.strip()
            if not line:
                continue
            # Split on the first run of whitespace so that both tab- and
            # space-separated files parse; the name keeps its inner spaces.
            # A line with a code but no name still raises ValueError.
            iso, name = line.split(None, 1)
            ALL_LANGUAGES[task][iso] = name
def identify(microphone, file_upload):
    """Return language-identification scores for one audio clip.

    Args:
        microphone: Path of the microphone recording, or ``None``.
        file_upload: Path of the uploaded audio file, or ``None``.

    Returns:
        An error string when neither input is provided. Otherwise a dict
        mapping ``"<code>: <language name>"`` labels to scores; a code that
        has no entry in ``ALL_LANGUAGES["lid"]`` is labelled by the code
        alone.

    NOTE: the scores are a hard-coded placeholder; the decoded audio does
    not influence them.
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # When both inputs are supplied the microphone recording wins and the
    # upload is ignored. The returned label dict has no slot for a warning
    # message, so none is built here.
    audio_fp = microphone if microphone is not None else file_upload

    # Decode the clip as mono audio at the LID sampling rate. The samples are
    # not consumed by the placeholder scores below, but an unreadable file
    # still fails here rather than being silently accepted.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}

    lid_names = ALL_LANGUAGES["lid"]
    labelled_scores = {}
    for code, score in raw_output.items():
        # Tolerate codes missing from the table and strip stray whitespace
        # (e.g. a trailing newline) so the displayed label stays clean.
        language_name = lid_names.get(code, "").strip()
        label = f"{code}: {language_name}" if language_name else code
        labelled_scores[label] = score
    return labelled_scores
# Top-level Blocks container for the demo.
demo = gr.Blocks()

# Dropdown entries in "<code>: <name>" form, one per ASR language.
asr_language_choices = [
    f"{code}: {label}" for code, label in ALL_LANGUAGES["asr"].items()
]

# Speech-to-text tab: a microphone recording, an uploaded file (both passed
# as file paths) and the selected language are handed to `transcribe`; the
# result is displayed as plain text.
mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            choices=asr_language_choices,
            label="Language",
            value="tuk-script_latin: Turkmen",
        ),
    ],
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)
# Dropdown entries in "<code>: <name>" form, one per TTS language.
tts_language_choices = [
    f"{code}: {label}" for code, label in ALL_LANGUAGES["tts"].items()
]

# Text-to-speech tab: the input text, the selected language and a speed
# factor (0.1-4.0, default 1.0) are handed to `synthesize`; its outputs are
# shown as an audio player and a text box.
mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            choices=tts_language_choices,
            label="Language",
            value="tuk-script_latin: Turkmen",
        ),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)
# Language-identification tab: a microphone recording and an uploaded file
# (both passed as file paths) are handed to `identify`; the result is shown
# in a Label component limited to the ten top classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    # Fixed user-facing typo: the description previously read "Identity".
    description="Identify the language of audio!",
    allow_flagging="never",
)
# Mount the three interfaces as tabs inside the Blocks container; the tab
# names are paired with the interfaces by position.
with demo:
    gr.TabbedInterface(
        interface_list=[mms_transcribe, mms_synthesize, mms_identify],
        tab_names=["Speech-to-text", "Text-to-speech", "Language Identification"],
    )

# Start serving the app.
demo.launch()