"""Gradio demo exposing speech-to-text, text-to-speech and language ID tabs."""

import gradio as gr
import librosa

from asr import transcribe
from tts import synthesize, TTS_EXAMPLES


def _load_languages(task):
    """Read ``data/<task>/all_langs.tsv`` into an ``{iso_code: name}`` dict.

    Each non-blank line is split once into an ISO code and a display name.
    The name is stripped so the trailing newline does not leak into dropdown
    choices or result labels.
    """
    languages = {}
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # NOTE(review): the file is named .tsv but fields are split on the
            # first space -- confirm the delimiter used by the data files.
            iso, name = line.split(" ", 1)
            languages[iso] = name.strip()
    return languages


# Maps task name -> {iso_code: language_name}.
ALL_LANGUAGES = {task: _load_languages(task) for task in ["asr", "tts", "lid"]}


def identify(microphone, file_upload):
    """Return language-identification scores for the supplied audio.

    Args:
        microphone: File path of the microphone recording, or ``None``.
        file_upload: File path of the uploaded audio, or ``None``.

    Returns:
        An error string when neither input is provided; otherwise a dict
        mapping ``"<iso>: <language name>"`` labels to scores. When both
        inputs are given, the microphone recording is used and the upload is
        ignored.
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # The microphone recording takes precedence over an uploaded file.
    audio_fp = microphone if microphone is not None else file_upload

    # Decode up front so an unreadable file fails here. The samples are not
    # consumed yet because the scores below are hard-coded placeholders.
    librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)

    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}
    lid_names = ALL_LANGUAGES["lid"]
    return {f"{iso}: {lid_names[iso]}": score for iso, score in raw_output.items()}


demo = gr.Blocks()

mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(
            [f"{k}: {v}" for k, v in ALL_LANGUAGES["asr"].items()],
            label="Language",
            value="eng: English",
        ),
    ],
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)

mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(
            [f"{k}: {v}" for k, v in ALL_LANGUAGES["tts"].items()],
            label="Language",
            value="eng: English",
        ),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Text(label="Filtered text after removing OOVs"),
    ],
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)

mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    description="Identify the language of audio!",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mms_transcribe, mms_synthesize, mms_identify],
        ["Speech-to-text", "Text-to-speech", "Language Identification"],
    )

# Launched at import time, as in the original script.
demo.launch()