Spaces:

mms-meta
/

MMS

Running on A10G

File size: 4,889 Bytes

import gradio as gr
import librosa
from asr import transcribe, ASR_EXAMPLES, ASR_LANGUAGES, ASR_NOTE
from tts import synthesize, TTS_EXAMPLES, TTS_LANGUAGES
from lid import identify, LID_EXAMPLES


# NOTE: no placeholder `demo = gr.Blocks()` is created here. The app object is
# bound by the `with gr.Blocks() as demo:` statement near the bottom of this
# file, and nothing before that statement reads `demo`.

# --- Speech-to-text tab ----------------------------------------------------
# The selector and both audio widgets are bound to module-level names because
# the `.change(...)` handler registered further down in this file uses them as
# its inputs/outputs.
mms_select_source_trans = gr.Radio(
    choices=["Record from Mic", "Upload audio"],
    value="Record from Mic",
    label="Audio input",
)
mms_mic_source_trans = gr.Audio(
    label="Use mic",
    type="filepath",
    source="microphone",
)
mms_upload_source_trans = gr.Audio(
    label="Upload file",
    type="filepath",
    source="upload",
    visible=False,  # starts hidden; the selector's default is "Record from Mic"
)

# One "<key> (<value>)" entry per item of ASR_LANGUAGES.
_asr_language_choices = [f"{key} ({text})" for key, text in ASR_LANGUAGES.items()]
_asr_language_dropdown = gr.Dropdown(
    choices=_asr_language_choices,
    # NOTE(review): this default is not written in the "<key> (<value>)" form
    # used for the choices above -- confirm it is what `transcribe` expects.
    value="eng English",
    label="Language",
)

# `transcribe` receives the four inputs below, in this order.
mms_transcribe = gr.Interface(
    title="Speech-to-text",
    description="Transcribe audio from a microphone or input file in your desired language.",
    article=ASR_NOTE,
    fn=transcribe,
    inputs=[
        mms_select_source_trans,
        mms_mic_source_trans,
        mms_upload_source_trans,
        _asr_language_dropdown,
        # Disabled input, kept for reference:
        # gr.Checkbox(label="Use Language Model (if available)", default=True),
    ],
    outputs="text",
    examples=ASR_EXAMPLES,
    allow_flagging="never",
)

# --- Text-to-speech tab ----------------------------------------------------
# Input widgets, created in the order `synthesize` receives their values.
_tts_text_input = gr.Text(label="Input text")
_tts_language_dropdown = gr.Dropdown(
    # One "<key> (<value>)" entry per item of TTS_LANGUAGES.
    choices=[f"{key} ({text})" for key, text in TTS_LANGUAGES.items()],
    # NOTE(review): this default is not written in the "<key> (<value>)" form
    # used for the choices -- confirm it is what `synthesize` expects.
    value="eng English",
    label="Language",
)
_tts_speed_slider = gr.Slider(
    label="Speed",
    value=1.0,
    minimum=0.1,
    maximum=4.0,
    step=0.1,
)

# Output widgets: the generated audio first, then a text box.
_tts_audio_output = gr.Audio(type="numpy", label="Generated Audio")
_tts_filtered_text_output = gr.Text(label="Filtered text after removing OOVs")

mms_synthesize = gr.Interface(
    title="Text-to-speech",
    description="Generate audio in your desired language from input text.",
    fn=synthesize,
    inputs=[_tts_text_input, _tts_language_dropdown, _tts_speed_slider],
    outputs=[_tts_audio_output, _tts_filtered_text_output],
    examples=TTS_EXAMPLES,
    allow_flagging="never",
)

# --- Language identification tab --------------------------------------------
# The selector and both audio widgets are bound to module-level names because
# the `.change(...)` handler registered further down in this file uses them as
# its inputs/outputs.
mms_select_source_iden = gr.Radio(
    ["Record from Mic", "Upload audio"],
    label="Audio input",
    value="Record from Mic",
)
mms_mic_source_iden = gr.Audio(source="microphone", type="filepath", label="Use mic")
mms_upload_source_iden = gr.Audio(
    # Starts hidden; the selector's default is "Record from Mic".
    source="upload", type="filepath", label="Upload file", visible=False
)

# `identify` receives the three inputs below, in this order; its result is
# shown in a label widget limited to the top 10 classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        mms_select_source_iden,
        mms_mic_source_iden,
        mms_upload_source_iden,
    ],
    outputs=gr.Label(num_top_classes=10),
    examples=LID_EXAMPLES,
    title="Language Identification",
    description="Identify the language of input audio.",
    allow_flagging="never",
)

# Each tab is declared as a (label, interface) pair so a label can never drift
# out of step with its interface; the pairs are split into the two parallel
# lists that `gr.TabbedInterface` takes.
_tabs = (
    ("Speech-to-text", mms_transcribe),
    ("Text-to-speech", mms_synthesize),
    ("Language Identification", mms_identify),
)
tabbed_interface = gr.TabbedInterface(
    [interface for _, interface in _tabs],
    [label for label, _ in _tabs],
)

def _toggle_audio_source(choice):
    """Return visibility updates for a (microphone, upload) pair of audio widgets.

    Args:
        choice: The currently selected value of an "Audio input" radio.

    Returns:
        A list of two ``gr.update`` objects in (microphone, upload) order. Each
        widget is made visible only when its own radio option is selected.
    """
    return [
        gr.update(visible=choice == "Record from Mic"),
        gr.update(visible=choice == "Upload audio"),
    ]


# Page layout: header text, the tabbed demos, the source-selector wiring, and
# a footer.
with gr.Blocks() as demo:
    gr.Markdown(
        "<p align='center' style='font-size: 20px;'>MMS: Scaling Speech Technology to 1000+ languages demo. See our <a href='https://ai.facebook.com/blog/multilingual-model-speech-recognition/'>blog post</a> and <a href='https://arxiv.org/abs/2305.13516'>paper</a>.</p>"
    )
    gr.HTML(
        """<center>Click on the appropriate tab to explore Speech-to-text (ASR), Text-to-speech (TTS) and Language identification (LID) demos.   </center>"""
    )
    gr.HTML(
        """<center>You can also finetune MMS models on your data using the recipes provided here - <a href='https://huggingface.co/blog/mms_adapters'>ASR</a> <a href='https://github.com/ylacombe/finetune-hf-vits'>TTS</a>  </center>"""
    )
    gr.HTML(
        """<center><a href="https://huggingface.co/spaces/facebook/MMS?duplicate=true"  style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
    )

    tabbed_interface.render()

    # Both audio tabs share the same behaviour: changing the "Audio input"
    # radio swaps which of the two audio widgets is shown. The handlers are
    # registered with queue=False, as in each original `.change(...)` call.
    for selector, mic_widget, upload_widget in (
        (mms_select_source_trans, mms_mic_source_trans, mms_upload_source_trans),
        (mms_select_source_iden, mms_mic_source_iden, mms_upload_source_iden),
    ):
        selector.change(
            _toggle_audio_source,
            inputs=[selector],
            outputs=[mic_widget, upload_widget],
            queue=False,
        )

    gr.HTML(
        """
            <div class="footer" style="text-align:center">
                <p>
                    Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
                </p>
            </div>
           """
    )

# Configure the queue with concurrency_count=3, then start the app. `queue()`
# hands back the same Blocks object, so the two calls are chained.
demo.queue(concurrency_count=3).launch()