import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import SeamlessM4Tv2Model, AutoProcessor

from lang_list import (
    ASR_TARGET_LANGUAGE_NAMES,
    LANGUAGE_NAME_TO_CODE,
    S2ST_TARGET_LANGUAGE_NAMES,
    S2TT_TARGET_LANGUAGE_NAMES,
    T2ST_TARGET_LANGUAGE_NAMES,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)

AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

# Pick the device and precision once: half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained(
    "facebook/seamless-m4t-v2-large", torch_dtype=dtype
).to(device)


def preprocess_audio(input_audio: str) -> None:
    """Resample the file to 16 kHz and truncate it to MAX_INPUT_AUDIO_LENGTH seconds, in place."""
    arr, org_sr = torchaudio.load(input_audio)
    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=int(AUDIO_SAMPLE_RATE))
    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if new_arr.shape[1] > max_length:
        new_arr = new_arr[:, :max_length]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))


def run_s2st(
    input_audio: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    preprocess_audio(input_audio)
    # The source-language code is unused for speech input (the model infers the
    # spoken language); it is kept so the signature matches the text tasks.
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    arr, org_sr = torchaudio.load(input_audio)
    audio_inputs = processor(
        audios=arr, return_tensors="pt", sampling_rate=model.config.sampling_rate
    ).to(device=device, dtype=dtype)
    output = model.generate(
        **audio_inputs,
        return_intermediate_token_ids=True,
        tgt_lang=target_language_code,
    )
    # With return_intermediate_token_ids=True, generate() returns the output
    # waveform first and the intermediate text token ids at index 2.
    # Cast to float32 on CPU so the Gradio Audio component gets a plain array.
    audio_array_from_audio = output[0].cpu().float().numpy().squeeze()
    text_tokens = output[2]
    translated_text_from_text = processor.decode(text_tokens.tolist()[0], skip_special_tokens=True)
    return (int(AUDIO_SAMPLE_RATE), audio_array_from_audio), translated_text_from_text


description = """
# Direct Speech-to-Speech Translation
This demo uses SeamlessM4T V2 to translate speech in one language directly into speech in another.
The model used here is [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large).
SeamlessM4T V2 is a unified model that enables multiple tasks, such as Speech-to-Speech translation (S2ST), Speech-to-Text translation (S2TT), Text-to-Speech translation (T2ST), and more, without relying on multiple separate models.
"""
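# A minimal sketch of calling the pipeline without the UI, assuming the sample
# asset shipped with the demo exists (the path is illustrative):
#
#     (sample_rate, waveform), text = run_s2st("assets/sample_input.mp3", "English", "French")
#     print(sample_rate)  # 16000
#     print(text)         # the translated transcript; `waveform` is a 1-D np.ndarray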
""" with gr.Blocks() as demo_s2st: gr.Markdown(description) with gr.Row(): with gr.Column(): with gr.Group(): input_audio = gr.Audio(label="Input speech", type="filepath") source_language = gr.Dropdown( label="Source language", choices=ASR_TARGET_LANGUAGE_NAMES, value="English", ) target_language = gr.Dropdown( label="Target language", choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE, ) btn = gr.Button("Translate") with gr.Column(): with gr.Group(): output_audio = gr.Audio( label="Translated speech", autoplay=False, streaming=False, type="numpy", ) output_text = gr.Textbox(label="Translated text") gr.Examples( examples=[ ["assets/sample_input.mp3", "English", "French"], ["assets/sample_input.mp3", "English", "Mandarin Chinese"], ["assets/sample_input_2.mp3", "English", "Hindi"], ["assets/sample_input_2.mp3", "English", "Spanish"], ], inputs=[input_audio, source_language, target_language], outputs=[output_audio, output_text], fn=run_s2st, cache_examples=True, allow_flagging="never", ) btn.click( fn=run_s2st, inputs=[input_audio, source_language, target_language], outputs=[output_audio, output_text], ) demo_s2st.launch()