import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import SeamlessM4Tv2Model, AutoProcessor
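# lang_list is expected to be a helper module shipped alongside this app,
# mapping display names such as "French" to SeamlessM4T language codes such as "fra".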
from lang_list import (
    ASR_TARGET_LANGUAGE_NAMES,
    LANGUAGE_NAME_TO_CODE,
    S2ST_TARGET_LANGUAGE_NAMES,
    S2TT_TARGET_LANGUAGE_NAMES,
    T2ST_TARGET_LANGUAGE_NAMES,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)
# SeamlessM4T consumes and produces 16 kHz audio.
AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

# Use half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained(
    "facebook/seamless-m4t-v2-large", torch_dtype=dtype
).to(device)
def preprocess_audio(input_audio: str) -> None:
    """Resample the input file to 16 kHz in place, truncating it to MAX_INPUT_AUDIO_LENGTH seconds."""
    arr, org_sr = torchaudio.load(input_audio)
    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if new_arr.shape[1] > max_length:
        new_arr = new_arr[:, :max_length]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
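
# Illustrative usage, assuming a hypothetical local file "clip.wav": calling
# preprocess_audio("clip.wav") rewrites the file in place as 16 kHz audio
# truncated to the first MAX_INPUT_AUDIO_LENGTH seconds.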
def run_s2st(
    input_audio: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    preprocess_audio(input_audio)
    # Resolving the source language is not required for speech input; it is
    # kept for parity with the text tasks.
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    arr, org_sr = torchaudio.load(input_audio)
    audio_inputs = processor(
        audios=arr, return_tensors="pt", sampling_rate=model.config.sampling_rate
    ).to(device=device, dtype=dtype)
    output = model.generate(
        **audio_inputs,
        return_intermediate_token_ids=True,
        tgt_lang=target_language_code,
    )
    # output[0] is the synthesized waveform; output[2] holds the intermediate
    # text tokens decoded before speech synthesis.
    audio_array_from_audio = output[0].float().cpu().numpy().squeeze()
    text_tokens = output[2]
    translated_text_from_text = processor.decode(text_tokens.tolist()[0], skip_special_tokens=True)
    return (int(AUDIO_SAMPLE_RATE), audio_array_from_audio), translated_text_from_text
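
# For reference, a minimal sketch of the text-only task (S2TT) with the same
# model and inputs; generate_speech=False skips speech synthesis so that
# generate returns only text tokens:
#
#   text_output = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
#   text = processor.decode(text_output[0].tolist()[0], skip_special_tokens=True)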
description = """
# Direct Speech to Speech Translation
This demo uses SeamlessM4T V2 to translate one speech directly into another.
The model being used here is [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large).
SeamlessM4T V2 is unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST) translation and more, without relying on multiple separate models.
"""
with gr.Blocks() as demo_s2st:
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            with gr.Group():
                input_audio = gr.Audio(label="Input speech", type="filepath")
                source_language = gr.Dropdown(
                    label="Source language",
                    choices=ASR_TARGET_LANGUAGE_NAMES,
                    value="English",
                )
                target_language = gr.Dropdown(
                    label="Target language",
                    choices=S2ST_TARGET_LANGUAGE_NAMES,
                    value=DEFAULT_TARGET_LANGUAGE,
                )
            btn = gr.Button("Translate")
        with gr.Column():
            with gr.Group():
                output_audio = gr.Audio(
                    label="Translated speech",
                    autoplay=False,
                    streaming=False,
                    type="numpy",
                )
                output_text = gr.Textbox(label="Translated text")
    gr.Examples(
        examples=[
            ["assets/sample_input.mp3", "English", "French"],
            ["assets/sample_input.mp3", "English", "Mandarin Chinese"],
            ["assets/sample_input_2.mp3", "English", "Hindi"],
            ["assets/sample_input_2.mp3", "English", "Spanish"],
        ],
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
        fn=run_s2st,
        cache_examples=True,
    )
    btn.click(
        fn=run_s2st,
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
    )

demo_s2st.launch()
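
# Note: on a busy Space, a common Gradio pattern (not in the original file) is
# demo_s2st.queue().launch(), which serializes requests to the single GPU model.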