import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import SeamlessM4Tv2Model, AutoProcessor
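# lang_list is expected to be a helper module shipped alongside this app,
# mapping display names such as "French" to SeamlessM4T language codes such as "fra".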
from lang_list import (
    ASR_TARGET_LANGUAGE_NAMES,
    LANGUAGE_NAME_TO_CODE,
    S2ST_TARGET_LANGUAGE_NAMES,
    S2TT_TARGET_LANGUAGE_NAMES,
    T2ST_TARGET_LANGUAGE_NAMES,
    T2TT_TARGET_LANGUAGE_NAMES,
    TEXT_SOURCE_LANGUAGE_NAMES,
)
# SeamlessM4T consumes and produces 16 kHz audio.
AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

# Use half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained(
    "facebook/seamless-m4t-v2-large", torch_dtype=dtype
).to(device)
def preprocess_audio(input_audio: str) -> None:
    """Resample the input file to 16 kHz in place, truncating it to MAX_INPUT_AUDIO_LENGTH seconds."""
    arr, org_sr = torchaudio.load(input_audio)
    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if new_arr.shape[1] > max_length:
        new_arr = new_arr[:, :max_length]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
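
# Illustrative usage, assuming a hypothetical local file "clip.wav": calling
# preprocess_audio("clip.wav") rewrites the file in place as 16 kHz audio
# truncated to the first MAX_INPUT_AUDIO_LENGTH seconds.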
def run_s2st(
    input_audio: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    preprocess_audio(input_audio)
    # Resolving the source language is not required for speech input; it is
    # kept for parity with the text tasks.
    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
    arr, org_sr = torchaudio.load(input_audio)
    audio_inputs = processor(
        audios=arr, return_tensors="pt", sampling_rate=model.config.sampling_rate
    ).to(device=device, dtype=dtype)
    output = model.generate(
        **audio_inputs,
        return_intermediate_token_ids=True,
        tgt_lang=target_language_code,
    )
    # output[0] is the synthesized waveform; output[2] holds the intermediate
    # text tokens decoded before speech synthesis.
    audio_array_from_audio = output[0].float().cpu().numpy().squeeze()
    text_tokens = output[2]
    translated_text_from_text = processor.decode(text_tokens.tolist()[0], skip_special_tokens=True)
    return (int(AUDIO_SAMPLE_RATE), audio_array_from_audio), translated_text_from_text
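
# For reference, a minimal sketch of the text-only task (S2TT) with the same
# model and inputs; generate_speech=False skips speech synthesis so that
# generate returns only text tokens:
#
#   text_output = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
#   text = processor.decode(text_output[0].tolist()[0], skip_special_tokens=True)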
description = """
# Direct Speech to Speech Translation
This demo uses SeamlessM4T V2 to translate one speech directly into another.
The model being used here is [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large).
SeamlessM4T V2 is unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST) translation and more, without relying on multiple separate models.
"""
with gr.Blocks() as demo_s2st:
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            with gr.Group():
                input_audio = gr.Audio(label="Input speech", type="filepath")
                source_language = gr.Dropdown(
                    label="Source language",
                    choices=ASR_TARGET_LANGUAGE_NAMES,
                    value="English",
                )
                target_language = gr.Dropdown(
                    label="Target language",
                    choices=S2ST_TARGET_LANGUAGE_NAMES,
                    value=DEFAULT_TARGET_LANGUAGE,
                )
            btn = gr.Button("Translate")
        with gr.Column():
            with gr.Group():
                output_audio = gr.Audio(
                    label="Translated speech",
                    autoplay=False,
                    streaming=False,
                    type="numpy",
                )
                output_text = gr.Textbox(label="Translated text")
    gr.Examples(
        examples=[
            ["assets/sample_input.mp3", "English", "French"],
            ["assets/sample_input.mp3", "English", "Mandarin Chinese"],
            ["assets/sample_input_2.mp3", "English", "Hindi"],
            ["assets/sample_input_2.mp3", "English", "Spanish"],
        ],
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
        fn=run_s2st,
        cache_examples=True,
    )
    btn.click(
        fn=run_s2st,
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
    )

demo_s2st.launch()
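
# Note: on a busy Space, a common Gradio pattern (not in the original file) is
# demo_s2st.queue().launch(), which serializes requests to the single GPU model.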