Spaces:
Runtime error
Runtime error
#!/usr/bin/env python | |
import os | |
import pathlib | |
import tempfile | |
import gradio as gr | |
import torch | |
import torchaudio | |
from fairseq2.assets import InProcAssetMetadataProvider, asset_store | |
from fairseq2.data import Collater, SequenceData, VocabularyInfo | |
from fairseq2.data.audio import ( | |
AudioDecoder, | |
WaveformToFbankConverter, | |
WaveformToFbankOutput, | |
) | |
from seamless_communication.inference import SequenceGeneratorOptions | |
from fairseq2.generation import NGramRepeatBlockProcessor | |
from fairseq2.memory import MemoryBlock | |
from fairseq2.typing import DataType, Device | |
from huggingface_hub import snapshot_download | |
from seamless_communication.inference import BatchedSpeechOutput, Translator, SequenceGeneratorOptions | |
from seamless_communication.models.generator.loader import load_pretssel_vocoder_model | |
from seamless_communication.models.unity import ( | |
UnitTokenizer, | |
load_gcmvn_stats, | |
load_unity_text_tokenizer, | |
load_unity_unit_tokenizer, | |
) | |
from torch.nn import Module | |
from seamless_communication.cli.expressivity.evaluate.pretssel_inference_helper import PretsselGenerator | |
from utils import LANGUAGE_CODE_TO_NAME | |
# Markdown blurb rendered at the top of the Gradio page.
DESCRIPTION = """\
# Seamless Expressive
[SeamlessExpressive](https://github.com/facebookresearch/seamless_communication/blob/main/docs/expressive/README.md) is a speech-to-speech translation model that captures certain underexplored aspects of prosody such as speech rate and pauses, while preserving the style of one's voice and high content translation quality.
"""
# Cache Gradio example outputs only when explicitly enabled AND a GPU is present.
CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()
# Local directory holding the model checkpoints; populated from the Hugging Face
# Hub on first run (both the expressive model and M4T v2, used below for ASR).
CHECKPOINTS_PATH = pathlib.Path(os.getenv("CHECKPOINTS_PATH", "/home/user/app/models"))
if not CHECKPOINTS_PATH.exists():
    snapshot_download(repo_id="facebook/seamless-expressive", repo_type="model", local_dir=CHECKPOINTS_PATH)
    snapshot_download(repo_id="facebook/seamless-m4t-v2-large", repo_type="model", local_dir=CHECKPOINTS_PATH)
# Ensure that we do not have any other environment resolvers and always return
# "demo" for demo purposes.
asset_store.env_resolvers.clear()
asset_store.env_resolvers.append(lambda: "demo")
# Construct an `InProcAssetMetadataProvider` with environment-specific metadata
# that just overrides the regular metadata for "demo" environment. Note the "@demo" suffix.
demo_metadata = [
    {
        "name": "seamless_expressivity@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/m2m_expressive_unity.pt",
        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
    },
    {
        "name": "vocoder_pretssel@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/pretssel_melhifigan_wm-final.pt",
    },
    {
        "name": "seamlessM4T_v2_large@demo",
        "checkpoint": f"file://{CHECKPOINTS_PATH}/seamlessM4T_v2_large.pt",
        "char_tokenizer": f"file://{CHECKPOINTS_PATH}/spm_char_lang38_tc.model",
    },
]
# Register the demo metadata so asset lookups resolve to the local checkpoint files.
asset_store.metadata_providers.append(InProcAssetMetadataProvider(demo_metadata))
# Reverse mapping: human-readable language name -> language code.
LANGUAGE_NAME_TO_CODE = {v: k for k, v in LANGUAGE_CODE_TO_NAME.items()}
# Half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32
MODEL_NAME = "seamless_expressivity"
VOCODER_NAME = "vocoder_pretssel"
# used for ASR for toxicity
m4t_translator = Translator(
    model_name_or_card="seamlessM4T_v2_large",
    vocoder_name_or_card=None,
    device=device,
    dtype=dtype,
)
unit_tokenizer = load_unity_unit_tokenizer(MODEL_NAME)
# Global CMVN statistics for the vocoder; moved onto the compute device once here
# so normalize_fbank() can reuse them per request.
_gcmvn_mean, _gcmvn_std = load_gcmvn_stats(VOCODER_NAME)
gcmvn_mean = torch.tensor(_gcmvn_mean, device=device, dtype=dtype)
gcmvn_std = torch.tensor(_gcmvn_std, device=device, dtype=dtype)
# Main expressive translator; vocoding is done separately by pretssel_generator.
translator = Translator(
    MODEL_NAME,
    vocoder_name_or_card=None,
    device=device,
    dtype=dtype,
    apply_mintox=False,
)
# Beam-search options for the expressive model: unknowns fully penalized,
# repeated 10-grams blocked.
text_generation_opts = SequenceGeneratorOptions(
    beam_size=5,
    unk_penalty=torch.inf,
    soft_max_seq_len=(0, 200),
    step_processor=NGramRepeatBlockProcessor(
        ngram_size=10,
    ),
)
# Same options for the M4T transcription pass (different soft length floor).
m4t_text_generation_opts = SequenceGeneratorOptions(
    beam_size=5,
    unk_penalty=torch.inf,
    soft_max_seq_len=(1, 200),
    step_processor=NGramRepeatBlockProcessor(
        ngram_size=10,
    ),
)
# Turns predicted units (plus a prosody encoder input) into a waveform.
pretssel_generator = PretsselGenerator(
    VOCODER_NAME,
    vocab_info=unit_tokenizer.vocab_info,
    device=device,
    dtype=dtype,
)
decode_audio = AudioDecoder(dtype=torch.float32, device=device)
convert_to_fbank = WaveformToFbankConverter(
    num_mel_bins=80,
    waveform_scale=2**15,
    channel_last=True,
    standardize=False,  # normalization is applied explicitly in normalize_fbank()
    device=device,
    dtype=dtype,
)
def normalize_fbank(data: WaveformToFbankOutput) -> WaveformToFbankOutput:
    """Attach two normalized fbank variants to *data* and return it.

    "fbank" is standardized with the utterance's own per-dimension mean/std,
    while "gcmvn_fbank" is normalized with the global CMVN statistics loaded
    for the vocoder. The input dict is mutated in place.
    """
    raw = data["fbank"]
    utt_std, utt_mean = torch.std_mean(raw, dim=0)
    data["fbank"] = (raw - utt_mean) / utt_std
    data["gcmvn_fbank"] = (raw - gcmvn_mean) / gcmvn_std
    return data
# Zero-pads batched sequences; pad_to_multiple=1 means no length rounding.
collate = Collater(pad_value=0, pad_to_multiple=1)
# All input audio is resampled to 16 kHz and truncated to at most 10 seconds.
AUDIO_SAMPLE_RATE = 16000
MAX_INPUT_AUDIO_LENGTH = 10  # in seconds
def remove_prosody_tokens_from_text(text: str) -> str:
    """Return *text* with prosody markers removed and whitespace collapsed.

    The expressive model emits only two prosody tokens: emphasis '*' and
    pause '='.
    """
    without_markers = text.translate(str.maketrans("", "", "*="))
    return " ".join(without_markers.split())
def preprocess_audio(input_audio_path: str) -> None:
    """Resample the file at *input_audio_path* to 16 kHz, rewriting it in place.

    Audio longer than MAX_INPUT_AUDIO_LENGTH seconds is truncated, with a
    Gradio warning shown to the user.
    """
    waveform, source_rate = torchaudio.load(input_audio_path)
    waveform = torchaudio.functional.resample(waveform, orig_freq=source_rate, new_freq=AUDIO_SAMPLE_RATE)
    sample_limit = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if waveform.shape[1] > sample_limit:
        waveform = waveform[:, :sample_limit]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_audio_path, waveform, sample_rate=AUDIO_SAMPLE_RATE)
def run(
    input_audio_path: str,
    source_language: str,
    target_language: str,
) -> tuple[str, str]:
    """Translate the speech in *input_audio_path* into *target_language*.

    Returns a (wav_path, translated_text) pair, where wav_path points to a
    freshly written temporary file. As a side effect, the input file is
    resampled/truncated in place by preprocess_audio().
    """
    src_lang_code = LANGUAGE_NAME_TO_CODE[source_language]
    tgt_lang_code = LANGUAGE_NAME_TO_CODE[target_language]
    preprocess_audio(input_audio_path)

    # Decode the audio bytes and build both normalized fbank variants.
    raw_bytes = pathlib.Path(input_audio_path).read_bytes()
    sample = decode_audio(MemoryBlock(raw_bytes))
    sample = collate(normalize_fbank(convert_to_fbank(sample)))

    # get transcription for mintox
    transcriptions, _ = m4t_translator.predict(
        input=sample["fbank"],
        task_str="S2TT",  # get source text
        tgt_lang=src_lang_code,
        text_generation_opts=m4t_text_generation_opts,
    )
    transcript = str(transcriptions[0])

    prosody_input = sample["gcmvn_fbank"]
    text_output, unit_output = translator.predict(
        sample["fbank"],
        "S2ST",
        tgt_lang=tgt_lang_code,
        src_lang=src_lang_code,
        text_generation_opts=text_generation_opts,
        unit_generation_ngram_filtering=False,
        duration_factor=1.0,
        prosody_encoder_input=prosody_input,
        src_text=transcript,  # for mintox check
    )

    # Vocode the predicted units, conditioned on the prosody encoder input.
    speech_output = pretssel_generator.predict(
        unit_output.units,
        tgt_lang=tgt_lang_code,
        prosody_encoder_input=prosody_input,
    )

    # delete=False so the file survives for Gradio to serve after the handler returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        torchaudio.save(
            f.name,
            speech_output.audio_wavs[0][0].to(torch.float32).cpu(),
            sample_rate=speech_output.sample_rate,
        )

    return f.name, remove_prosody_tokens_from_text(str(text_output[0]))
# Languages supported by the demo for both source and target selection.
TARGET_LANGUAGE_NAMES = [
    "English",
    "French",
    "German",
    "Spanish",
]
# For each source language, the valid target languages: every other language,
# in the same order as TARGET_LANGUAGE_NAMES.
UPDATED_LANGUAGE_LIST = {
    source: [name for name in TARGET_LANGUAGE_NAMES if name != source]
    for source in TARGET_LANGUAGE_NAMES
}
def rs_change(rs):
    """Refresh the target-language dropdown after a source-language change.

    Restricts the choices to every language other than *rs* and selects the
    first remaining one.
    """
    choices = UPDATED_LANGUAGE_LIST[rs]
    return gr.update(choices=choices, value=choices[0])
# --- Gradio UI -----------------------------------------------------------
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    # Only shown when SHOW_DUPLICATE_BUTTON=1 (useful on shared HF Spaces).
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    with gr.Row():
        with gr.Column():
            with gr.Group():
                input_audio = gr.Audio(label="Input speech", type="filepath")
                source_language = gr.Dropdown(
                    label="Source language",
                    choices=TARGET_LANGUAGE_NAMES,
                    value="English",
                )
                target_language = gr.Dropdown(
                    label="Target language",
                    choices=TARGET_LANGUAGE_NAMES,
                    value="French",
                    interactive=True,
                )
                # Keep the target choices consistent with the chosen source.
                source_language.change(
                    fn=rs_change,
                    inputs=[source_language],
                    outputs=[target_language],
                )
            btn = gr.Button()
        with gr.Column():
            with gr.Group():
                output_audio = gr.Audio(label="Translated speech")
                output_text = gr.Textbox(label="Translated text")
    # Pre-baked examples; outputs are cached only when CACHE_EXAMPLES is set.
    gr.Examples(
        examples=[
            ["assets/Excited-Es.wav", "English", "Spanish"],
            ["assets/whisper.wav", "English", "French"],
            ["assets/FastTalking-En.wav", "French", "English"],
            ["assets/Sad-Es.wav", "English", "Spanish"],
        ],
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
        fn=run,
        cache_examples=CACHE_EXAMPLES,
        api_name=False,
    )
    btn.click(
        fn=run,
        inputs=[input_audio, source_language, target_language],
        outputs=[output_audio, output_text],
        api_name="run",
    )
if __name__ == "__main__":
    # Queue requests so concurrent users don't overload the single model instance.
    demo.queue(max_size=50).launch()