whisat-base / infer_hf_whisper.yaml

rosyvs

first commit - version 202_base-en_v1 model run. trained without augmentation

8e560f0 over 1 year ago

2.59 kB

	# ################################
	# Model: Whisper (Encoder-Decoder) + NLL
	# Augmentation: TimeDomainSpecAugment
	# Authors: Adel Moumen 2022, Titouan Parcollet 2022, Rosy Southwell 2023
	# ################################

	# Model identifier; referenced as the `source` of the `whisper` entry in this file.
	model_src: rosyvs/whisat-base
	# NOTE(review): only referenced by commented-out entries in the visible part of
	# this file; "tiny.en" looks inconsistent with the "base" model named in
	# model_src -- confirm before re-enabling anything that uses it.
	model_type: tiny.en
	language: english
	auto_mix_prec: False # TODO: set to True for CUDA
	# Referenced as the `encoder_only` argument of the `whisper` entry in this file.
	only_encoder: False

	# These values are only used by the searchers.
	# They need to be hardcoded and should not be changed with Whisper.
	# They are used as part of the searching process.
	# The bos token of the searcher will be timestamp_index
	# and will be concatenated with the bos, language and task tokens.
	# NOTE(review): these IDs look like the multilingual Whisper vocabulary;
	# English-only (.en) checkpoints appear to use IDs one lower. Confirm against
	# the tokenizer shipped with <model_src>.
	timestamp_index: 50363
	eos_index: 50257
	bos_index: 50258

	# Decoding parameters, referenced by the `decoder` entry in this file.
	# The decode ratios are presumably bounds on output length relative to input
	# length -- confirm against the searcher documentation.
	min_decode_ratio: 0.0
	max_decode_ratio: 1.0 # the commonvoice inference yaml uses 0.1
	test_beam_size: 5 # was 8; changed to 5, the default used by OpenAI

	# Whisper model object built from <model_src>, with `freeze` and
	# `freeze_encoder` both set to True.
	# The arguments must be nested under the `!new:` tag; at top level they are not
	# passed to the constructor.
	# NOTE(review): `cache_dir` is not defined in the visible part of this file --
	# presumably supplied as an override when the YAML is loaded; confirm.
	whisper: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
	    encoder_only: !ref <only_encoder>
	    freeze: True
	    freeze_encoder: True
	    source: !ref <model_src>
	    save_path: !ref <cache_dir>
	    # language: language

	# tokenizer: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
	# encoder_only: False
	# freeze: True
	# freeze_encoder: True
	# source: !ref openai/whisper-<model_type>
	# save_path: !ref <cache_dir>

	# decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch
	# model: !ref <whisper>
	# bos_index: !ref <timestamp_index>
	# eos_index: !ref <eos_index>
	# min_decode_ratio: !ref <min_decode_ratio>
	# max_decode_ratio: !ref <max_decode_ratio>

	# Beam-search decoder wrapping the `whisper` model.
	# The arguments are nested so they are passed to the searcher instead of
	# colliding with the top-level `eos_index` / `min_decode_ratio` /
	# `max_decode_ratio` keys.
	decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearch
	    module: [!ref <whisper>]
	    # The searcher's bos is the timestamp index, not the top-level bos_index.
	    bos_index: !ref <timestamp_index>
	    eos_index: !ref <eos_index>
	    min_decode_ratio: !ref <min_decode_ratio>
	    max_decode_ratio: !ref <max_decode_ratio>
	    beam_size: !ref <test_beam_size>

	# Mapping of module names to the objects defined in this file.
	# Entries are nested under `modules`; left at top level they would redefine
	# the `whisper` and `decoder` keys and leave `modules` empty.
	modules:
	    whisper: !ref <whisper>
	    # tokenizer: !ref <tokenizer>
	    decoder: !ref <decoder> # can change to greedy

	# Pretrainer object whose `loadables` mapping lists the `whisper` model.
	# `loadables` is nested under the `!new:` tag, and its entries under
	# `loadables`, so they reach the constructor instead of sitting at top level.
	pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	    loadables:
	        whisper: !ref <whisper>
	        # tokenizer: !ref <tokenizer>
	        # decoder: !ref <decoder>
	    # paths:
	    #     whisper: !ref <model_src>/whisper.ckpt
	    #     tokenizer: !ref openai/whisper-<model_type>
	    #     decoder: !ref <model_src>/whisper.ckpt

	# checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	# checkpoints_dir: !ref <model_src>
	# recoverables:
	# whisper: !ref <whisper>