# ################################ | |
# Model: Whisper (Encoder-Decoder) + NLL | |
# Augmentation: TimeDomainSpecAugment | |
# Authors: Adel Moumen 2022, Titouan Parcollet 2022, Rosy Southwell 2023 | |
# ################################ | |
# Model selection and runtime switches.
# model_src is passed as `source` to the `whisper` object defined in this file,
# and only_encoder is passed as its `encoder_only` argument.
# model_type appears only in commented-out lines; language and auto_mix_prec
# are not referenced by any active line visible here (they may be read by the
# script that loads this file -- verify).
model_src: rosyvs/whisat-base | |
model_type: tiny.en | |
language: english | |
auto_mix_prec: False # TODO: set to True for CUDA | |
only_encoder: False | |
# These values are only used for the searchers. | |
# They need to be hardcoded and should not be changed with Whisper. | |
# They are used as part of the searching process. | |
# The bos token of the searcher will be timestamp_index | |
# and will be concatenated with the bos, language and task tokens. | |
# timestamp_index is what the `decoder` block receives as its bos_index, and
# eos_index is what it receives as its eos_index. bos_index itself is not
# referenced by any active line in the visible part of this file.
# NOTE(review): model_type above is `tiny.en`, while these ids look like the
# multilingual Whisper vocabulary -- confirm against the tokenizer shipped
# with <model_src>.
timestamp_index: 50363 | |
eos_index: 50257 | |
bos_index: 50258 | |
# Decoding parameters | |
# All three values are forwarded to the `decoder` block: the two ratios under
# the same argument names, and test_beam_size as its `beam_size`.
min_decode_ratio: 0.0 | |
max_decode_ratio: 1.0 # the commonvoice inference yaml uses 0.1 | |
test_beam_size: 5 # TODO: this was 8, changing to 5 as this is the default used by openAI | |
# Instantiates the HuggingFaceWhisper wrapper. The checkpoint comes from
# <model_src>, and <only_encoder> selects encoder-only mode.
# Both `freeze` and `freeze_encoder` are set to True.
# NOTE(review): <cache_dir> is not defined in the visible part of this file;
# presumably it is supplied through overrides or a section not shown here --
# confirm, otherwise the reference cannot be resolved.
whisper: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper | |
encoder_only: !ref <only_encoder> | |
freeze: True | |
freeze_encoder: True | |
source: !ref <model_src> | |
save_path: !ref <cache_dir> | |
# language: language | |
# tokenizer: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper | |
# encoder_only: False | |
# freeze: True | |
# freeze_encoder: True | |
# source: !ref openai/whisper-<model_type> | |
# save_path: !ref <cache_dir> | |
# decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch | |
# model: !ref <whisper> | |
# bos_index: !ref <timestamp_index> | |
# eos_index: !ref <eos_index> | |
# min_decode_ratio: !ref <min_decode_ratio> | |
# max_decode_ratio: !ref <max_decode_ratio> | |
# Beam-search decoder built around the `whisper` object, which is passed as a
# one-element list in `module`.
# Its bos_index is deliberately set to <timestamp_index>, not <bos_index>; the
# comment block above the token indices explains why.
# Ratios and beam size come from the "Decoding parameters" values.
decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearch | |
module: [!ref <whisper>] | |
bos_index: !ref <timestamp_index> | |
eos_index: !ref <eos_index> | |
min_decode_ratio: !ref <min_decode_ratio> | |
max_decode_ratio: !ref <max_decode_ratio> | |
beam_size: !ref <test_beam_size> | |
# Named objects exposed under `modules`: the Whisper wrapper and the decoder
# defined earlier in this file. The tokenizer entry is disabled.
modules: | |
whisper: !ref <whisper> | |
# tokenizer: !ref <tokenizer> | |
decoder: !ref <decoder> # can change to greedy | |
# Pretrainer with a single loadable, the `whisper` object.
# No `paths` mapping is configured here (that section is commented out), so
# presumably the loading script supplies the locations -- confirm.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer | |
loadables: | |
whisper: !ref <whisper> | |
# tokenizer: !ref <tokenizer> | |
# decoder: !ref <decoder> | |
# paths: | |
# whisper: !ref <model_src>/whisper.ckpt | |
# tokenizer: !ref openai/whisper-<model_type> | |
# decoder: !ref <model_src>/whisper.ckpt | |
# checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer | |
# checkpoints_dir: !ref <model_src> | |
# recoverables: | |
# whisper: !ref <whisper> | |