# ################################
# Model: Whisper (Encoder-Decoder) + NLL
# Augmentation: TimeDomainSpecAugment
# Authors: Adel Moumen 2022, Titouan Parcollet 2022, Rosy Southwell 2023
# ################################

model_src: rosyvs/whisat-base
model_type: tiny.en
language: english
auto_mix_prec: False # TODO: set to True for CUDA
only_encoder: False

# These values are only used by the searchers.
# They need to be hard-coded and should not be changed with Whisper.
# They are used as part of the searching process.
# The bos token of the searcher will be timestamp_index
# and will be concatenated with the bos, language and task tokens.
timestamp_index: 50363
eos_index: 50257
bos_index: 50258

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0 # the CommonVoice inference yaml uses 0.1
test_beam_size: 5 # TODO: this was 8, changed to 5 as this is the default used by OpenAI

whisper: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
    encoder_only: !ref <only_encoder>
    freeze: True
    freeze_encoder: True
    source: !ref <model_src>
    save_path: !ref
    # language: language

# tokenizer: !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
#     encoder_only: False
#     freeze: True
#     freeze_encoder: True
#     source: !ref openai/whisper-<model_type>
#     save_path: !ref

# decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch
#     model: !ref <whisper>
#     bos_index: !ref <timestamp_index>
#     eos_index: !ref <eos_index>
#     min_decode_ratio: !ref <min_decode_ratio>
#     max_decode_ratio: !ref <max_decode_ratio>

decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperBeamSearch
    module: [!ref <whisper>]
    bos_index: !ref <timestamp_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>

modules:
    whisper: !ref <whisper>
    # tokenizer: !ref <tokenizer>
    decoder: !ref <decoder> # can change to greedy

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        whisper: !ref <whisper>
        # tokenizer: !ref <tokenizer>
        # decoder: !ref <decoder>
    # paths:
    #     whisper: !ref /whisper.ckpt
    #     tokenizer: !ref openai/whisper-<model_type>
    #     decoder: !ref /whisper.ckpt

# checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
#     checkpoints_dir: !ref
#     recoverables:
#         whisper: !ref <whisper>
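
# ################################
# Example usage (a minimal sketch, not part of the recipe itself): load this
# file with hyperpyyaml and pull out the instantiated modules. The filename
# "hparams.yaml" is illustrative, and the bare `!ref` placeholders above
# (e.g. save_path, checkpoints_dir) plus the pretrainer `paths` need real
# values before the file will load and the weights can be restored.
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("hparams.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#
#     whisper = hparams["modules"]["whisper"]  # HuggingFaceWhisper wrapper
#     decoder = hparams["decoder"]             # S2SWhisperBeamSearch
#
#     # Restore pretrained weights once the pretrainer paths are set:
#     hparams["pretrainer"].collect_files()
#     hparams["pretrainer"].load_collected()
# ################################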