whisper-webui-translate

Running

File size: 11,789 Bytes

{
  "models": {
    "whisper": [
      // Configuration for the built-in models. You can remove any of these 
      // if you don't want to use the default models.
      {
        "name": "tiny",
        "url": "tiny" 
      },
      {
        "name": "base",
        "url": "base"
      },
      {
        "name": "small",
        "url": "small"
      },
      {
        "name": "medium",
        "url": "medium"
      },
      {
        "name": "large",
        "url": "large"
      },
      {
        "name": "large-v2",
        "url": "large-v2"
      },
      {
        "name": "large-v3",
        "url": "large-v3"
      }
      // Uncomment to add custom Japanese models
      //{
      //  "name": "whisper-large-v2-mix-jp",
      //  "url": "vumichien/whisper-large-v2-mix-jp",
      //  // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default.
      //  // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models.
      //  "type": "huggingface",
      //},
      //{
      //  "name": "local-model",
      //  "url": "path/to/local/model",
      //},
      //{
      //  "name": "remote-model",
      //  "url": "https://example.com/path/to/model",
      //}
    ],
    "m2m100": [
      {
        "name": "m2m100_1.2B-ct2fast/michaelfeil",
        "url": "michaelfeil/ct2fast-m2m100_1.2B",
        "type": "huggingface",
        "tokenizer_url": "facebook/m2m100_1.2B"
      },
      {
        "name": "m2m100_418M-ct2fast/michaelfeil",
        "url": "michaelfeil/ct2fast-m2m100_418M",
        "type": "huggingface",
        "tokenizer_url": "facebook/m2m100_418M"
      },
      //{
      //  "name": "m2m100-12B-ct2fast/michaelfeil",
      //  "url": "michaelfeil/ct2fast-m2m100-12B-last-ckpt",
      //  "type": "huggingface",
      //  "tokenizer_url": "facebook/m2m100-12B-last-ckpt"
      //},
      {
        "name": "m2m100_1.2B/facebook",
        "url": "facebook/m2m100_1.2B",
        "type": "huggingface"
      },
      {
        "name": "m2m100_418M/facebook",
        "url": "facebook/m2m100_418M",
        "type": "huggingface"
      }
    ],
    "nllb": [
      {
        "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
        "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
      },
      {
        "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
        "url": "michaelfeil/ct2fast-nllb-200-3.3B",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-3.3B"
      },
      {
        "name": "nllb-200-1.3B-ct2:float16/JustFrederik",
        "url": "JustFrederik/nllb-200-1.3B-ct2-float16",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-1.3B"
      },
      {
        "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
        "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
      },
      {
        "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
        "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-1.3B"
      },
      {
        "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
        "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
      },
      {
        "name": "nllb-200-distilled-600M/facebook",
        "url": "facebook/nllb-200-distilled-600M",
        "type": "huggingface"
      },
      {
        "name": "nllb-200-distilled-600M-ct2/JustFrederik",
        "url": "JustFrederik/nllb-200-distilled-600M-ct2",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      },
      {
        "name": "nllb-200-distilled-600M-ct2:float16/JustFrederik",
        "url": "JustFrederik/nllb-200-distilled-600M-ct2-float16",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      },
      {
        "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
        "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
        "type": "huggingface",
        "tokenizer_url": "facebook/nllb-200-distilled-600M"
      }
      // Uncomment to add the official Facebook 1.3B and 3.3B models.
      // The official Facebook 1.3B and 3.3B model files are very large, so
      //   to avoid occupying too much disk space on Hugging Face's free Spaces,
      //   these models are left commented out by default.
      //{
      //  "name": "nllb-200-distilled-1.3B/facebook",
      //  "url": "facebook/nllb-200-distilled-1.3B",
      //  "type": "huggingface"
      //},
      //{
      //  "name": "nllb-200-1.3B/facebook",
      //  "url": "facebook/nllb-200-1.3B",
      //  "type": "huggingface"
      //},
      //{
      //  "name": "nllb-200-3.3B/facebook",
      //  "url": "facebook/nllb-200-3.3B",
      //  "type": "huggingface"
      //},
      //{
      //  "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
      //  "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
      //  "type": "huggingface",
      //  "tokenizer_url": "facebook/nllb-200-distilled-1.3B"
      //},
      //{
      //  "name": "nllb-200-1.3B-ct2/JustFrederik",
      //  "url": "JustFrederik/nllb-200-1.3B-ct2",
      //  "type": "huggingface",
      //  "tokenizer_url": "facebook/nllb-200-1.3B"
      //},
      //{
      //  "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
      //  "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
      //  "type": "huggingface",
      //  "tokenizer_url": "facebook/nllb-200-3.3B"
      //},
    ],
    "mt5": [
      {
        "name": "mt5-zh-ja-en-trimmed/K024",
        "url": "K024/mt5-zh-ja-en-trimmed",
        "type": "huggingface"
      },
      {
        "name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth",
        "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
        "type": "huggingface"
      }
    ]
  },
  // Configuration options that will be used if they are not specified in the command line arguments.

  // * WEBUI options *

  // Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI.
  "input_audio_max_duration": 1800,
  // True to share the app on HuggingFace.
  "share": false,
  // The host or IP to bind to. If None, bind to localhost.
  "server_name": null,
  // The port to bind to.
  "server_port": 7860,
  // The number of workers to use for the web server. Use -1 to disable queueing.
  "queue_concurrency_count": 1,
  // Whether or not to automatically delete all uploaded files, to save disk space
  "delete_uploaded_files": true,

  // * General options *

  // The default implementation to use for Whisper. Can be "whisper" or "faster-whisper".
  // Note that you must install the requirements for either faster-whisper (requirements-fasterWhisper.txt)
  // or whisper (requirements.txt).
  "whisper_implementation": "faster-whisper",

  // The default model name.
  "default_model_name": "large-v2",
  // The default VAD.
  "default_vad": "silero-vad",
  // A comma-delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.
  "vad_parallel_devices": "",
  // The number of CPU cores to use for VAD pre-processing.
  "vad_cpu_cores": 1,
  // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
  "vad_process_timeout": 1800,
  // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
  "auto_parallel": false,
  // Directory to save the outputs (CLI will use the current directory if not specified)
  "output_dir": null,
  // The path to save model files; uses ~/.cache/whisper by default
  "model_dir": null,
  // Device to use for PyTorch inference, or Null to use the default device
  "device": null,
  // Whether to print out the progress and debug messages
  "verbose": true,
  // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
  "task": "transcribe",
  // Language spoken in the audio, specify None to perform language detection
  "language": null,
  // The window size (in seconds) to merge voice segments
  "vad_merge_window": 5,
  // The maximum size (in seconds) of a voice segment
  "vad_max_merge_size": 90,
  // The padding (in seconds) to add to each voice segment
  "vad_padding": 1,
  // Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment)
  "vad_initial_prompt_mode": "prepend_first_segment",
  // The window size of the prompt to pass to Whisper
  "vad_prompt_window": 3,
  // Temperature to use for sampling
  "temperature": 0,
  // Number of candidates when sampling with non-zero temperature
  "best_of": 5,
  // Number of beams in beam search, only applicable when temperature is zero
  "beam_size": 5,
  // Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
  "patience": 1,
  // Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default
  "length_penalty": null,
  // Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
  "suppress_tokens": "-1",
  // Optional text to provide as a prompt for the first window
  "initial_prompt": null,
  // If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
  "condition_on_previous_text": true,
  // Whether to perform inference in fp16; True by default
  "fp16": true,
  // The compute type used by faster-whisper. Can be "auto", "int8", "int16" or "float16".
  "compute_type": "auto",
  // Temperature to increase when falling back when the decoding fails to meet either of the thresholds below
  "temperature_increment_on_fallback": 0.2,
  // If the gzip compression ratio is higher than this value, treat the decoding as failed
  "compression_ratio_threshold": 2.4,
  // If the average log probability is lower than this value, treat the decoding as failed
  "logprob_threshold": -1.0,
  // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
  "no_speech_threshold": 0.6,

  // (experimental) extract word-level timestamps and refine the results based on them
  "word_timestamps": false, 
  // if word_timestamps is True, merge these punctuation symbols with the next word
  "prepend_punctuations": "\"\'“¿([{-",
  // if word_timestamps is True, merge these punctuation symbols with the previous word
  "append_punctuations": "\"\'.。,，!！?？:：”)]}、",
  // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt
  "highlight_words": false,

  // Diarization settings
  "auth_token": null,
  // Whether to perform speaker diarization
  "diarization": false,
  // The number of speakers to detect
  "diarization_speakers": 2,
  // The minimum number of speakers to detect
  "diarization_min_speakers": 1,
  // The maximum number of speakers to detect
  "diarization_max_speakers": 8,
  // The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.
  "diarization_process_timeout": 60,
}