video-dubbing

Paused

App Files Files Community

artificialguybr commited on Sep 27, 2023

Commit

9e548ce

•

1 Parent(s): 9d72f44

Upload 45 files

Browse files

Files changed (46) hide show

.gitattributes +1 -0
whisper/.flake8 +4 -0
whisper/.gitattributes +3 -0
whisper/.github/workflows/python-publish.yml +37 -0
whisper/.github/workflows/test.yml +56 -0
whisper/.gitignore +11 -0
whisper/.pre-commit-config.yaml +28 -0
whisper/CHANGELOG.md +69 -0
whisper/LICENSE +21 -0
whisper/MANIFEST.in +5 -0
whisper/README.md +147 -0
whisper/approach.png +0 -0
whisper/data/README.md +118 -0
whisper/data/meanwhile.json +322 -0
whisper/language-breakdown.svg +0 -0
whisper/model-card.md +69 -0
whisper/notebooks/LibriSpeech.ipynb +958 -0
whisper/notebooks/Multilingual_ASR.ipynb +0 -0
whisper/pyproject.toml +8 -0
whisper/requirements.txt +6 -0
whisper/setup.py +43 -0
whisper/tests/conftest.py +14 -0
whisper/tests/jfk.flac +3 -0
whisper/tests/test_audio.py +19 -0
whisper/tests/test_normalizer.py +96 -0
whisper/tests/test_timing.py +96 -0
whisper/tests/test_tokenizer.py +24 -0
whisper/tests/test_transcribe.py +42 -0
whisper/whisper/__init__.py +154 -0
whisper/whisper/__main__.py +3 -0
whisper/whisper/assets/gpt2.tiktoken +0 -0
whisper/whisper/assets/mel_filters.npz +3 -0
whisper/whisper/assets/multilingual.tiktoken +0 -0
whisper/whisper/audio.py +157 -0
whisper/whisper/decoding.py +821 -0
whisper/whisper/model.py +309 -0
whisper/whisper/normalizers/__init__.py +2 -0
whisper/whisper/normalizers/basic.py +76 -0
whisper/whisper/normalizers/english.json +1741 -0
whisper/whisper/normalizers/english.py +550 -0
whisper/whisper/timing.py +385 -0
whisper/whisper/tokenizer.py +386 -0
whisper/whisper/transcribe.py +461 -0
whisper/whisper/triton_ops.py +109 -0
whisper/whisper/utils.py +258 -0
whisper/whisper/version.py +1 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+whisper/tests/jfk.flac filter=lfs diff=lfs merge=lfs -text

whisper/.flake8 ADDED Viewed

	@@ -0,0 +1,4 @@

+[flake8]
+per-file-ignores =
+    */__init__.py: F401

whisper/.gitattributes ADDED Viewed

	@@ -0,0 +1,3 @@

+# Override jupyter in Github language stats for more accurate estimate of repo code languages
+# reference: https://github.com/github/linguist/blob/master/docs/overrides.md#generated-code
+*.ipynb linguist-generated

whisper/.github/workflows/python-publish.yml ADDED Viewed

	@@ -0,0 +1,37 @@

+name: Release
+on:
+  push:
+    branches:
+    - main
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions-ecosystem/action-regex-match@v2
+      id: regex-match
+      with:
+        text: ${{ github.event.head_commit.message }}
+        regex: '^Release ([^ ]+)'
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Release
+      if: ${{ steps.regex-match.outputs.match != '' }}
+      uses: softprops/action-gh-release@v1
+      with:
+        tag_name: v${{ steps.regex-match.outputs.group1 }}
+    - name: Build and publish
+      if: ${{ steps.regex-match.outputs.match != '' }}
+      env:
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+      run: |
+        python setup.py sdist
+        twine upload dist/*

whisper/.github/workflows/test.yml ADDED Viewed

	@@ -0,0 +1,56 @@

+name: test
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Fetch base branch
+        run: git fetch origin ${{ github.base_ref }}
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          architecture: x64
+      - name: Get pip cache dir
+        id: pip-cache
+        run: |
+          echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
+      - name: pip/pre-commit cache
+        uses: actions/cache@v3
+        with:
+          path: |
+            ${{ steps.pip-cache.outputs.dir }}
+            ~/.cache/pre-commit
+          key: ${{ runner.os }}-pip-pre-commit-${{ hashFiles('**/.pre-commit-config.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-pre-commit
+      - name: pre-commit
+        run: |
+          pip install -U pre-commit
+          pre-commit install --install-hooks
+          pre-commit run --all-files
+  whisper-test:
+    needs: pre-commit
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+        pytorch-version: [1.13.1, 2.0.0]
+        exclude:
+          - python-version: '3.11'
+            pytorch-version: 1.13.1
+    steps:
+      - uses: conda-incubator/setup-miniconda@v2
+      - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
+      - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu
+      - uses: actions/checkout@v3
+      - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
+      - run: pip install .["dev"]
+      - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

whisper/.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info
+.pytest_cache
+.ipynb_checkpoints
+thumbs.db
+.DS_Store
+.idea

whisper/.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: check-json
+      - id: end-of-file-fixer
+        types: [file, python]
+      - id: trailing-whitespace
+        types: [file, python]
+      - id: mixed-line-ending
+      - id: check-added-large-files
+        args: [--maxkb=4096]
+  - repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+        args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"]
+  - repo: https://github.com/pycqa/flake8.git
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+        types: [python]
+        args: ["--max-line-length", "88", "--ignore", "E203,E501,W503,W504"]

whisper/CHANGELOG.md ADDED Viewed

	@@ -0,0 +1,69 @@

+# CHANGELOG
+## [v20230918](https://github.com/openai/whisper/releases/tag/v20230918)
+* Add .pre-commit-config.yaml ([#1528](https://github.com/openai/whisper/pull/1528))
+* fix doc of TextDecoder ([#1526](https://github.com/openai/whisper/pull/1526))
+* Update model-card.md ([#1643](https://github.com/openai/whisper/pull/1643))
+* word timing tweaks ([#1559](https://github.com/openai/whisper/pull/1559))
+* Avoid rearranging all caches ([#1483](https://github.com/openai/whisper/pull/1483))
+* Improve timestamp heuristics. ([#1461](https://github.com/openai/whisper/pull/1461))
+* fix condition_on_previous_text ([#1224](https://github.com/openai/whisper/pull/1224))
+* Fix numba deprecation notice ([#1233](https://github.com/openai/whisper/pull/1233))
+* Updated README.md to provide more insight on BLEU and specific appendices ([#1236](https://github.com/openai/whisper/pull/1236))
+* Avoid computing higher temperatures on no_speech segments ([#1279](https://github.com/openai/whisper/pull/1279))
+* Dropped unused execute bit from mel_filters.npz. ([#1254](https://github.com/openai/whisper/pull/1254))
+* Drop ffmpeg-python dependency and call ffmpeg directly. ([#1242](https://github.com/openai/whisper/pull/1242))
+* Python 3.11 ([#1171](https://github.com/openai/whisper/pull/1171))
+* Update decoding.py ([#1219](https://github.com/openai/whisper/pull/1219))
+* Update decoding.py ([#1155](https://github.com/openai/whisper/pull/1155))
+* Update README.md to reference tiktoken ([#1105](https://github.com/openai/whisper/pull/1105))
+* Implement max line width and max line count, and make word highlighting optional ([#1184](https://github.com/openai/whisper/pull/1184))
+* Squash long words at window and sentence boundaries. ([#1114](https://github.com/openai/whisper/pull/1114))
+* python-publish.yml: bump actions version to fix node warning ([#1211](https://github.com/openai/whisper/pull/1211))
+* Update tokenizer.py ([#1163](https://github.com/openai/whisper/pull/1163))
+## [v20230314](https://github.com/openai/whisper/releases/tag/v20230314)
+* abort find_alignment on empty input ([#1090](https://github.com/openai/whisper/pull/1090))
+* Fix truncated words list when the replacement character is decoded ([#1089](https://github.com/openai/whisper/pull/1089))
+* fix github language stats getting dominated by jupyter notebook ([#1076](https://github.com/openai/whisper/pull/1076))
+* Fix alignment between the segments and the list of words ([#1087](https://github.com/openai/whisper/pull/1087))
+* Use tiktoken ([#1044](https://github.com/openai/whisper/pull/1044))
+## [v20230308](https://github.com/openai/whisper/releases/tag/v20230308)
+* kwargs in decode() for convenience ([#1061](https://github.com/openai/whisper/pull/1061))
+* fix all_tokens handling that caused more repetitions and discrepancy in JSON ([#1060](https://github.com/openai/whisper/pull/1060))
+* fix typo in CHANGELOG.md
+## [v20230307](https://github.com/openai/whisper/releases/tag/v20230307)
+* Fix the repetition/hallucination issue identified in #1046 ([#1052](https://github.com/openai/whisper/pull/1052))
+* Use triton==2.0.0 ([#1053](https://github.com/openai/whisper/pull/1053))
+* Install triton in x86_64 linux only ([#1051](https://github.com/openai/whisper/pull/1051))
+* update setup.py to specify python >= 3.8 requirement
+## [v20230306](https://github.com/openai/whisper/releases/tag/v20230306)
+* remove auxiliary audio extension ([#1021](https://github.com/openai/whisper/pull/1021))
+* apply formatting with `black`, `isort`, and `flake8` ([#1038](https://github.com/openai/whisper/pull/1038))
+* word-level timestamps in `transcribe()` ([#869](https://github.com/openai/whisper/pull/869))
+* Decoding improvements ([#1033](https://github.com/openai/whisper/pull/1033))
+* Update README.md ([#894](https://github.com/openai/whisper/pull/894))
+* Fix infinite loop caused by incorrect timestamp tokens prediction ([#914](https://github.com/openai/whisper/pull/914))
+* drop python 3.7 support ([#889](https://github.com/openai/whisper/pull/889))
+## [v20230124](https://github.com/openai/whisper/releases/tag/v20230124)
+* handle printing even if sys.stdout.buffer is not available ([#887](https://github.com/openai/whisper/pull/887))
+* Add TSV formatted output in transcript, using integer start/end time in milliseconds ([#228](https://github.com/openai/whisper/pull/228))
+* Added `--output_format` option ([#333](https://github.com/openai/whisper/pull/333))
+* Handle `XDG_CACHE_HOME` properly for `download_root` ([#864](https://github.com/openai/whisper/pull/864))
+* use stdout for printing transcription progress ([#867](https://github.com/openai/whisper/pull/867))
+* Fix bug where mm is mistakenly replaced with hmm in e.g. 20mm ([#659](https://github.com/openai/whisper/pull/659))
+* print '?' if a letter can't be encoded using the system default encoding ([#859](https://github.com/openai/whisper/pull/859))
+## [v20230117](https://github.com/openai/whisper/releases/tag/v20230117)
+The first versioned release available on [PyPI](https://pypi.org/project/openai-whisper/)

whisper/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 OpenAI
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

whisper/MANIFEST.in ADDED Viewed

	@@ -0,0 +1,5 @@

+include requirements.txt
+include README.md
+include LICENSE
+include whisper/assets/*
+include whisper/normalizers/english.json

whisper/README.md ADDED Viewed

	@@ -0,0 +1,147 @@

+# Whisper
+[[Blog]](https://openai.com/blog/whisper)
+[[Paper]](https://arxiv.org/abs/2212.04356)
+[[Model card]](https://github.com/openai/whisper/blob/main/model-card.md)
+[[Colab example]](https://colab.research.google.com/github/openai/whisper/blob/master/notebooks/LibriSpeech.ipynb)
+Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multitasking model that can perform multilingual speech recognition, speech translation, and language identification.
+## Approach
+![Approach](https://raw.githubusercontent.com/openai/whisper/main/approach.png)
+A Transformer sequence-to-sequence model is trained on various speech processing tasks, including multilingual speech recognition, speech translation, spoken language identification, and voice activity detection. These tasks are jointly represented as a sequence of tokens to be predicted by the decoder, allowing a single model to replace many stages of a traditional speech-processing pipeline. The multitask training format uses a set of special tokens that serve as task specifiers or classification targets.
+## Setup
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation. You can download and install (or update to) the latest release of Whisper with the following command:
+    pip install -U openai-whisper
+Alternatively, the following command will pull and install the latest commit from this repository, along with its Python dependencies:
+    pip install git+https://github.com/openai/whisper.git
+To update the package to the latest version of this repository, please run:
+    pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
+It also requires the command-line tool [`ffmpeg`](https://ffmpeg.org/) to be installed on your system, which is available from most package managers:
+```bash
+# on Ubuntu or Debian
+sudo apt update && sudo apt install ffmpeg
+# on Arch Linux
+sudo pacman -S ffmpeg
+# on MacOS using Homebrew (https://brew.sh/)
+brew install ffmpeg
+# on Windows using Chocolatey (https://chocolatey.org/)
+choco install ffmpeg
+# on Windows using Scoop (https://scoop.sh/)
+scoop install ffmpeg
+```
+You may need [`rust`](http://rust-lang.org) installed as well, in case [tiktoken](https://github.com/openai/tiktoken) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install the Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running:
+```bash
+pip install setuptools-rust
+```
+## Available models and languages
+There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed.
+|  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
+|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
+|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
+|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
+| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
+| medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
+| large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |
+The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
+Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model (The smaller the numbers, the better the performance). Additional WER scores corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4. Meanwhile, more BLEU (Bilingual Evaluation Understudy) scores can be found in Appendix D.3. Both are found in [the paper](https://arxiv.org/abs/2212.04356).
+![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg)
+## Command-line usage
+The following command will transcribe speech in audio files, using the `medium` model:
+    whisper audio.flac audio.mp3 audio.wav --model medium
+The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
+    whisper japanese.wav --language Japanese
+Adding `--task translate` will translate the speech into English:
+    whisper japanese.wav --language Japanese --task translate
+Run the following to view all available options:
+    whisper --help
+See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages.
+## Python usage
+Transcription can also be performed within Python:
+```python
+import whisper
+model = whisper.load_model("base")
+result = model.transcribe("audio.mp3")
+print(result["text"])
+```
+Internally, the `transcribe()` method reads the entire file and processes the audio with a sliding 30-second window, performing autoregressive sequence-to-sequence predictions on each window.
+Below is an example usage of `whisper.detect_language()` and `whisper.decode()` which provide lower-level access to the model.
+```python
+import whisper
+model = whisper.load_model("base")
+# load audio and pad/trim it to fit 30 seconds
+audio = whisper.load_audio("audio.mp3")
+audio = whisper.pad_or_trim(audio)
+# make log-Mel spectrogram and move to the same device as the model
+mel = whisper.log_mel_spectrogram(audio).to(model.device)
+# detect the spoken language
+_, probs = model.detect_language(mel)
+print(f"Detected language: {max(probs, key=probs.get)}")
+# decode the audio
+options = whisper.DecodingOptions()
+result = whisper.decode(model, mel, options)
+# print the recognized text
+print(result.text)
+```
+## More examples
+Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussions/categories/show-and-tell) category in Discussions for sharing more example usages of Whisper and third-party extensions such as web demos, integrations with other tools, ports for different platforms, etc.
+## License
+Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details.

whisper/approach.png ADDED Viewed

whisper/data/README.md ADDED Viewed

	@@ -0,0 +1,118 @@

+This directory supplements the paper with more details on how we prepared the data for evaluation, to help replicate our experiments.
+## Short-form English-only datasets
+### LibriSpeech
+We used the test-clean and test-other splits from the [LibriSpeech ASR corpus](https://www.openslr.org/12).
+### TED-LIUM 3
+We used the test split of [TED-LIUM Release 3](https://www.openslr.org/51/), using the segmented manual transcripts included in the release.
+### Common Voice 5.1
+We downloaded the English subset of Common Voice Corpus 5.1 from [the official website](https://commonvoice.mozilla.org/en/datasets).
+### Artie
+We used the [Artie bias corpus](https://github.com/artie-inc/artie-bias-corpus). This is a subset of the Common Voice dataset.
+### CallHome & Switchboard
+We used the two corpora from [LDC2002S09](https://catalog.ldc.upenn.edu/LDC2002S09) and [LDC2002T43](https://catalog.ldc.upenn.edu/LDC2002T43) and followed the [eval2000_data_prep.sh](https://github.com/kaldi-asr/kaldi/blob/master/egs/fisher_swbd/s5/local/eval2000_data_prep.sh) script for preprocessing. The `wav.scp` files can be converted to WAV files with the following bash commands:
+```bash
+mkdir -p wav
+while read name cmd; do
+    echo $name
+    echo ${cmd/\|/} wav/$name.wav | bash
+done < wav.scp
+```
+### WSJ
+We used [LDC93S6B](https://catalog.ldc.upenn.edu/LDC93S6B) and [LDC94S13B](https://catalog.ldc.upenn.edu/LDC94S13B) and followed the [s5 recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/wsj/s5) to preprocess the dataset.
+### CORAAL
+We used the 231 interviews from [CORAAL (v. 2021.07)](https://oraal.uoregon.edu/coraal) and used the segmentations from [the FairSpeech project](https://github.com/stanford-policylab/asr-disparities/blob/master/input/CORAAL_transcripts.csv).
+### CHiME-6
+We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challenge/CHiME5/download.html) and followed the stage 0 of the [s5_track1 recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/chime6/s5_track1) to create the CHiME-6 dataset which fixes synchronization. We then used the binaural recordings (`*_P??.wav`) and the corresponding transcripts.
+### AMI-IHM, AMI-SDM1
+We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following stages 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
+## Long-form English-only datasets
+### TED-LIUM 3
+To create a long-form transcription dataset from the [TED-LIUM3](https://www.openslr.org/51/) dataset, we sliced the audio between the beginning of the first labeled segment and the end of the last labeled segment of each talk, and we used the concatenated text as the label. Below are the timestamps used for slicing each of the 11 TED talks in the test split.
+| Filename            | Begin time (s) | End time (s) |
+|---------------------|----------------|--------------|
+| DanBarber_2010      | 16.09          | 1116.24      |
+| JaneMcGonigal_2010  | 15.476         | 1187.61      |
+| BillGates_2010      | 15.861         | 1656.94      |
+| TomWujec_2010U      | 16.26          | 402.17       |
+| GaryFlake_2010      | 16.06          | 367.14       |
+| EricMead_2009P      | 18.434         | 536.44       |
+| MichaelSpecter_2010 | 16.11          | 979.312      |
+| DanielKahneman_2010 | 15.8           | 1199.44      |
+| AimeeMullins_2009P  | 17.82          | 1296.59      |
+| JamesCameron_2010   | 16.75          | 1010.65      |
+| RobertGupta_2010U   | 16.8           | 387.03       |
+### Meanwhile
+This dataset consists of 64 segments from The Late Show with Stephen Colbert. The YouTube video ID, start and end timestamps, and the labels can be found in [meanwhile.json](meanwhile.json). The labels are collected from the closed-caption data for each video and corrected with manual inspection.
+### Rev16
+We use a subset of 16 files from the 30 podcast episodes in [Rev.AI's Podcast Transcription Benchmark](https://www.rev.ai/blog/podcast-transcription-benchmark-part-1/), after finding that there are multiple cases where a significant portion of the audio and the labels did not match, mostly on the parts introducing the sponsors. We selected 16 episodes that do not have this error, whose "file numbers" are:
+    3 4 9 10 11 14 17 18 20 21 23 24 26 27 29 32
+### Kincaid46
+This dataset consists of 46 audio files and the corresponding transcripts compiled in the blog article [Which automatic transcription service is the most accurate - 2018](https://medium.com/descript/which-automatic-transcription-service-is-the-most-accurate-2018-2e859b23ed19) by Jason Kincaid. We used the 46 audio files and reference transcripts from the Airtable widget in the article.
+For the human transcription benchmark in the paper, we use a subset of 25 examples from this data, whose "Ref IDs" are:
+    2 4 5 8 9 10 12 13 14 16 19 21 23 25 26 28 29 30 33 35 36 37 42 43 45
+### Earnings-21, Earnings-22
+For these datasets, we used the files available in [the speech-datasets repository](https://github.com/revdotcom/speech-datasets), as of their `202206` version.
+### CORAAL
+We used the 231 interviews from [CORAAL (v. 2021.07)](https://oraal.uoregon.edu/coraal) and used the full-length interview files and transcripts.
+## Multilingual datasets
+### Multilingual LibriSpeech
+We used the test splits from each language in [the Multilingual LibriSpeech (MLS) corpus](https://www.openslr.org/94/).
+### Fleurs
+We collected audio files and transcripts using the implementation available as [HuggingFace datasets](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py). To use as a translation dataset, we matched the numerical utterance IDs to find the corresponding transcript in English.
+### VoxPopuli
+We used the `get_asr_data.py` script from [the official repository](https://github.com/facebookresearch/voxpopuli) to collect the ASR data in 14 languages.
+### Common Voice 9
+We downloaded the Common Voice Corpus 9 from [the official website](https://commonvoice.mozilla.org/en/datasets).
+### CoVOST 2
+We collected the `X into English` data using [the official repository](https://github.com/facebookresearch/covost).

whisper/data/meanwhile.json ADDED Viewed

	@@ -0,0 +1,322 @@

+{
+    "1YOmY-Vjy-o": {
+        "begin": "1:04.0",
+        "end": "2:11.0",
+        "text": "FOLKS, IF YOU WATCH THE SHOW,\nYOU KNOW I SPEND A LOT OF TIME\nRIGHT OVER THERE, PATIENTLY AND\nASTUTELY SCRUTINIZING THE\nBOXWOOD AND MAHOGANY CHESS SET\nOF THE DAY'S BIGGEST STORIES,\nDEVELOPING THE CENTRAL\nHEADLINE-PAWNS, DEFTLY\nMANEUVERING AN OH-SO-TOPICAL\nKNIGHT TO F-6, FEIGNING A\nCLASSIC SICILIAN-NAJDORF\nVARIATION ON THE NEWS, ALL THE\nWHILE, SEEING EIGHT MOVES DEEP\nAND PATIENTLY MARSHALING THE\nLATEST PRESS RELEASES INTO A\nFISCHER SOZIN LIPNITZKY ATTACK\nTHAT CULMINATES IN THE ELEGANT,\nLETHAL, SLOW-PLAYED EN PASSANT\nCHECKMATE THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nFOLKS-- I,\nSOMETIMES,\nI STARTLE AWAKE UPSIDE DOWN ON\nTHE MONKEY BARS OF A CONDEMNED\nPLAYGROUND ON A SUPERFUND SITE,\nGET ALL HEPPED UP ON GOOFBALLS,\nRUMMAGE THROUGH A DISCARDED TAG\nBAG OF DEFECTIVE TOYS, YANK\nOUT A FISTFUL OF DISEMBODIED\nDOLL LIMBS, TOSS THEM ON A\nSTAINED KID'S PLACEMAT FROM A\nDEFUNCT DENNY'S, SET UP A TABLE\nINSIDE A RUSTY CARGO CONTAINER\nDOWN BY THE WHARF, AND CHALLENGE\nTOOTHLESS DRIFTERS TO THE\nGODLESS BUGHOUSE BLITZ\nOF TOURNAMENT OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!\n"
+    },
+    "3P_XnxdlXu0": {
+        "begin": "2:08.3",
+        "end": "3:02.3",
+        "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, NIGHT AFTER NIGHT ACTUALLY, CAREFULLY\nSELECTING FOR YOU THE DAY'S NEWSIEST,\nMOST AERODYNAMIC HEADLINES,\nSTRESS TESTING THE MOST TOPICAL\nANTILOCK BRAKES AND POWER\nSTEERING, PAINSTAKINGLY\nSTITCHING LEATHER SEATING SO\nSOFT, IT WOULD MAKE J.D. POWER\nAND HER ASSOCIATES BLUSH, TO\nCREATE THE LUXURY SEDAN THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I LURCH TO CONSCIOUSNESS\nIN THE BACK OF AN ABANDONED\nSCHOOL BUS AND SLAP MYSELF AWAKE\nWITH A CRUSTY FLOOR MAT BEFORE\nUSING A MOUSE-BITTEN TIMING BELT\nTO STRAP SOME OLD PLYWOOD TO A\nCOUPLE OF DISCARDED OIL DRUMS.\nTHEN, BY THE LIGHT OF A HEATHEN\nMOON, RENDER A GAS TANK OUT OF\nAN EMPTY BIG GULP, FILL IT WITH\nWHITE CLAW AND DENATURED\nALCOHOL, THEN LIGHT A MATCH AND\nLET HER RIP, IN THE DEMENTED\nONE-MAN SOAP BOX DERBY OF NEWS\nTHAT IS MY SEGMENT: MEANWHILE!"
+    },
+    "3elIlQzJEQ0": {
+        "begin": "1:08.5",
+        "end": "1:58.5",
+        "text": "LADIES AND GENTLEMEN, YOU KNOW, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nRAISING THE FINEST HOLSTEIN NEWS\nCATTLE, FIRMLY, YET TENDERLY,\nMILKING THE LATEST HEADLINES\nFROM THEIR JOKE-SWOLLEN TEATS,\nCHURNING THE DAILY STORIES INTO\nTHE DECADENT, PROVENCAL-STYLE\nTRIPLE CREME BRIE THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI STAGGER HOME HUNGRY AFTER\nBEING RELEASED BY THE POLICE,\nAND ROOT AROUND IN THE NEIGHBOR'S\nTRASH CAN FOR AN OLD MILK\nCARTON, SCRAPE OUT THE BLOOMING\nDAIRY RESIDUE ONTO THE REMAINS\nOF A WET CHEESE RIND I WON\nFROM A RAT IN A PRE-DAWN STREET\nFIGHT, PUT IT IN A DISCARDED\nPAINT CAN, AND LEAVE IT TO\nFERMENT NEXT TO A TRASH FIRE,\nTHEN HUNKER DOWN AND HALLUCINATE\nWHILE EATING THE LISTERIA-LADEN\nDEMON CUSTARD OF NEWS THAT IS\nMY SEGMENT: MEANWHILE!"
+    },
+    "43P4q1KGKEU": {
+        "begin": "0:29.3",
+        "end": "1:58.3",
+        "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW I SPEND MOST\nOF MY TIME, RIGHT OVER THERE.\nCAREFULLY SORTING THROUGH THE\nDAY'S BIGGEST STORIES, AND\nSELECTING ONLY THE MOST\nSUPPLE AND UNBLEMISHED OSTRICH\nAND CROCODILE NEWS LEATHER,\nWHICH I THEN ENTRUST TO ARTISAN\nGRADUATES OF THE \"ECOLE\nGREGOIRE-FERRANDI,\" WHO\nCAREFULLY DYE THEM IN A PALETTE\nOF BRIGHT, ZESTY SHADES, AND\nADORN THEM WITH THE FINEST, MOST\nTOPICAL INLAY WORK USING HAND\nTOOLS AND DOUBLE MAGNIFYING\nGLASSES, THEN ASSEMBLE THEM\nACCORDING TO NOW CLASSIC AND\nELEGANT GEOMETRY USING OUR\nSIGNATURE SADDLE STITCHING, AND\nLINE IT WITH BEESWAX-COATED\nLINEN, AND FINALLY ATTACH A\nMALLET-HAMMERED STRAP, PEARLED\nHARDWARE, AND A CLOCHETTE TO\nCREATE FOR YOU THE ONE-OF-A-KIND\nHAUTE COUTURE HERMES BIRKIN BAG\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, SOMETIMES I WAKE UP IN THE\nLAST CAR OF AN ABANDONED ROLLER\nCOASTER AT CONEY ISLAND, WHERE\nI'M HIDING FROM THE TRIADS, I\nHUFF SOME ENGINE LUBRICANTS OUT\nOF A SAFEWAY BAG AND STAGGER\nDOWN THE SHORE TO TEAR THE SAIL\nOFF A BEACHED SCHOONER, THEN I\nRIP THE CO-AXIAL CABLE OUT OF\nTHE R.V. OF AN ELDERLY COUPLE\nFROM UTAH, HANK AND MABEL,\nLOVELY FOLKS, AND USE IT TO\nSTITCH THE SAIL INTO A LOOSE,\nPOUCH-LIKE RUCKSACK, THEN I\nSTOW AWAY IN THE BACK OF A\nGARBAGE TRUCK TO THE JUNK YARD\nWHERE I PICK THROUGH THE DEBRIS\nFOR ONLY THE BROKEN TOYS THAT\nMAKE ME THE SADDEST UNTIL I HAVE\nLOADED, FOR YOU, THE HOBO\nFUGITIVE'S BUG-OUT BINDLE OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "4ktyaJkLMfo": {
+        "begin": "0:42.5",
+        "end": "1:26.5",
+        "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF TIME CRAFTING FOR YOU A\nBESPOKE PLAYLIST OF THE DAY'S\nBIGGEST STORIES, RIGHT OVER THERE, METICULOUSLY\nSELECTING THE MOST TOPICAL\nCHAKRA-AFFIRMING SCENTED\nCANDLES, AND USING FENG SHUI TO\nPERFECTLY ALIGN THE JOKE ENERGY\nIN THE EXCLUSIVE BOUTIQUE YOGA\nRETREAT THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nI GO TO THE DUMPSTER BEHIND THE\nWAFFLE HOUSE AT 3:00 IN THE\nMORNING, TAKE OFF MY SHIRT,\nCOVER MYSELF IN USED FRY OIL,\nWRAP MY HANDS IN SOME OLD DUCT\nTAPE I STOLE FROM A BROKEN CAR\nWINDOW, THEN POUND A SIX-PACK OF\nBLUEBERRY HARD SELTZER AND A\nSACK OF PILLS I STOLE FROM A\nPARKED AMBULANCE, THEN\nARM-WRESTLE A RACCOON IN THE\nBACK ALLEY VISION QUEST OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "5Dsh9AgqRG0": {
+        "begin": "1:06.0",
+        "end": "2:34.0",
+        "text": "YOU KNOW, FOLKS, I SPEND MOST OF\nMY TIME RIGHT OVER THERE, MINING\nTHE DAY'S BIGGEST, MOST\nIMPORTANT STORIES, COLLECTING\nTHE FINEST, MOST TOPICAL IRON\nORE, HAND HAMMERING IT INTO JOKE\nPANELS.\nTHEN I CRAFT SHEETS OF BRONZE\nEMBLAZONED WITH PATTERNS THAT\nTELL AN EPIC TALE OF CONQUEST\nAND GLORY.\nTHEN, USING THE GERMANIC\nTRADITIONAL PRESSBLECH\nPROCESS, I PLACE THIN SHEETS OF\nFOIL AGAINST THE SCENES, AND BY\nHAMMERING OR OTHERWISE,\nAPPLYING PRESSURE FROM THE BACK,\nI PROJECT THESE SCENES INTO A\nPAIR OF CHEEK GUARDS AND A\nFACEPLATE.\nAND, FINALLY, USING FLUTED\nSTRIPS OF WHITE ALLOYED\nMOULDING, I DIVIDE THE DESIGNS\nINTO FRAMED PANELS AND HOLD IT\nALL TOGETHER USING BRONZE RIVETS\nTO CREATE THE BEAUTIFUL AND\nINTIMIDATING ANGLO-SAXON\nBATTLE HELM THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, JUST SOMETIMES, I COME TO MY SENSES FULLY NAKED\nON THE DECK OF A PIRATE-BESIEGED\nMELEE CONTAINER SHIP THAT PICKED\nME UP FLOATING ON THE DETACHED\nDOOR OF A PORT-A-POTTY IN THE\nINDIAN OCEAN.\nTHEN, AFTER A SUNSTROKE-INDUCED\nREALIZATION THAT THE CREW OF\nTHIS SHIP PLANS TO SELL ME IN\nEXCHANGE FOR A BAG OF ORANGES TO\nFIGHT OFF SCURVY, I LEAD A\nMUTINY USING ONLY A P.V.C. PIPE\nAND A POOL CHAIN.\nTHEN, ACCEPTING MY NEW ROLE AS\nCAPTAIN, AND DECLARING MYSELF\nKING OF THE WINE-DARK SEAS, I\nGRAB A DIRTY MOP BUCKET COVERED\nIN BARNACLES AND ADORN IT WITH\nTHE TEETH OF THE VANQUISHED, TO\nCREATE THE SOPPING WET PIRATE\nCROWN OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\" "
+    },
+    "748OyesQy84": {
+        "begin": "0:40.0",
+        "end": "1:41.0",
+        "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW, I SPEND MOST OF\nMY TIME, RIGHT OVER THERE,\nCAREFULLY BLENDING FOR YOU THE\nDAY'S NEWSIEST, MOST TOPICAL\nFLOUR, EGGS, MILK, AND BUTTER,\nAND STRAINING IT INTO A FINE\nBATTER TO MAKE DELICATE, YET\nINFORMATIVE COMEDY PANCAKES.\nTHEN I GLAZE THEM IN THE JUICE\nAND ZEST OF THE MOST RELEVANT\nMIDNIGHT VALENCIA ORANGES, AND\nDOUSE IT ALL IN A FINE DELAMAIN\nDE VOYAGE COGNAC, BEFORE\nFLAMBEING AND BASTING THEM TABLE\nSIDE, TO SERVE FOR YOU THE JAMES\nBEARD AWARD-WORTHY CREPES\nSUZETTE THAT IS MY NIGHTLY,\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I WAKE UP IN THE\nBAGGAGE HOLD OF A GREYHOUND BUS\nAS IT'S BEING HOISTED BY THE\nSCRAPYARD CLAW TOWARD THE BURN\nPIT, ESCAPE TO A NEARBY\nABANDONED PRICE CHOPPER, WHERE I\nSCROUNGE FOR OLD BREAD SCRAPS,\nBUSTED OPEN BAGS OF STAR FRUIT\nCANDIES, AND EXPIRED EGGS,\nCHUCK IT ALL IN A DIRTY HUBCAP\nAND SLAP IT OVER A TIRE FIRE\nBEFORE USING THE LEGS OF A\nSTAINED PAIR OF SWEATPANTS AS\nOVEN MITTS TO EXTRACT AND SERVE\nTHE DEMENTED TRANSIENT'S POUND\nCAKE OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE.\""
+    },
+    "8prs9Pq5Xhk": {
+        "begin": "1:18.5",
+        "end": "2:17.5",
+        "text": "FOLKS, IF YOU WATCH THE SHOW,\nAND I HOPE YOU DO, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nTIRELESSLY STUDYING THE LINEAGE\nOF THE DAY'S MOST IMPORTANT\nTHOROUGHBRED STORIES AND\nHOLSTEINER HEADLINES, WORKING\nWITH THE BEST TRAINERS MONEY CAN\nBUY TO REAR THEIR COMEDY\nOFFSPRING WITH A HAND THAT IS\nSTERN, YET GENTLE, INTO THE\nTRIPLE-CROWN-WINNING EQUINE\nSPECIMEN THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS\nI BREAK INTO AN UNINCORPORATED\nVETERINARY GENETICS LAB AND GRAB\nWHATEVER TEST TUBES I CAN FIND.\nAND THEN, UNDER A GROW LIGHT I GOT\nFROM A DISCARDED CHIA PET, I MIX\nTHE PILFERED D.N.A. OF A HORSE\nAND WHATEVER WAS IN A TUBE\nLABELED \"KEITH-COLON-EXTRA,\"\nSLURRYING THE CONCOCTION WITH\nCAFFEINE PILLS AND A MICROWAVED\nRED BULL, I SCREAM-SING A PRAYER\nTO JANUS, INITIATOR OF HUMAN\nLIFE AND GOD OF TRANSFORMATION\nAS A HALF-HORSE, HALF-MAN FREAK,\nSEIZES TO LIFE BEFORE ME IN THE\nHIDEOUS COLLECTION OF LOOSE\nANIMAL PARTS AND CORRUPTED MAN\nTISSUE THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "9gX4kdFajqE": {
+        "begin": "0:44.0",
+        "end": "1:08.0",
+        "text": "FOLKS, IF YOU WATCH THE SHOW,\nYOU KNOW SOMETIMES I'M OVER\nTHERE DOING THE MONOLOGUE.\nAND THEN THERE'S A COMMERCIAL\nBREAK, AND THEN I'M SITTING\nHERE.\nAND I DO A REALLY LONG\nDESCRIPTION OF A DIFFERENT\nSEGMENT ON THE SHOW, A SEGMENT\nWE CALL... \"MEANWHILE!\""
+    },
+    "9ssGpE9oem8": {
+        "begin": "0:00.0",
+        "end": "0:58.0",
+        "text": "WELCOME BACK, EVERYBODY.\nYOU KNOW, FOLKS, I SPEND MOST OF\nMY TIME RIGHT OVER THERE,\nCOMBING OVER THE DAY'S NEWS,\nSELECTING ONLY THE HIGHEST\nQUALITY AND MOST TOPICAL BONDED\nCALFSKIN-LEATHER STORIES,\nCAREFULLY TANNING THEM AND CUTTING\nTHEM WITH MILLIMETER PRECISION,\nTHEN WEAVING IT TOGETHER IN\nA DOUBLE-FACED INTRECCIATO\nPATTERN TO CREATE FOR YOU THE\nEXQUISITE BOTTEGA VENETA CLUTCH\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, WHILE AT A RAVE\nIN A CONDEMNED CEMENT FACTORY, I\nGET INJECTED WITH A MYSTERY\nCOCKTAIL OF HALLUCINOGENS AND\nPAINT SOLVENTS, THEN, OBEYING\nTHE VOICES WHO WILL STEAL MY\nTEETH IF I DON'T, I STUMBLE INTO\nA SHIPYARD WHERE I RIP THE\nCANVAS TARP FROM A GRAVEL TRUCK,\nAND TIE IT OFF WITH THE ROPE FROM A\nROTTING FISHING NET, THEN WANDER\nA FOOD COURT, FILLING IT WITH\nWHAT I THINK ARE GOLD COINS BUT\nARE, IN FACT, OTHER PEOPLE'S CAR\nKEYS, TO DRAG AROUND THE\nROOTLESS TRANSIENT'S CLUSTER\nSACK OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "ARw4K9BRCAE": {
+        "begin": "0:26.0",
+        "end": "1:16.0",
+        "text": "YOU KNOW, FOLKS, I SPEND\nMOST OF MY TIME STANDING RIGHT OVER\nTHERE,\nGOING OVER THE DAY'S NEWS\nAND SELECTING THE FINEST,\nMOST\nTOPICAL CARBON FIBER\nSTORIES, SHAPING THEM IN DRY\nDOCK INTO A\nSLEEK AND SEXY HULL, KITTING\nIT OUT WITH THE MOST TOPICAL\nFIBERGLASS AND TEAK FITTINGS\nBRASS RAILINGS, HOT TUBS,\nAND\nNEWS HELIPADS, TO CREATE THE\nCUSTOM DESIGNED, GLEAMING\nMEGA-YACHT THAT IS MY NIGHTLY\nMONOLOGUE. BUT SOMETIMES, JUST SOMETIMES FOLKS, I\nWASH ASHORE AT\nAN ABANDONED BEACH RESORT\nAFTER A NIGHT OF BATH SALTS\nAND\nSCOPOLAMINE, LASH SOME\nROTTING PICNIC TABLES\nTOGETHER, THEN\nDREDGE THE NEWS POND TO\nHAUL UP WHATEVER DISCARDED\nTRICYCLES AND BROKEN\nFRISBEES I CAN FIND, STEAL\nAN EYE PATCH\nFROM A HOBO, STAPLE A DEAD\nPIGEON TO MY SHOULDER, AND\nSAIL\nINTO INTERNATIONAL WATERS\nON THE PIRATE GARBAGE SCOW\nOF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "B1DRmrOlKtY": {
+        "begin": "1:34.0",
+        "end": "2:17.0",
+        "text": "FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE, OKAY,\nHANDPICKING ONLY THE RIPEST\nMOST TOPICAL DAILY HEADLINES,\nSEEKING OUT THE SWEETEST, MOST\nREFINED OF JEST-JAMS AND\nJOKE-JELLIES, CURATING A PLATE\nOF LOCAL COMEDY-FED MEATS AND\nSATIRICAL CHEESES TO LAY OUT THE\nARTISANAL CHARCUTERIE BOARD\nA-NEWS BOUCHE THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, I WAKE UP IN THE\nGREASE TRAP OF AN\nUNLICENSED SLAUGHTERHOUSE,\nSPLASH MY FACE WITH SOME BEEF TALLOW,\nRENDER A RUDIMENTARY PRESERVE\nFROM BONE MARROW AND MELTED\nGUMMY WORMS, AND THROW IT\nTOGETHER WITH SOME SWEETBREADS\nAND TRIPE TO PRESENT THE\nPLOWMAN'S PLATTER OF UNCLAIMED\nOFFAL THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "BT950jqCCUY": {
+        "begin": "1:00.0",
+        "end": "2:09.0",
+        "text": "YOU KNOW, FOLKS,\nI SPEND SO MUCH OF MY TIME RIGHT\nOVER THERE, SIFTING THROUGH THE\nDAY'S BIGGEST STORIES, HAND\nSELECTING ONLY THE FINEST, MOST\nPERFECTLY AGED BURMESE NEWS\nTEAK.\nTHEN CAREFULLY CARVING AND\nSHAPING IT INTO A REFINED\nBUDDHA, DEPICTED IN THE \"CALLING\nTHE EARTH TO WITNESS\" GESTURE,\nWITH CHARACTERISTIC CIRCULAR\nPATTERNS ON THE ROBE, SHINS, AND\nKNEES, OF COURSE, WHICH I\nTHEN CAREFULLY GILD WITH THE\nMOST TOPICAL GOLD LEAF.\nTHEN I HARVEST THE SAP OF A\nUSITATA TREE TO APPLY THE DELICATE\nTAI O LACQUER, AND FINALLY HAND\nDECORATE IT WITH THE WHITE GLASS\nINLAYS TO CREATE FOR YOU THE\nGLORIOUS AMARA-PURA PERIOD\nSCULPTURE THAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS,\nI WASH UP ON A GALVESTON\nBEACH, NAKED ON A RAFT OF GAS\nCANS AFTER ESCAPING FROM A FIGHT CLUB\nIN INTERNATIONAL WATERS.\nTHEN, STILL DERANGED ON A\nCOCKTAIL OF MESCALINE AND COUGH\nSYRUP, I STEAL A CINDER BLOCK\nFROM UNDER A STRIPPED '87 FORD\nTAURUS,  AND CHIP AWAY AT IT\nWITH A BROKEN UMBRELLA HANDLE I\nSCAVENGED FROM A GOODWILL\nDUMPSTER UNTIL IT VAGUELY\nRESEMBLES THE HUNGERING WOLF\nTHAT SCRATCHES AT THE DOOR\nOF MY DREAMS AND PRESENT TO YOU\nTHE TORMENTED DREAD-EFFIGY OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "C0e8XM30tQI": {
+        "begin": "0:57.0",
+        "end": "2:40.0",
+        "text": "YOU KNOW, IF YOU WATCH THE SHOW, YOU'RE AWARE THAT I SPEND MOST OF\nMY TIME RIGHT OVER THERE,\nWANDERING THE NEWS FOREST FOR\nYOU, FELLING ONLY THE BIGGEST\nAND HEARTIEST WHITE STORY OAKS,\nCUTTING AND SHAPING THEM INTO\nTHE NEWSIEST, MOST TOPICAL\nCLEATS, CLAMPS AND PLANKS,\nKEEPING THEM AT A CONSTANT\nANGLE, GRADUALLY CREATING A\nSHELL-SHAPED, SHALLOW BOWL HULL\nUSING THE FIRE-BENDING TECHNIQUE\nINSTEAD OF STEAM-BENDING\nOBVIOUSLY.\nTHEN I LAY OUT ALL THE KEEL\nBLOCKS TO CAREFULLY SET UP THE\nSTEM, STERN AND, GARBOARD,\nATTACH THE BILGE FUTTOCKS TO THE\nTIMBER AND LOVINGLY CRAFT A FLAT\nTRANSOM STERN OUT OF\nNATURALLY-CURVED QUARTER\nCIRCLES.\nTHEN SECURE ALL THE PLANKS WITH\nTRUNNELS HANDMADE FROM THE\nFINEST LOCUST WOOD AND, FINALLY,\nADORN IT WITH A PROUD BOWSPRIT,\nFOREPEAK, AND CUSTOM GILDED\nFIGUREHEAD TO PRESENT TO YOU THE\nDUTCH GOLDEN AGE \"SPIEGEL-JACHT\"\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nFOLKS-- YOU GOT TO HYDRATE\nAFTER THAT.\n\"SPIEGEL-JACHT\"\nBUT SOMETIMES, I\nAWAKEN FROM A MEAT-SWEAT-\nINDUCED FEVER STRAPPED TO A\nBASKET ON THE WONDER WHEEL AT\nCONEY ISLAND, STUMBLE ACROSS THE\nGARBAGE-FLECKED BEACH TO THE\nSOUND OF A TERRIFYING RAGGED\nBELLOW I REALIZE IS COMING FROM\nMY OWN LUNGS, WHICH THEN SUMMONS AN\nARMY OF SEAGULLS WHOM I INSTRUCT\nTO GATHER THE HALF-EMPTIED CANS\nOF BUSCH LIGHT LEFT BY A MOB OF\nBELGIAN TOURISTS, ALL OF WHICH\nI GATHER INTO A SACK I FASHIONED\nOUT OF PANTS I STOLE FROM A\nSLEEPING COP. THEN I SWIPE A\nGIANT INFLATABLE BABY YODA FROM\nA CARNY GAME, STRAP IT TO THE\nMAST I MADE BY RIPPING A B-68\nBUS STOP SIGN OUT OF THE GROUND\nON THE CORNER OF STILLWELL, AND\nLAUNCH THE VESSEL AS CAPTAIN OF\nTHE UNREGULATED PIRATE BOOZE\nCRUISE OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE\"!"
+    },
+    "CKsASCGr_4A": {
+        "begin": "1:48.3",
+        "end": "2:37.3",
+        "text": "FOLKS, YOU  KNOW, I SPEND A LOT\nOF TIME RIGHT OVER THERE, CARVING THE\nFINEST, MOST-TOPICAL JAPANESE\nNEWS CYPRESS INTO AN EXPRESSIVE\nHANNYA DEMON MASK, DONNING MY\nBLACK AND GOLD SHOZOKU ROBES\nMADE OF THE SMOOTHEST STORY\nSILK, AND MASTERING THE ELABORATE\nCHOREOGRAPHY OF STILLNESS AND\nMOVEMENT, TO CREATE THE JAPANESE\nNOH THEATER PRODUCTION THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I ENTER A FUGUE STATE IN\nTHE MIDDLE OF THE NIGHT, SIPHON\nA BUCKETFUL OF GASOLINE OUT OF MY\nNEIGHBOR'S MAZDA, RUN BAREFOOT\nFROM MY HOUSE TO A HOVEL UNDER\nTHE TURNPIKE WHERE I ASK A HOBO\nFOR A LIGHT AND SET A GARBAGE\nCAN ABLAZE, THEN PLACE MY\nFROSTBITTEN HANDS IN FRONT OF\nTHE DUMPSTER TO PROJECT THE\nSHADOW PUPPET WINO OPERA OF NEWS\nTHAT IS MY SEGMENT, \"MEANWHILE.\""
+    },
+    "DSc26qAJp_g": {
+        "begin": "0:28.0",
+        "end": "1:46.0",
+        "text": "FOLKS, I SPEND MOST OF MY TIME\nRIGHT OVER THERE, ASSEMBLING THE\nDAY'S BIGGEST, MOST-IMPORTANT\nSTORIES, THEN HAND-SHAPING THEM\nINTO SLEEK, ELEGANT BODYWORK,\nWHICH I LINE WITH ONLY THE\nFINEST, MOST TOPICAL POLISHED\nMACASSAR EBONY AND OPEN-PORE\nPALDAO VENEER, ADDING LIGHT\nMOCCASIN AND DARK SPICE LEATHER\nSEATS, AND MALABAR TEAK WOOD TO\nSET OFF A SCULPTED MINIMALIST\nSWITCHGEAR, ACCOMPANIED BY A\nSTERLING SILVER HUMIDOR AND\nCHAMPAGNE CELLARETTE, THEN\nHAND-SET 1,600 FIBER OPTIC\nLIGHTS ALIGNED WITH PINPOINT\nPERFORATIONS IN THE ROOF-LINING,\nTO CREATE FOR YOU THE BESPOKE\nCOACH BUILT ROLLS-ROYCE SWEPTAIL\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nSOMETIMES, I JUST, I JUST SHRIEK AWAKE IN THE\nSCRAPYARD OF A DERELICT MACHINE\nSHOP, SCAVENGE A\n3.6-HORSEPOWER BRIGGS AND STRATTON\nLAWN MOWER ENGINE, BULLY A\nDOG INTO GIVING ME ITS FRISBEE\nTO USE AS A STEERING WHEEL, THEN\nI BRIEFLY CONSIDER-- BUT DECIDE\nAGAINST-- SWIPING THE BRAKE PADS\nOFF AN UNATTENDED HEARSE,\nBECAUSE WHERE I'M GOING, WE DON'T\nNEED BRAKES.\nI HOOK IT ALL UP TO A RUSTY\nDOLLAR TREE SHOPPING CART,\nSHOTGUN A WHITE CLAW, NO LAW ON THE CLAW, SPRAY\nPAINT MY TEETH, AND BLAST OFF IN\nTHE \"FURY ROAD\" THUG-BUGGY OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
+    },
+    "DhuCyncmFgM": {
+        "begin": "0:43.5",
+        "end": "1:30.5",
+        "text": "YOU KNOW, FOLKS, I\nSPEND A LOT OF TIME STANDING RIGHT\nOVER THERE, COMBING THROUGH\nTHE DAY'S\nSTORIES TO FIND AND ERECT\nFOR YOU THE FINEST GRECIAN\nNEWS\nCOLUMNS, ADORNING THEM WITH\nTHE MOST UP-TO-THE-MINUTE\nBAS-RELIEF.\nAND THEN I IMPART MY MOST\nTOPICAL\nTEACHINGS TO BE ABSORBED BY\nEAGER, SPONGE-LIKE\nMINDS IN THE\nAUGUST ATHENIAN ACADEMY THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST\nSOMETIMES, FOLKS, I COME TO IN A\nDRIED-OUT BABY POOL\nON AN ABANDONED RANCH I WON\nIN A RUSSIAN ROULETTE GAME\nOFF THE COAST OF MOZAMBIQUE\nI GATHER TUMBLEWEEDS\nAND I LASH THEM TOGETHER WITH\nSOME TWINE I FOUND IN A DUMPSTER\nBY A BURNED-OUT REST STOP,\nTHEN I\nSHOTGUN A HOT MONSTER ENERGY\nDRINK AND CHEW ON MACA ROOT\nAS I\nHALLUCINATE THROUGH THE\nNIGHT IN THE VAGRANT'S HOT\n-BOX YURT OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "EnGHyZS4f-8": {
+        "begin": "1:33.0",
+        "end": "2:12.0",
+        "text": "YOU KNOW, FOLKS, I'VE SPENT\nDECADES CULTIVATING THE MOST\nRELEVANT RED OAK, FELLING THEM\nWITH TOPICAL HUSQVARNAS,\nMULTIGRADING THEM INTO THE MOST\nBUZZWORTHY THREE-QUARTER INCH\nFLOORSTRIPS, AND FINISHING THEM\nWITH UP-TO-THE-MINUTE HIGH-GLOSS\nPOLYURETHANE, TO LAY FOR YOU THE\nFLAWLESS PARQUET N.B.A.-QUALITY\nBASKETBALL COURT THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, I SCARE THE GEESE\nOUT OF THE BOG BEHIND MY UNCLE'S\nSHED, FILL IT WITH SAND I STOLE\nFROM AN ABANDONED PLAYGROUND,\nTHEN BLANKET IT WITH WET TRASH\nAND DISCARDED AMAZON BOXES, TO\nCREATE FOR YOU THE\nMUSKRAT-RIDDLED BOUNCELESS\nBACKYARD WALLBALL PIT OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "G8ajua4Mb5I": {
+        "begin": "1:39.0",
+        "end": "2:24.0",
+        "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THIS SHOW AND I\nHOPE YOU DO,\nI SPEND A LOT OF TIME RIGHT OVER\nTHERE, CAREFULLY ERECTING THE\nNEWSIEST, MOST TOPICAL\nCORINTHIAN COLUMNS, FOLLOWING\nTHE GOLDEN RATIO, POLISHING THE\nFINEST CARRARA MARBLE, AND\nENGRAVING ONTO IT MYTHIC TALES\nWITH FLOURISHES OF HUMOR AND\nPATHOS, TO CREATE THE\nGRECO-ROMAN ACROPOLIS THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI GRAB A COUPLE OF OFF-BRAND\nLEGOS THAT HAVE BEEN JAMMED\nBETWEEN MY TOES SINCE I STEPPED\nON THEM IN 2003, FISH THE STICKS\nOUT OF SOME MELTED POPSICLES IN\nA BROKEN FREEZER, COLLECT THE\nPRE-SOAKED SAND FROM A\nPLAYGROUND I BROKE INTO IN THE\nDEAD OF NIGHT, AND THROW IT ALL\nTOGETHER TO CONSTRUCT FOR YOU\nTHE RAMSHACKLE PARTHENON OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "I4s-44cPYVE": {
+        "begin": "1:10.0",
+        "end": "2:59.0",
+        "text": "FOLKS, YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW, I SPEND\nMOST OF MY TIME\nRIGHT OVER THERE, SURVEYING THE\nNEWS MARKET FOR THE\nBIGGEST STORIES, THEN CAREFULLY\nSELECTING THE FINEST, MOST\nTOPICAL BUFFALO NEWS HIDE WHICH\nI THEN SOAK USING NATURAL SPRING\nAND LIMEWATER-- ONLY DURING\nCOLDER MONTHS-- AND SCRAPE IT\nUNTIL IT IS EVENLY TRANSLUCENT\nAND SUPPLE.\nAND THEN, USING THE TRADITIONAL\nPUSH-KNIFE METHOD, I DELICATELY\nMAKE MORE THAN 3,000 CUTS TO\nCREATE THE EMOTIVE AND POWERFUL\nFIGURINE WHICH I DECORATE WITH\nGRADATIONS AND CONTRAST,\nEMPLOYING THE SHAANXI REGION\nFIVE-COLOR SYSTEM.\nTHEN I CAREFULLY CONNECT THE\nFIGURINE'S JOINTS WITH COTTON\nTHREADS SO THEY CAN BE OPERATED\nFREELY, AND FIRE UP A PAPER\nLANTERN BEHIND A FINE HUANGZHOU\nSILK SCREEN, AND, BACKED BY A\nSUONA HORN, AND YUEQIN, AND\nBANHU FIDDLE, I OPERATE NO LESS\nTHAN FIVE OF THESE FIGURINES AT\nONCE, BECOMING THE LIVING\nEMBODIMENT OF THE 1,000-HAND\nGWAN-YIN, TO MOUNT FOR YOU THE\nEPIC AND MOVING TONG DYNASTY\nPI-YING SHADOW PLAY THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS,\nSOMETIMES,\nIT'S CRAFTSMANSHIP.\nIT GOES LIKE THAT, RIGHT\nTHERE.\nSOMETIMES, FOLKS,\nI AM PECKED\nAWAKE BY A DIRTY SEAGULL ON THE\nINTRACOASTAL WATERWAY, WHILE\nSTILL LYING ON THE BACK OF A\nMANATEE WHO RESCUED ME FROM SOME\nARMS DEALERS I DOUBLE-CROSSED\nOFF THE COAST OF CAPE FEAR, AND WHO THEN\nDUMPS ME ON AN ABANDONED WHARF\nWHERE I SLIP THEIR DIRTY SOCK OFF\nA SEVERED FOOT I FISHED OUT OF A\nSTORM DRAIN, AND SLAP GOOGLY\nEYES ON IT MADE FROM TWO MENTOS\nI PRIED OUT OF THE MOUTH OF A\nMANGY COYOTE.\nTHEN I FASHION A KAZOO OUT OF A\nPOCKET COMB I STOLE FROM A\nFISHERMAN AND THE WAX PAPER FROM\nHIS MEATBALL SUB, TO HONK OUT A\nDIRGE WHILE YAMMERING A TONE\nPOEM ABOUT THE DEMONS INFESTING\nMY BEARD IN THE UNBALANCED MANIC\nSOCK PUPPET SHOW OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "JAfAApqOeFU": {
+        "begin": "1:49.5",
+        "end": "2:42.5",
+        "text": "FOLKS.\nI SPEND A LOT OF MY TIME GATHERING FOR YOU\nTHE FINEST, MOST TOPICAL STORIES\nABOUT NATIONAL STUDIES,\nSCIENTIFIC BREAKTHROUGHS, AND\nDRUNK MOOSE BREAKING INTO ICE\nCREAM SHOPS, ONLY TO HAVE A\nPANDEMIC HIT, DURING WHICH I\nTAKE THEM INTO A SAD, EMPTY\nLITTLE ROOM WHERE MY ONLY\nFRIENDS ARE ROBOT CAMERAS AND A\nPURELL BOTTLE, AND I LET MYSELF\nGO WHILE SLOWLY DESCENDING INTO\nMADNESS AS I SHOVE MY SWEET\nINNOCENT LITTLE JOKES INTO A\nSEGMENT THAT I AM FORCED TO\nRENAME \"QUARANTINE-WHILE.\"\nBUT SOMETIMES, I CRAWL OUT OF\nTHE BROOM CLOSET AFTER 15\nMONTHS, POUR MYSELF BACK INTO A\nSUIT, ASSEMBLE THE TOP TEAM IN\nTHE BUSINESS, THE SWINGINGEST\nBAND IN LATE NIGHT, AND THE\nBEST DAMN AUDIENCE IN THE WORLD.\nSO I CAN RETURN TO YOUR LOVING\nARMS IN THE KICK-ASS,\nPROPERLY-PRESENTED\nCELEBRATION OF MARGINAL NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "JfT59wBSQME": {
+        "begin": "0:52.0",
+        "end": "1:37.0",
+        "text": "YOU KNOW, FOLKS, I SPEND A\nLOT OF TIME HARVESTING THE DAY'S\nMOST TOPICAL MATCHA POWDER,\nCAREFULLY POLISHING THE NEWSIEST\nCHAWAN TEA BOWL WITH A HEMP\nCLOTH, AND ADDING THE PUREST\nBOILED WATER COLLECTED FROM THE\nRIVER OF STORIES, TO STAGE THE\nELEGANT JAPANESE TEA CEREMONY\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, I CONVINCE A\nTRUCK DRIVER TO GIVE ME A RIDE\nIN EXCHANGE FOR AN UNREGISTERED\nHAND GUN AND A HALF-EATEN CAN OF\nBEANS, HITCHHIKE TO THE SONORA\nDESERT WITH NOTHING BUT AN OLD\nPOT THAT I FILL WITH THE\nNEWSPAPER I'VE BEEN USING FOR A\nBLANKET, AND THE SALVAGED\nTOBACCO FROM A SIDEWALK CIGARETTE\nBUTT, TO BREW FOR YOU, THE\nNIGHTMARE HALLUCINATION-INDUCING\nFERMENTED AYAHUASCA SLURRY OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "KT8pCZ5Xw9I": {
+        "begin": "0:37.6",
+        "end": "1:26.6",
+        "text": "FOLKS, YOU KNOW,\nIF YOU WATCH THE SHOW,\nTHAT I SPEND A LOT OF TIME RIGHT OVER\nTHERE, GATHERING THE FRESHEST,\nNEWSIEST HEADLINE FLOWERS,\nSCOURING THE FIELDS AND FORESTS\nFOR THE MOST TOPICAL AND\nFRAGRANT SYLVAN NEWS BOUGHS, THE\nJOKIEST FESTIVE GOURDS, AND THEN\nCAREFULLY ASSEMBLING AND\nARRANGING THEM ALL INTO THE\nGRAND YET TASTEFUL STATE\nDINNER-WORTHY CENTERPIECE THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, NOW, SOMETIMES,\nI RUB SOME LEAD PAINT CHIPS ONTO\nMY GUMS, STAGGER INTO THE WOODS\nWITH NOTHING BUT A STAPLE GUN\nAND SOME EMPTY CANS OF SPRAY\nPAINT, AND THEN, BY THE LIGHT OF THE\nTIRE-FIRE, USING SMASHED LARVAE\nAND MY OWN SALIVA AS GLUE, I\nCOBBLE TOGETHER A CRUDE PILE OF\nPUNKY WOOD AND ANIMAL SKULLS TO\nPRESENT TO YOU THE UNHINGED\nLONER'S CORNUCOPIA OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "L-kR7UCzhTU": {
+        "begin": "1:36.0",
+        "end": "2:19.0",
+        "text": "YOU KNOW, I SPEND A LOT OF TIME\nRIGHT OVER THERE HAND-RAISING\nAND SELECTING THE NEWEST,\nMOST-TOPICAL SEVILLE ORANGES,\nCAREFULLY SIMMERING THEM WITH\nTURBINADO SUGAR AND PRISTINE\nFILTERED WATER TO CREATE FOR YOU\nA DOUBLE-GOLD-MEDAL-WINNING\nBITTERSWEET ENGLISH MARMALADE TO\nSPREAD ON THE PERFECTLY TOASTED\nARTISANAL BRIOCHE THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI GRAB AN EXPIRED TUB OF\nWHIPPING CREAM, TOSS IT IN A\nBLENDER WITH THE HALF OF A\nGOGURT I FOUND IN A SCHOOLYARD,\nAND LET THAT FERMENT BEHIND THE\nFURNACE WHILE I FISH A DRIED\nPITA OUT OF THE GUTTER\nUNDERNEATH THE CORN-- CORNER KEBAB\nSTAND, THEN SLATHER IT WITH THE\nUNPRESSURIZED NIGHTMARE\nDAIRY OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "Lf-LkJhKVhk": {
+        "begin": "1:14.0",
+        "end": "1:53.0",
+        "text": "YOU KNOW, I SPEND A LOT OF TIME\nOVER THERE CAREFULLY ASSEMBLING\nTHE MOST TOPICAL VIRTUOSO WIND\nSECTION, TUNING THE VIOLAS,\nCELLOS, AND CONTRABASS TO THE\nCOUNTRY'S ZEITGEIST, AND WAVING\nMY CONDUCTOR'S BATON TO THE\nTEMPLE OF HUMOR TO PRESENT FOR\nYOU THE SPECTACULAR BRAHMS\nCONCERTO IN SATIRE MAJOR THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES, I WAKE UP IN A\nFUGUE STATE BEHIND THE ABANDONED\nVIDEO STORE, STEAL A GARBAGE CAN\nLID AND A BROKEN UMBRELLA\nHANDLE, AND THEN GRAB AN EMPTY CAN\nOF P.B.R. I'VE BEEN USING AS AN\nASHTRAY TO FASHION A RUSTED\nKAZOO, ALL TO CREATE THE\n2-IN-THE-MORNING ONE-MAN-STOMP\nSHOW OF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "P72uFdrkaVA": {
+        "begin": "0:49.3",
+        "end": "1:48.3",
+        "text": "FOLKS, IF YOU WATCH THE SHOW, YOU KNOW I SPEND A LOT OF\nTIME RIGHT OVER THERE, TENDERLY\nCLIPPING THE NEWSIEST, MOST\nFRAGRANT TEA-LEAF BUDS OF THE\nDAY, GINGERLY LAYING THEM TO DRY\nBY THE LIGHT OF THE\nROSY-FINGERED DAWN,\nPAINSTAKINGLY STEEPING THEM TO\nPERFECTION IN THE MOST TOPICAL\nOF FRESH WATER GATHERED FROM THE\nNATION'S NEWS RESERVOIR, BEFORE\nCEREMONIOUSLY SERVING TO YOU THE\nANTIOXIDANT-RICH ELIXIR OF\nBESPOKE TEA SHAN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, AFTER\nA BENDER ON ABSINTHE AND\nMOUNTAIN DEW CODE RED, I SWEAT\nMYSELF AWAKE HOVERED OVER A VAT\nOF BLISTERING SEWER RUNOFF.\nI ADD TO THE GURGLING POT OF\nNIGHTMARES WHATEVER THE VOICES\nDEMAND: SCRAPS OF WET TRASHCAN\nLETTUCE AND BAND-AIDS FROM THE\nCOMMUNITY POOL.\nTHEN, USING AN OLD GYM SOCK I\nFOUND AT THE HOBOKEN Y.M.C.A., I\nSTRAIN IT ALL INTO A DISUSED\nGASOLINE CONTAINER TO OFFER YOU\nTHE SCALDING-HOT DEMENTED DEMON\nTONIC OF NEWS THAT IS MY\nSEGMENT: \"MEANWHILE!\""
+    },
+    "PT5_00Bld_8": {
+        "begin": "0:57.0",
+        "end": "2:02.0",
+        "text": "FOLKS, YOU KNOW, I SPEND A LOT OF MY TIME,\nRIGHT OVER THERE, CRUISING THE\nVAST TSUKIJI FISH MARKET OF THE\nDAY'S BIGGEST STORIES, CAREFULLY\nSURVEYING THE FRESHEST, MOST\nTOPICAL CATCH OF THE DAY,\nCHOOSING ONLY THE HIGHEST GRADE\nAHI NEWS TUNA, AWABI, AND UNAGI,\nTHEN DELICATELY PICKING THROUGH\nTHE RICE GRAINS OF THE\nHEADLINES, GENTLY WRAPPING THE\nINGREDIENTS IN THE FRESHEST\nHAND-PICKED NORI, AND CAREFULLY\nLAYING IT ALL OUT TO PRESENT TO\nYOU THE YAMANAKA LACQUERED\nTHREE-TIER JUBAKO BENTO BOX THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT -- YOU KNOW WHAT I'M SAYING.\nYOU KNOW WHAT I'M SAYING.\nYOU KNOW WHAT'S COMING.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I AM SLAPPED\nAWAKE BY A POSSUM ON A BED OF\nWET TIRES NEAR A LONG-ABANDONED\nWHARF, SELL MY LAST REMAINING\nADULT TEETH TO A VAGRANT\nFISHMONGER FOR AN EXPIRED SALMON\nCARCASS, AND GRIND THAT INTO A\nSOFT PASTE, THEN ROLL IT IN ENOUGH\nSTALE RICE KRISPIES TO MAKE\nSNAP, CRACKLE, AND POP TAKE A\nHARD LOOK IN THE MIRROR, AND SERVE\nYOU THE NIGHT-SCREAM INDUCING\nHOBO HAND-ROLL OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "QPDZbNEhsuw": {
+        "begin": "1:31.0",
+        "end": "2:15.0",
+        "text": "YOU KNOW, FOLKS, I SPEND MOST\nOF MY TIME RIGHT OVER THERE,\nDIGGING DOWN INTO THE NEWS\nPIT AND MINING FOR YOU THE\nDAY'S\nCLEAREST STORY DIAMONDS,\nCLEAVING THEM INTO THE MOST\nTOPICAL CUTS, FACETING THEM,\nPOLISHING THEM TO A HIGH\nFINISH,\nTHEN SETTING THEM ALL IN A\nDELICATE 24 KARAT GOLD CHAIN\nTO\nCREATE THE BESPOKE CARTIER\nNECKLACE THAT IS MY NIGHTLY MONOLOGUE,\nBUT SOMETIMES, SOMETIMES FOLKS,\nI JUST HUFF A LITTLE TOO MUCH\nEPOXY\nAND STUMBLE DOWN TO AN\nABANDONED PIER, WHERE I FIND\nA PIECE OF\nDISUSED FISHING LINE AND\nSTRING IT WITH OLD BOTTLE\nCAPS, RUSTY\nPADLOCKS, AND BABY TEETH,\nTHEN RIP THE SEAT BELT OUT\nOF A\nBURNED-OUT POLICE CAR TO\nMAKE A CLASP, AND PARADE\nNAKED THROUGH\nA CHI-CHI'S WEARING THE\nPSYCHO CHOKER OF NEWS THAT\nIS MY\nSEGMENT:\n\"MEANWHILE.\""
+    },
+    "QjQbQlN9Ev8": {
+        "begin": "3:53.0",
+        "end": "4:35.0",
+        "text": "YOU KNOW, FOLKS, I SPEND A LOT OF\nMY TIME RIGHT OVER THERE,\nHANGING FOR YOU THE DAY'S\nHOTTEST, MOST TOPICAL NEWS\nDECORATIONS, BOOKING THE SEXIEST\nBAND, CURATING THE MOST RELEVANT\nDRINKS MENU, DISTRIBUTING\nPLAYFUL, YET TASTEFUL PARTY\nFAVORS, AND GLITTER HATS THAT\nSAY \"2022,\" AND THEN SETTING THE\nMOST AU-COURANT LIGHTING TO\nTHROW FOR YOU THE UPSCALE,\n\"PITCH PERFECT\" NEW YEAR'S EVE\nPARTY THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP IN THE RAFTERS\nAFTER LOSING A BET TO A CROW.\nAND THEN I SPIKE THE PUNCH BOWL WITH\nCHLOROFORM AND MILITARY-GRADE\nHELICOPTER LUBRICANTS, MAKE A\nBUNCH OF RESOLUTIONS TO QUIT\nHUFFING WD-40, AND PUNCH A\nPOLICE HORSE DURING THE FUGITIVE\nNEW YEAR'S HO-DOWN OF NEWS THAT\nIS MY SEGMENT:\nMEANWHILE!"
+    },
+    "R6JV_I36It8": {
+        "begin": "0:38.0",
+        "end": "1:18.0",
+        "text": "YOU KNOW, FOLKS, I SPEND\nA LOT OF TIME\nSHUCKING FOR YOU THE DAY'S MOST\nTOPICAL CLAMS, BONING THE\nFINEST, MOST CURRENT NEWS\nCHICKENS, AND COLLECTING THE\nHIGHEST QUALITY STORY SHRIMP\nAND SAFFRON RICE, THEN GENTLY\nSIMMERING IT ALL IN A CAST-IRON\nCOMEDY PAI-YERA, TO CREATE\nTHE FRAGRANT SEAFOOD PAELLA THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I\nSHAMBLE DOWN TO THE DOCKS WITH A\nRUSTY CROWBAR, MANEUVER THROUGH\nTHE POLLUTED CANAL USING A\nMCDONALD'S STRAW AS A SNORKEL,\nAND SCRAPE THE BARNACLES OFF A\nPASSING GARBAGE SCOW, TOSS THEM\nIN A POT WITH SOME HALF-USED\nRAMEN FLAVORED PACKETS AND\nMOUNTAIN DEW, TO BREW FOR YOU\nTHE CHUNKY STEW OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "RFVggCw58lo": {
+        "begin": "1:00.0",
+        "end": "1:43.0",
+        "text": "YOU KNOW FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE, PAINSTAKINGLY\nPENCILING THE DAY'S MOST TOPICAL\nAND HEROIC STORIES, STAGING THEM\nIN METICULOUSLY PLANNED PANELS,\nHAND-INKING THEM WITH THE\nPITHIEST DIALOGUE, THEN COLOR\nBALANCING THE FINISHED PAGES TO\nCREATE FOR YOU THE GENRE-BENDING\nONCE-IN-A-GENERATION GRAPHIC\nNOVEL THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I JUST TEAR A PAGE OUT\nOF A WET NEWSPAPER I PEELED OFF\nA SUBWAY SEAT, GRAB A BROKEN\nCRAYON I FOUND IN MY COUCH,\nCRUDELY DRAW SOME FILTHY\nCARTOONS, SCRIBBLE IN AS\nMANY CURSE WORDS AS I CAN IN THE\nPORNOGRAPHIC DRIFTER ZINE OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "RHDQpOVLKeM": {
+        "begin": "0:39.0",
+        "end": "2:11.0",
+        "text": "YOU KNOW, FOLKS, I\nSPEND MOST OF MY TIME, RIGHT\nOVER THERE, PORING OVER THE\nDAY'S BIGGEST STORIES,\nCOLLECTING THE FINEST,\nMOST-TOPICAL NEWS CALFSKINS AND\nPAINSTAKINGLY WASHING THEM IN A\nCALCIUM HYDROXIDE SOLUTION, THEN\nSOAKING THEM IN LIME FOR DAYS TO\nREMOVE ALL NARRATIVE IMPURITIES\nAND CREATE A PALE VELLUM THAT I\nLATER PLACE ON MY SCRIPTORIUM IN\nA MONASTERY ON THE CLIFFS OF\nDOVER.\nTHERE, USING A PEN CUT FROM THE\nWING FEATHER OF A SWAN OF THE\nRIVER AVON, I DESIGN COPTIC AND\nSYRIAC ILLUSTRATIONS, ADORNED\nWITH WHIMSICAL CELTIC SPIRALS\nAND USE GERMANIC ZOOMORPHIC\nDESIGNS TO CREATE THE MARGINALIA\nSURROUNDING THE PAGES OF ELEGANT\nHALF-UNCIAL INSULAR SCRIPT THAT\nTELL THE HOLIEST OF STORIES,\nWHICH I THEN BIND WITH GOLDEN\nTHREAD UNDER A PROTECTIVE CASE\nOF CARVED OAK TO CREATE FOR YOU\nTHE GLORIOUS LATE\nANGLO-SAXON PERIOD ILLUMINATED\nMANUSCRIPT THAT IS MY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES,\nTHESE PEOPLE KNOW, THEY KNOW, THEY KNOW\nBUT SOMETIMES, FOLKS,\nI COME TO UNDER A RAMP IN THE\nMIDDLE OF A DEMOLITION DERBY,\nHOTWIRE THE TRUCKASAURUS\nAND LEAD THE POLICE ON A CHASE\nBEFORE CRASHING INTO A SWAMP\nGATHERING JUST AS, WHAT I ASSUME\nIS A PRIEST SAYS, \"YOU MAY KISS\nTHE BRIDE,\" RIP A LEECH OFF MY\nASS, AND USE IT TO HASTILY\nDOODLE A SKETCH OF THE SCENE IN\nMY OWN BLOOD ON AN OLD DAVE AND\nBUSTERS RECEIPT, THEN STAGGER\nTOWARD THE HAPPY COUPLE\nCLUTCHING THE NIGHTMARE\nSTALKER'S WEDDING ALBUM OF NEWS\nTHAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "TZSw9iRk03E": {
+        "begin": "0:22.5",
+        "end": "1:08.5",
+        "text": "FOLKS, YOU KNOW I\nSPEND A LOT\nOF TIME RIGHT OVER THERE, CAREFULLY\nSTUDYING THE LATEST, NEWSIEST\nCLINICAL STUDIES, PRACTICING AND\nTRAINING UNDER THE BEST, MOST\nTOPICAL DOCTORS, CAREFULLY\nSTERILIZING ALL MY EQUIPMENT,\nAND ASSEMBLING THE WORLD'S GREATEST\nSURGICAL TEAM TO PERFORM FOR YOU THE\nDAZZLINGLY COMPLEX AND\nGROUNDBREAKING THORACIC AORTIC\nDISSECTION REPAIR THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI GET KICKED OUT OF MY HEALTH CARE\nPLAN FOR LISTING MY DOG BENNY AS\nA GASTROENTEROLOGIST, SO I\nSTIPPLE SOME INCISION MARKS ON\nMY ABDOMEN WITH A DRIED-OUT\nSHARPIE, SLAM A COUPLE OF RED BULLS\nIN FRONT OF A SHATTERED MIRROR,\nAND FISH A RUSTY BONING KNIFE\nOUT OF A STOLEN CRAB BOAT TO\nPERFORM THE EXPLORATORY HOBO\nAPPENDECTOMY OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "VV3UJmb8kHw": {
+        "begin": "2:04.0",
+        "end": "2:54.0",
+        "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THE SHOW, YOU KNOW\nI SPEND A LOT OF MY TIME RIGHT\nOVER THERE.\nCAREFULLY COMBING THROUGH THE\nBIGGEST STORIES OF THE DAY,\nSOURCING FOR YOU THE NEWSIEST\nMIKADO ORGANZA IN A HIGH SHEEN,\nADDING THE MOST TOPICAL IVORY\nFEATHER FRINGE AND A DIPPED\nBACK, THEN THROWING ON A DEMURE\nBUT KICKY FLORAL EMBROIDERED\nTULLE SHRUG WITH STATEMENT PEARL\nACCENTS TO PRESENT TO YOU THE\nGLORIOUS \"VOGUE\" COVER-READY\nWEDDING GOWN THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, WHILE ON A\nGLUE-HUFFING BINGE, I CRASH A\nSTOLEN HEARSE INTO AN ABANDONED\nCHILDREN'S HOSPITAL WHERE I USE\nMY TEETH TO TEAR UP SOME OLD\nCURTAINS AND STAINED CARPETING,\nAND STEAL A BUTTON OFF AN OLD\nSURGICAL APRON, AND STITCH IT\nALL TOGETHER WITH A NEEDLE MADE\nFROM A CHICKEN BONE TO THROW\nTOGETHER THE SHRIEKING CAT\nLADY'S SACK DRESS OF NEWS THAT\nIS MY SEGMENT: \"MEANWHILE.\""
+    },
+    "VYVbTzoggKc": {
+        "begin": "0:00.0",
+        "end": "0:49.0",
+        "text": "FOLKS, YOU KNOW, I\nSPEND A LOT OF\nMY TIME ON THE SHOW-- IF YOU\nWATCH THE SHOW YOU'D FIGURE\nTHIS OUT,\nRIGHT OVER THERE, STANDING RIGHT OVER THERE IN THE MONOLOGUE SPELUNKING\nTHROUGH THE DAY'S STORIES TO\nSELECT AND SOURCE THE\nNEWSIEST MARBLE, CHISELING\nIT INTO A\nPEDESTAL OF HUMOR AS WIDE AS\nTWO GREEK ISLES.\nTHEN I CAST THE MOST TOPICAL\nCURRENT-EVENTS-BRONZE INTO A\nFINELY CRAFTED MOULD TO\nERECT FOR YOU THE TOWERING\nGRECIAN\nCOLOSSUS THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, JUST\nSOMETIMES, FOLKS, I JOLT\nAWAKE INSIDE\nWHAT'S LEFT OF A RUSTED\nMAZDA MIATA IN A WHITE CLAW\nAND\nOVEN-CLEANER-INDUCED FUGUE\nSTATE, SHAMBLE THROUGH THE\nJUNKYARD, RANSACKING THE\nDEBRIS FOR OLD FISHING RODS,\nMELTED\nBATTERIES AND THE SHOVEL OF\nA DERELICT BACKHOE, AND THEN\nBOOST AN\nACETYLENE TORCH TO HASTILY\nWELD TOGETHER THE BOOTLEG\nTRUCKASAURUS OF NEWS THAT\nIS MY SEGMENT:\nMEANWHILE!"
+    },
+    "WWWeV8xVNtI": {
+        "begin": "2:00.0",
+        "end": "2:35.0",
+        "text": "YOU KNOW, FOLKS, I SPENT A LOT\nOF TIME CAREFULLY RESEARCHING\nTHE DAY'S MOST CULTURALLY\nPRECIOUS STORIES,\nCROSS-REFERENCING HISTORICAL\nACCOUNTS WITH TOPOGRAPHICAL\nMAPS, AND ASSEMBLING THE FINEST\nTEAM OF ARCHAEOLOGISTS TO\nUNEARTH THE UNESCO WORLD\nHERITAGE EXCAVATION SITE OF\nHUMOR THAT IS MY MONOLOGUE.\nBUT SOMETIMES, I DISTRACT AN\nORPHAN WITH A PIECE OF\nLINT-COVERED CANDY AND STEAL\nTHEIR BUCKET AND PAIL, THEN\nSNEAK INTO THE POTTER'S FIELD IN\nTHE DEAD OF NIGHT WITH TWO\nDRIFTERS I PICKED UP ON THE\nCOMMUTER TRAIN, AND FORCE THEM\nTO DIG FOR THE ABANDONED\nPAUPER'S GRAVE\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "XzJAtzdrY_w": {
+        "begin": "2:57.0",
+        "end": "4:32.0",
+        "text": "FOLKS, IF YOU WATCH THIS SHOW, YOU KNOW I SPEND MOST\nOF MY TIME, RIGHT OVER THERE,\nCAREFULLY COMBING THE NEWS\nLANDSCAPE AND HARVESTING THE\nFINEST, MOST BEAUTIFUL STORY\nPIGMENTS LIKE MALACHITE,\nAZURITE, AND CINNABAR, WHICH I\nSLOWLY GRIND UNDER A GLASS\nOF MULLER WITH ONLY THE MOST\nTOPICAL LINSEED OIL, WORKING\nTHEM INTO SMOOTH, BUTTERY\nVERMILLIONS, VERDIGRIS, AND NEW\nGAMBOGES, WHICH I THEN APPLY TO\nA GRISAILLE PREPARED ON A CANVAS\nOF FLAX, TOW, AND JUTE, SLOWLY\nWORKING UP THE SHADOW\nSHAPES AND MAJOR MASSES, THEN\nDELICATELY RENDERING THE\nINTERPLAY OF LIGHT AND FORM,\nBEFORE APPLYING THE FINE DAMMAR\nAND MASTIC VARNISH TO UNVEIL FOR\nYOU THE GLORIOUS REMBRANDT\nPORTRAIT OF THE DAY'S EVENTS\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES--\nSOMETIMES, FOLKS, SOMETIMES\nI'M SHAKEN AWAKE INSIDE THE\nDARKENED TRUNK OF A BULGARIAN\nMOBSTER'S VOLVO 940, I QUIETLY\nRELEASE THE SAFETY CATCH AND\nTUMBLE ONTO THE SIDE OF A DIRT\nROAD, BREAKING BOTH CLAVICLES,\nWHICH I DO NOT FEEL BECAUSE OF\nALL THE ANGEL DUST.\nI STAGGER INTO AN ABANDONED\nTANNERY WHERE I BEFRIEND AN OWL\nWHO TELLS ME TO I HAVE TO LET\nHIM SPEAK THROUGH ME OR HE'LL\nMURDER THE CLOUDS.\nAND IN HIS DIRECTION, I MIX THE\nFUN DIP I FOUND IN MY POCKET\nWITH THE FISTFULS OF HEXAVALENT\nCHROMIUM I SCOOP UP FROM THE\nDISUSED TANNING PITS, THEN HURL\nIT AT THE SIDE OF A NEARBY\nDEFUNCT DAIRY QUEEN IN A FUGUE\nSTATE OF LASHING OUT AT LIGHT\nAND COLOR, TO UNLEASH FOR YOU\nTHE ABSTRACT EXPRESSIONIST\nSPLATTER FRESCO OF NEWS THAT IS\nMY SEGMENT:\nMEANWHILE!"
+    },
+    "YyV6l8HPmdQ": {
+        "begin": "0:48.0",
+        "end": "1:32.0",
+        "text": "FOLKS, LADIES AND\nGENTLEMEN, YOU KNOW I SPEND A\nLOT\nOF TIME DELICATELY WHITTLING A\nMELANGE OF THE DAY'S MOST\nPRESSING STORY TIMBERS,\nPRECISELY MEASURING THE NECKS,\nRIBS, AND BACKS OF THE NEWS,\nEMPLOYING ONLY THE MOST\nSOPHISTICATED AND TOPICAL\nPURFLING, THEN LAYING 15\nEXQUISITE COATS OF INSIGHT ONTO\nTHE ORNATE YET ROBUST\nSTRADIVARIUS VIOLIN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI GATHER UP FRAYED ELECTRICAL\nWIRE FROM A BURNT-OUT BOWLING\nALLEY, TAPE IT TO A\nTERMITE-INFESTED 2-by-4, THEN SHOVE\nONE END TO A DISCARDED CHUM\nBUCKET TO MAKE FOR YOU THE APPALACHIAN\nDRIFTER'S BANJO OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "a8DD__mRtPk": {
+        "begin": "1:13.0",
+        "end": "1:58.0",
+        "text": "FOLKS, YOU KNOW, I SPEND MOST OF\nMY TIME GATHERING FOR YOU THE LATEST,\nMOST CUTTING-EDGE NEWS STORIES,\nCAREFULLY EXAMINING THE DAY'S\nCT SCAN, THEN ASSEMBLING\nAMERICA'S CRACK MEDICAL TEAM,\nAND MAKING PRECISE INCISIONS\nWITH THE AID OF A STRYKER 1588\nLAPAROSCOPE IN THE\nGROUNDBREAKING SURGICAL ARTISTRY\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, WHEN I NEED A LITTLE EXTRA CASH TO\nPAY OFF MY COCK-FIGHTING DEBTS,\nI SET UP A RUSTY COT UNDER A\nTARP IN WASHINGTON SQUARE PARK,\nWHERE I PLY CURIOUS PASSERSBY\nWITH BATHTUB COUGH SYRUP TO HELP\nDULL THE PAIN WHILE I USE GARDEN\nSHEARS TO CUT OUT ANYTHING THAT\nLOOKS SUPERFLUOUS IN THE AMATEUR\nAPPENDECTOMY TENT OF NEWS\nTHAT IS MY SEGMENT...\nMEANWHILE!"
+    },
+    "cHhomJMwY1I": {
+        "begin": "2:42.0",
+        "end": "3:24.0",
+        "text": "FOLKS, I SPEND A LOT OF TIME\nSTANDING RIGHT OVER THERE,\nCOMBING THROUGH HOURS UPON HOURS\nOF GAME TAPE ON THE MOST PROMISING\nHEADLINES, METICULOUSLY CRAFTING\nMY BIG BOARD TO RANK STORIES\nBASED ON THEIR RAW TALENT AND\nINTANGIBLES, AND CUT DEALS FOR\nTHE MOST TOPICAL TRADES TO DRAFT\nTHE ONCE-IN-A-GENERATION,\nHEISMAN-WINNING QUARTERBACK THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES I\nWAKE UP IN AN ICE BATH AFTER\nDOING RAILS OF GATORADE POWDER,\nREALIZE IT'S DRAFT DAY, AND I\nHAVE 15 SECONDS LEFT TO MAKE A\nCHOICE AND BLURT OUT THE FIRST\nNAME I SEE TO WASTE THE NUMBER\nONE OVERALL PICK ON THE SCRAWNY,\nUNPOLISHED THIRD-STRING PUNTER\nOF NEWS THAT IS MY SEGMENT,\n\"MEANWHILE!\""
+    },
+    "hhwTiwUAaf8": {
+        "begin": "1:06.5",
+        "end": "1:54.5",
+        "text": "YOU KNOW, FOLKS, I SPEND MOST\nOF MY TIME SOURCING FOR YOU THE DAY'S\nFINEST HANGZHOU SILK NEWS\nSTORIES, MOUNTING THEM ON THE\nMOST TOPICAL, PREMIUM,\nPOLYHEDRAL BAMBOO JOKE FRAME,\nDECORATING IT WITH ARTISANAL ASH\nINK, INSERTING A HAND-POURED\nBEESWAX CANDLE, FILLING IT WITH\nINTENTION AND THEN SENDING IT ALOFT\nON THE UPDRAFT OF AUDIENCE\nLAUGHTER IN THE SPECTACULAR\nCHINESE LANTERN FESTIVAL THAT IS\nMY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS, I GO VISIT MY\nBUDDY, BARRACUDA AT THE\nABANDONED MALL, DROP A COUPLE OF\nHUNDOS ON SOME WET ROMAN\nCANDLES, BENT SPARKLERS SMUGGLED\nIN FROM THE PHILIPPINES, AND A\nFLARE GUN STOLEN FROM A CRASHED\nCOAST GUARD BOAT, SET IT\nALL OFF IN THE DERANGED,\nUNREGULATED FIREWORKS ACCIDENT\nOF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "iB6diOGE8y4": {
+        "begin": "0:52.8",
+        "end": "1:50.8",
+        "text": "YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW I SPEND MOST OF MY TIME\nRIGHT OVER THERE, COMBING\nTHROUGH THE DAY'S BIGGEST NEWS,\nAND SELECTING FOR YOU THE\nFINEST, MOST TOPICAL INDIAN\nROSEWOOD, SPRUCE, AND MAHOGANY\nSTORIES.\nI THEN HAND-SHAPE AND COMBINE\nTHEM WITH AN ABALONE\nMULTI-STRIPE BACK INLAY, AND\nFORWARD-SHIFTED SCALLOPED\nBRACES, ANTIQUE WHITE BINDING,\nAND A HIGH-PERFORMANCE NECK WITH\nA HEXAGON FINGERBOARD, AND\nFINALLY LAY IN A\nTORTOISE-PATTERN PEARL PICK\nGUARD, AND A COMPENSATED\nBONE SADDLE, TO CRAFT FOR YOU\nTHE EXQUISITE MARTIN D-45\nDREADNOUGHT ACOUSTIC GUITAR\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS, I SNAP\nAWAKE IN A RUSTY COFFIN FREEZER\nBEHIND AN ABANDONED DAIRY QUEEN\nOUTSIDE OF GALVESTON.\nTHEN I NAIL A 2-BY-4 TO A CEDAR URN\nI STOLE FROM A FUNERAL PARLOR, STRING\nON SOME BRAKE CABLES I RIPPED\nOUT OF A COP CAR, THEN CUT EYE\nHOLES IN A GOODWILL BAG FOR A MASK,\nHIT A WHIPPET, AND TERRORIZE\nTHE LOCALS ON THE TEXAS CHAINSAW\nBANJO OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "iQFwGF0aW-o": {
+        "begin": "2:10.7",
+        "end": "3:33.7",
+        "text": "I SPEND A LOT OF MY TIME, RIGHT\nOVER THERE, CULTIVATING FOR YOU THE\nDAY'S BIGGEST STORIES,\nPLUCKING THE MOST BEAUTIFUL AND\nTOPICAL NEWS VIOLETS AND\nMARIGOLDS, STRIPPING THE FRENCH\nLAVENDER FROM THE STEM AND\nLOVINGLY PRESSING THEM ALL\nBETWEEN THE PAGES OF A GILDED\nFIRST EDITION OF \"PRIDE AND\nPREJUDICE.\"\nTHEN I FOLD THEM INTO A DOUGH I\nHAND ROLLED FROM PASINI BAKERY\nFLOUR, BORDIER BUTTER, AND\nCHILLED SOMERDALE DEVON DOUBLE\nCREAM, SPRINKLE THEM WITH A\nPINCH OF MUSCOVADO SUGAR, AND\nBAKE THEM IN A \"LA CORNUE GRAND\nPALAIS\" RANGE TO PERFECTLY PREP\nTHE GOURMET FLORAL SHORTBREAD\nCOOKIE THAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES,\nSOMETIMES, I AM NIBBLED AWAKE BY AN AMOROUS\nRACCOON IN THE ABANDONED WALK-IN\nFREEZER OF A HAUNTED BAKERY IN\nWHICH I HAVE ESTABLISHED\nSQUATTER'S RIGHTS, I SLIP INTO\nTHE TWISTED KITCHENAID PADDLES I\nCALL SHOES, AND KNIFE FIGHT A\nPOSSUM FOR AN EXPIRED BAG OF\nCRUSHED BREAKFAST CEREAL DUST\nAND A BROKEN EGG, WHICH I MIX\nWITH THREE SMUSHED RESTAURANT\nBUTTER PACKETS I STOLE FROM A\nNAPPING RETIREE'S PURSE, POUR\nTHE REST OF A SHATTERED BOTTLE\nOF RUBBING ALCOHOL I FOUND IN\nTHE DUMPSTER OUT BACK INTO A\nRUSTY BARREL TO IGNITE THE HOBO\nFIRE OVER WHICH I BAKE MY\nSLUDGE, THEN DISPLAY IT IN A\nFILTHY CHEF'S HAT TO SERVE YOU\nTHE DERANGED RAT KING BISCUIT OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
+    },
+    "jIL7kvG7d10": {
+        "begin": "1:55.3",
+        "end": "2:40.3",
+        "text": "FOLKS, YOU KNOW, I SPEND A LOT\nOF TIME, RIGHT OVER THERE\nCAREFULLY PLANTING AND TENDING\nTO THE DAY'S BIGGEST, MOST\nIMPORTANT STORIES, TRIMMING\nTHE TOPICAL HEDGES WITH DELICATE\nEXACTITUDE, RESEARCHING AND\nSEEDING THE SOIL OF TODAY'S\nNEWS IN ORDER TO YIELD THE MOST\nBEAUTIFUL, FRAGRANT JOKE\nFLOWERS, AND PRECISELY TIMING\nTHE BLOOM IN THE EXQUISITE\nENGLISH GARDEN THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I DRIVE A 2003\nPONTIAC SUNFIRE THROUGH A HOME\nDEPOT GARDEN CENTER, DOWN A JUG\nOF MIRACLE GRO, SMEAR MY BODY IN\nMUD AND PEA GRAVEL, BUILD A FORT\nOUT OF PAVERS, PLOP A SUCCULENT\nDISH GARDEN ON MY HEAD, AND\nBARRICADE MYSELF INSIDE A\nPORTABLE TOOL SHED TO CREATE THE\nPARANOID BACKYARD STANDOFF OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "jpq8eXZcvpo": {
+        "begin": "0:52.5",
+        "end": "1:37.5",
+        "text": "FOLKS, YOU KNOW, I SPEND A LOT OF MY TIME\nRIGHT OVER THERE, SORTING\nTHROUGH THE DAY'S TOP STORIES,\nCAREFULLY SELECTING FOR YOU THE\nFRESHEST, MOST TOPICAL\nNEWS-FRUIT, ARTFULLY CARVING IT\nINTO SATIRICAL SHAPES, DIPPING\nIT IN THE FINEST ARTISANAL\nCHOCOLATE, AND GENTLY PLACING THEM\nINTO THE FLAWLESSLY COMPOSED AND\nDELICIOUS EDIBLE ARRANGEMENT\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP FACE DOWN IN\nTHE RECYCLING BIN BEHIND A JAMBA\nJUICE, FIGHT A SEAGULL FOR THE\nDISCARDED CANTALOUPE RINDS\nAND PINEAPPLE STEMS, DIP THEM\nINTO A BUCKET OF DIESEL SIPHONED OFF\nFROM A SEMI FULL OF UNWASHED\nBIRD BONES, WHICH I USE TO\nSKEWER TOGETHER MY GARBAGE\nKEBABS, THEN STAB THEM ONTO A\nWATERLOGGED TEDDY BEAR TO CREATE\nTHE CRIMINALLY INSANE NIGHTMARE\nGIFT BASKET OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "jq2LhJ9rMpg": {
+        "begin": "2:52.0",
+        "end": "3:34.0",
+        "text": "YOU KNOW, I SPEND A LOT OF TIME\nRIGHT OVER THERE, PULLING\nTOGETHER THE FINEST, NEWSIEST\nLIMESTONE, CHISELING IN THE MOST\nDELICATE AND TOPICAL OF\nBAS-RELIEF, AND THE MOST ORNATE\nARCHES, MAKING SURE THERE'S NARY\nA BAD SEAT IN THE HOUSE, THEN\nASSEMBLING THE MOST FEARSOME\nNEWS WARRIORS THE ARENA HAS EVER\nSEEN TO CONSTRUCT FOR YOU THE\nROMAN COLOSSEUM THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, I WAKE UP MY\nNEIGHBOR AT 3:00 IN THE MORNING,\nDRAG HIM INTO MY SHED, WHERE\nI'VE SET UP A KIDDIE POOL I\nBOUGHT 20 YEARS AGO AND FILLED\nWITH EXPIRED JELL-O AND LUKEWARM\nBEER, HUFF SOME ACETONE OUT OF A\nPRICE CHOPPER BAG, THEN\nCHALLENGE HIM TO JOIN ME IN THE\nOLD MAN WRESTLING LEAGUE OF\nNEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "lWyia3aF92o": {
+        "begin": "1:44.0",
+        "end": "2:30.0",
+        "text": "FRIENDS, EVERY NIGHT I STAND\nRIGHT OVER THERE, AND I CAREFULLY WORK ON THE LIGHTING\nAND STAGING OF THE DAY'S MOST\nTOPICAL NEWS STORIES, COMPOSING\nGROUNDBREAKING ORCHESTRAL\nARRANGEMENTS TO SUPPORT THEM,\nAND THEN METICULOUSLY CHOREOGRAPHING\nTHEM AND MYSELF INTO A DELICATE,\nHEARTBREAKING, AND YET UPLIFTING\nPAS DE DEUX, TO PRESENT FOR YOU\nTHE EPOCH-DEFINING BALLET THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I FISH A STAINED\nVELOUR JUMPSUIT OUT OF A CANAL,\nHOOK A RADIO I RIPPED OUT OF A\nGARBAGE TRUCK TO AN ABANDONED\nCAR BATTERY, AND SLAP THE DIAL\nTHROUGH FUZZ TILL IT LANDS ON A\nRANDOM A.M. OLDIES STATION, AND THEN\nSHAKE MY ASS FOR NICKELS IN THE\nDEMENTED VAGRANT MACARENA OF\nNEWS OF THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "ldTzn1RpsNY": {
+        "begin": "1:00.0",
+        "end": "1:48.0",
+        "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, COMPILING THE\nMOST CURRENT GEOMETRY QUESTIONS,\nSPRINKLING IN A TOPICAL SET OF\nDATA ANALYSES, FOLDING THEM\nTOGETHER ALONG THE NEWSWORTHIEST\nWORD PROBLEMS, THEN PAIRING ALL\nOF THAT WITH THE DAY'S MOST\nPRESSING READING PASSAGES TO\nCOLLATE FOR YOU THE PERFECTLY\nCALIBRATED, BESPOKE S.A.T. TEST\nTHAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nI HUFF A PILE OF SALVIA AND\nSTAGGER INTO A LANDFILL WHERE I\nFORAGE FOR CRUSTY OLD SUDOKUS,\nGRAB A SACKFUL OF USED AND WET\nMADLIBS, AND CRAZY-GLUE THEM INTO\nTHE SPINE OF A DISCARDED\n\"READER'S DIGEST\" I FOUND IN A\nBURNT-OUT WALDENBOOKS, TO\nPRESENT TO YOU THE ILLEGIBLE\nHOBO BUZZFEED QUIZ OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "lgH-itFA_hg": {
+        "begin": "1:53.2",
+        "end": "2:33.2",
+        "text": "FOLKS, I SPENT A LOT OF TIME\nSTANDING RIGHT OVER THERE, OKAY,\nSETTING\nUP MY NEWS EASEL, LAYING OUT THE\nMOST TOPICAL BRUSH STROKES,\nCHOOSING THE MOST RELEVANT\nCOLORS, ALL TO FAITHFULLY\nCAPTURE FOR YOU, THE SOUL OF THE\nSTORIES OF THE DAY IN THE\nOIL-ON-CANVAS MASTERPIECE THAT\nIS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES -- JUST SOMETIMES,\nFOLKS -- I SET A LIQUOR\nSTORE ON FIRE AND COME BACK THE\nNEXT DAY TO SCRAPE SOME CHARCOAL\nOFF THE BURNT TIMBERS, USE THE\nCARDBOARD FROM THE DISCARDED\nREFRIGERATOR BOX I'VE BEEN\nCALLING HOME FOR THE WEEKEND,\nTHEN HARASS TOURISTS TO ETCH THE\nOFFENSIVE BOARDWALK CARICATURE\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "ll5DeZrejsM": {
+        "begin": "2:02.7",
+        "end": "2:52.7",
+        "text": "YOU KNOW, FOLKS,\nIF YOU WATCH THE SHOW, YOU KNOW\nI SPEND A LOT OF TIME RIGHT OVER\nTHERE,  COMBING THROUGH THE\nDAY'S BIG STORIES, SELECTING THE\nFINEST NEWS TENORS AND THE\nSILKIEST SOPRANOS.\nTHEN, I BRUSH UP ON THE WORKS OF\nCERVANTES AND FIND THE PERFECT\nSWEET SPOT BETWEEN DRAMA, HUMOR,\nAND OPERA TO COMPOSE FOR YOU THE\nTIMELESS AND SEDUCTIVE SPANISH\nZARZUELA THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI WAKE UP IN THE FREEZER OF A\nCOMBINATION TACO BELL PIZZA\nHUT ON THE EXPRESSWAY, AND I CUT A\nPAIR OF LEG HOLES INTO A POTATO\nSACK AND RACE BAREFOOT INTO THE\nCITY TO BREAK INTO AN ABANDONED\nDOLLAR STORE, WHERE I FASHION A\nPAIR OF CASTANETS FROM DEFECTIVE\nCHATTERING TEETH TOYS.\nTHEN I DOWN A JERRY CAN FULL OF\nRED BULL AND COUGH MEDICINE\nBEFORE STAGGERING INTO A PUBLIC\nPARK TO DISTURB TOURISTS WITH\nTHE DRIFTER'S FLAMENCO SHOWCASE\nOF NEWS THAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "lzviJMlii7A": {
+        "begin": "1:36.5",
+        "end": "3:21.5",
+        "text": "LADIES AND GENTLEMEN, YOU KNOW, IF YOU\nWATCH THIS SHOW, YOU KNOW I\nSPEND A LOT OF MY TIME RIGHT OVER\nTHERE, METICULOUSLY SIFTING\nTHROUGH THE DAILY NEWS\nDESERT, HARVESTING THE FINEST,\nMOST TOPICAL MINERAL SANDS--\nABOUT 65% SILICA, 10% FLUX\nOF SODIUM OXIDE, AND A\nSTABILIZER OF CALCIUM OXIDE--\nWHICH I THEN SMELT IN A\nHIGH-TEMPERATURE CALCERA FURNACE\nAT 1,200 TO 1,400 DEGREES\nCELSIUS, FUSING THEM INTO LIQUID\nGLASS, THEN CAREFULLY DROPPING\nMY FURNACE TEMPERATURE SO I CAN\nFOLD IN HAND-SELECTED CULLET AND\nCOBALT TO OBTAIN MY INTENDED\nCOLOR AND CREATE THE MOST\nPRISMATIC NEWS CRYSTALS, WHICH\nI THEN DELICATELY HANG ON A\nHAND-CRAFTED BALUSTER OF\nHEADLINES, ARRANGING THEM TO\nCATCH AND REFRACT, IN A GENTLE\nDANCE OF LIGHTS AND SHADOWS, THE\nMOST TOPICAL REFLECTIONS OF THE\nDAY, ADORNING THE ARRANGEMENT\nWITH ONE FINAL FINIAL OF QUIPS\nTO PRESENT TO YOU THE VENETIAN\nMURANO GLASS CHANDELIER THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES FOLKS, SOMETIMES,\nFOLKS, SOMETIMES, I JOLT AWAKE\nNAKED IN THE BACK BOOTH OF A\nLONG-ABANDONED COUNTY FAIR, I\nPULL ON SOME OVERALLS I STOLE\nOFF A SCARECROW, AND CLAW\nTHROUGH THE GROUNDS SCRAPING THE\nLEAVINGS OFF SOME DISUSED\nSPARKLERS, PICK THROUGH BROKEN\nCOKE BOTTLES AND BIRTHDAY\nCANDLES, BOIL OFF THE REMNANTS\nIN A DISCARDED TUB OF LAUNDRY\nDETERGENT TO EXTRACT THE\nBENZENE.\nTHEN, USING THE SHOELACES I TOOK\nOFF A HOBO SLEEPING UNDER\nTHE FERRIS WHEEL AND DENTAL\nFLOSS CURRENTLY IN USE BY SAID\nHOBO, I BIND THE CONGLOMERATE\nOF SHARDS AND ACCELERANT\nTOGETHER AND HOLD IT NEAR THE\nSPUTTERING SPARK PLUG OF AN\nOLD ICE CREAM TRUCK TO IGNITE\nTHE CHAOTIC CANDELABRA OF\nFLAMING NEWS THAT IS MY SEGMENT:\n\"MEANWHILE\"!"
+    },
+    "okJDGV6Jjmc": {
+        "begin": "2:03.0",
+        "end": "2:55.0",
+        "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF TIME RIGHT OVER THERE, PORING\nOVER THE DAY'S NEWSIEST, MOST\nTOPICAL NAUTICAL RECORDS TO\nDETERMINE THE ROUGH POSITIONS OF\nTHE DAY'S TRENDING SHIPWRECKS.\nTHEN I USE THE LATEST SONAR TECH\nTO LOCATE AND FIND THE\nORIENTATION OF THE FINEST\nSALVAGE SITE.\nTHEN MY TEAM OF CERTIFIED AND\nLICENSED COMEDY DIVERS DESCEND\nTO THE OCEAN FLOOR AND USE\nCUTTING EDGE UNDERWATER CAMERAS\nTO STITCH TOGETHER THE DETAILED\n3D-MODELED HISTORIC ATOSHA\nSHIPWRECK SITE OF SATIRICAL\nOBSERVATIONS THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES, FOLKS,\nI WAKE UP NAKED BY THE DOCKS\nCOVERED IN PIRATE TATTOOS, BLAST\nA STRING OF WHIPPETS, THEN\nSTAGGER INTO THE FESTERING\nHUDSON RIVER, WHERE I SLOWLY\nSINK THROUGH THE MURK UNTIL I\nIMPALE MYSELF ON THE RUSTY AXLE\nOF A SUNKEN TAXI IN THE\nTETANUS-LACED CRIME SCENE OF\nNEWS THAT IS MY SEGMENT...\n\"MEANWHILE!\""
+    },
+    "pbR-kF0PjlA": {
+        "begin": "1:09.0",
+        "end": "2:43.0",
+        "text": "WELL FOLKS, I SPEND A LOT\nOF MY TIME, ON THE SHOW, RIGHT OVER THERE,\nWANDERING THROUGH THE\nFARMERS' MARKET THAT IS TODAY'S\nBIGGEST STORIES, SQUEEZING THE\nFINEST NEWS RADISHES, THE RIPEST\nSTORY PEPPERS, SNIFFING THE MOST\nTOPICAL DATES, WHICH I THEN PAIR\nWITH FRA'MANI SOPPRESSATA, AND\nTHE MOST SUCCULENT HANDRAISED\nPATA NEGRA JAMON IBERICO, BACKED\nUP BY GENEROUS HELPINGS OF\nBEEMSTER GOUDA, AND A WEDGE OF\nBRILLAT-SAVARIN TRIPLE CREAM\nBRIE, THEN I ADD FORTNUM AND MASON\nAPRICOT AND FIG SPREADS WITH\nGRISSINI BREADSTICKS AND LA\nPANZANELLA CROCCANTINI, AND\nFINALLY LIBERAL SPRINKLINGS OF\nSAN SABA ELLIOT PECANS AND\nSICILIAN CASTEL-VETRANO OLIVES\nON A RAW CARRARA MARBLE SLAB TO\nLAY OUT FOR YOU THE SPECTACULAR\nGOURMET CHARCUTERIE BOARD THAT\nIS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, SOMETIMES, I AM HOSED AWAKE\nINSIDE AN EMPTY 6,000 GALLON\nDIESEL TANKER OFF OF I-24, WHERE I\nAM HIDING FROM A CULT THAT I\nSTARTED, THEN DASH, NAKED BEHIND\nA RECENTLY DEFUNCT QUIZNOS,\nWHERE I MUST WRESTLE A POSSUM\nFOR THE REMAINS OF A BAJA\nCHICKEN FOOTLONG, STAGGER INTO A\nMIDDLE SCHOOL REC. YARD AFTER\nFIGHTING A SEAGULL FOR THE LAST\nHAM CUBE IN A LUNCHABLES TRAY,\nPUNCH A RACCOON TO STEAL HIS\nPEANUT, THEN DUMP IT ALL INTO A\nHUBCAP I STRIPPED OFF AN\nABANDONED '76 CHEVY VEGA TO\nOFFER FOR YOU THE RAIL YARD BUFFET\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "pyhaU-_1Szk": {
+        "begin": "2:48.0",
+        "end": "4:31.0",
+        "text": "YOU KNOW FOLKS, I SPEND A LOT OF\nTIME RIGHT OVER THERE, ISOLATING\nTHE BIGGEST, NEWSIEST STORIES\nOF THE DAY AND CONTAINING THEM\nIN THE MOST TOPICAL CIRCULAR\nTUNNEL, WITH A CIRCUMFERENCE OF\n26.7 KILOMETERS, AND A DEPTH\nRANGING FROM 50 TO 175 METERS.\nTHEN, I ADD TWO ADJACENT\nPARALLEL BEAM-LINES, WHICH\nTRAVEL IN OPPOSITE DIRECTIONS\nAROUND THE RING, INTERSECTING AT\nFOUR POINTS.\nI ADD 1,232 DIPOLE MAGNETS TO\nKEEP THE BEAMS IN THEIR CIRCULAR\nPATH, WHILE AN ADDITIONAL 392\nQUADRUPOLE MAGNETS ARE USED TO\nKEEP THE BEAMS FOCUSED, THEN\nI ADD STRONGER QUADRUPOLE\nMAGNETS CLOSE TO THE\nINTERSECTION POINTS IN ORDER TO\nMAXIMIZE THE CHANCES OF\nINTERACTION BETWEEN THE TWO BEAMS\nCROSS, ALL SO I CAN SMASH JOKE\nPROTONS AGAINST EACH OTHER AT\nNEAR THE SPEED OF LIGHT TO\nGENERATE THE HIGGS BOSON HEAVY\nCOMEDY PARTICLES THAT MAKE UP MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, FOLKS, SOMETIMES \nSOMETIMES, SOMETIMES I WAKE UP IN AN\nABANDONED JUNKYARD, STRAPPED TO\nTHE CHASSIS OF WHAT USED TO BE A\nSCHOOL BUS.\nI GNAW MYSELF FREE OF MY\nRESTRAINTS AND CLIMB ATOP A HILL\nOF CRUSHED MAZDA MIATAS TO UTTER\nA CALL THAT CAN BE HEARD ONLY BY\nTHOSE IN THE MIDST OF A\nLIFE-CHANGING PEYOTE TRIP.\nWITH MY FREAKS GATHERED AROUND ME,\nHOTWIRE AS MANY BURNT-OUT\n'91 BUICK LESABRES AS WE CAN\nFIND TO ANIMATE A FLEET OF\nFURY-ROAD-WORTHY LEMONS,\nTHEN ROLL THEM TO THE ABANDONED\nSUBWAY STATION BELOW CITY HALL,\nWHERE I LAUNCH THEM HEAD ON AT\nTOP SPEED IN THE UNREGULATED\nHOTWHEELS COLOSSAL CRASH TRACK\nOF NEWS THAT IS MY SEGMENT:\nMEANWHILE!"
+    },
+    "q8zlh8XKfLc": {
+        "begin": "1:00.0",
+        "end": "1:54.0",
+        "text": "FOLKS, YOU KNOW, I\nSPEND MOST OF MY TIME, RIGHT\nOVER THERE, WITH MY EARS, MY MIND,\nAND MY HEART OPEN TO THE DAY'S\nBIGGEST STORIES, AUDITIONING AND\nSELECTING ONLY THE MOST TOPICAL\nNEWS-OBOES, THE MOST RELEVANT\nAND LILTING VIOLAS, ROUNDING IT\nOUT WITH SOME NOBLE FRENCH\nHORNS, AND INSOUCIANT\nBASSOONS, THEN COMPOSING AND\nARRANGING THE NEWSIEST, MOST\nUPLIFTING YET BITTERSWEET\nRONDOS, ALLEGROS, SCHERZOS, AND\nSONATAS TO PRESENT TO YOU THE\nTIMELESSLY MOVING YET\nINFORMATIVE POST-MODERN OPUS\nNUMBER ONE SYMPHONY THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, SOMETIMES FOLKS, I WAKE UP AT THE\nWHEEL OF A STOLEN CEMENT TRUCK,\nSNORT ANOTHER RAIL OF KETAMINE\nAND BATH SALTS, THEN I STRIP\nDOWN AND SCAMPER THROUGH A\nCEMETERY TRAPPING RATS UNDER\nRUSTY COFFEE CANS.\nAFTER AN IMPASSIONED SPEECH TO THEM\nABOUT THEIR NEED\nTO HELP ME SAVE AN OLD THEATER, THEY\nACCOMPANY ME ON A RAID TO A\nPRESCHOOL MUSIC ROOM TO STEAL\nITS FLUTES, RECORDERS, AND\nKAZOOS, WHERE I CONDUCT THE\nFUGITIVE VERMIN PHILHARMONIC OF\nNEWS THAT IS MY SEGMENT:\nMEANWHILE!\n"
+    },
+    "qEY5SUevhgU": {
+        "begin": "1:59.0",
+        "end": "3:07.0",
+        "text": "FOLKS, IF YOU\nWATCH THIS SHOW, YOU KNOW I\nSPEND MUCH OF MY TIME,\nRIGHT OVER THERE, PLANTING AND\nGROWING THE DAY'S BIGGEST NEWS\nIN A PARCELED TERROIR AT\nPRECISELY 80 METERS, ON A\nNORTH-FACING SLOPE, WITH JUST\nTHE RIGHT MICROCLIMATE, THEN\nHAND-PICKING ONLY THE RIPEST,\nMOST TOPICAL BOTRYTIS-PRUNED\nSTORY GRAPES.\nAFTER THREE PRESSINGS, I THEN\nCAREFULLY BARREL-AGE THEIR\nNOBLE-ROTTED NECTAR FOR 30\nMONTHS EXCLUSIVELY IN NEW OAK BARRELS TO\nBRING OUT THE AROMAS OF TROPICAL\nFRUITS, HONEYED PEARS, AND\nROASTED NUTS IN THE CHATEAU\nD'YQUEM SAUTERNES THAT IS MY\nNIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES, FOLKS,\nI WAKE UP IN A BULGARIAN PRISON,\nCONVICTED OF WHAT MY NON-ENGLISH\nSPEAKING, COURT-APPOINTED LAWYER\nONLY CALLS \"ANIMAL WRONGS.\"\nI TRADE THE CIGARETTES I WON IN\nA BARE-KNUCKLE MATCH WITH A\nGUARD FOR SOME FIG MARMALADE,\nAPPLE CORES, AND DISCARDED\nKETCHUP PACKETS, TOSS IT ALL IN\nTHE PLASTIC BAG I STOLE OFF A\nCELLMATE DRAGOMIR'S FOOT WHILE\nHE SLEPT, LEAVE IT UNDER A\nFERMENTING PIPE\nOVERNIGHT, TO SERVE UP THE\nSOUR-MASHED GOON PLONK OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "r7NnpAGIkEY": {
+        "begin": "1:46.5",
+        "end": "2:32.5",
+        "text": "AND, YOU KNOW,\nFOLKS, I SPEND A LOT OF TIME ON THIS SHOW, RIGHT\nOVER THERE, CAREFULLY HARVESTING\nTHE HIGHEST-QUALITY ORGANIC ACAI\nNEWS BERRIES, PUTTING THEM INTO\nMY CURRENT EVENTS BLENDER, THEN\nPULSING ON HIGH UNTIL THEY'VE\nBECOME THE SMOOTH PURPLE PUREE OF\nSTORIES TO BE PILED WITH\nGRANOLA, CHIA SEEDS, AND SLICED\nJOKE BANANA, TO MAKE THE\nHIGH-PRICED, ARTISANAL SMOOTHIE\nBOWL OF NEWS THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES\nFOLKS, I LIKE TO SCROUNGE\nTOGETHER SOME EXPIRED KALE FROM\nTHE BACK OF THE FRIDGE, MIX IT\nWITH THE FERMENTING ORANGE\nSLICES LEFT IN THE BACK SEAT\nAFTER LAST WEEK'S LITTLE LEAGUE\nGAME, AND AN APPLE CORE I FOUND\nINSIDE A COFFEE CUP, THEN\nPULVERIZE IT ALL IN A LEAKY\nNUTRI-BULLET TO MAKE THE PRISON\nTOILET GREEN JUICE OF NEWS THAT\nIS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "sKCeqiWA-gQ": {
+        "begin": "0:43.5",
+        "end": "1:36.5",
+        "text": "FOLKS, I SPEND A LOT OF TIME\nRIGHT OVER THERE, NIGHT AFTER NIGHT, COMBING\nTHROUGH THE DAY'S NEWS,\nCAREFULLY SELECTING THE MOST\nTOPICAL, FRAGRANT HERBS AND\nJOKE-RICH ALLIUM, DELICATELY\nSTIRRING THEM INTO A SATIRICAL\nSTOCK, BRINGING THE CONCOCTION\nTO A BREAKING NEWS BOIL BEFORE\nPAINSTAKINGLY EDITING AWAY THE\nSCRAPS, LEAVING ONLY THE PUREST,\nNUTRIENT-RICH CONSOMME OF COMEDY\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, I SWEAT MYSELF\nAWAKE INSIDE A DEFLATED BOUNCY\nCASTLE AT A DEFUNCT AMUSEMENT\nPARK, BREAK INTO A COMBINATION\nGAS STATION PIZZA HUT WHERE I\nTHROW TOGETHER WHATEVER OLD HOT\nDOGS AND REPURPOSED CHEESE\nPRODUCT I CAN GET MY CHAPPED AND\nCRACKING HANDS ON.\nAND THERE, BY THE CRUEL LIGHT OF\nA PIZZA WARMING DRAWER, I DROWN\nTHE MIXTURE IN A CAN OF\nDISCONTINUED SURGE FROM 2002\nBEFORE STRAINING IT THROUGH THE\nGREASE-SOILED BEARD NET TO\nCREATE THE FESTERING MOP BUCKET\nSOUR MASH OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!"
+    },
+    "tSdWz6CvpIc": {
+        "begin": "1:50.0",
+        "end": "2:39.0",
+        "text": "FOLKS, YOU KNOW, IF YOU WATCH\nTHE SHOW, YOU KNOW I SPEND A\nLOT OF MY TIME, RIGHT OVER THERE,\nCAREFULLY WELDING TOGETHER THE\nDAY'S TOP STORIES, FORGED FROM\nTHE FINEST NEWS METALS, WIRING\nIN THE MOST EFFICIENT, HIGH-\nSPEED ENGINE.\nTHEN I COMBINE THE MOST TOPICAL\nTITANIUM ACCENTS WITH ITALIAN\nCURRENT-EVENTS CRAFTSMANSHIP TO\nCREATE THE BESPOKE TRIUMPH\nMOTORCYCLE THAT IS MY MONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS,\nAFTER STANDING TOO LONG OVER AN\nEPHEDRINE BARREL FIRE, I STUMBLE\nINTO A DEFUNCT BODY SHOP, SLAP A\nHALF-EMPTY CANISTER OF PROPANE\nONTO A STOLEN HUFFY, WRAP IT IN\nNEWSPAPER AND BITS OF CAUTION\nTAPE THAT I SWIPED FROM A\nSTILL-ACTIVE CRIME SCENE, AND\nHOOK IT UP TO AN OLD MILK JUG\nFULL OF NITROUS I STOLE FROM A\nBLACK MARKET ORTHODONTIST, IN\nORDER TO MAKE THE FLAMING DEATH\nROCKET OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "thFDua8MF_w": {
+        "begin": "1:21.5",
+        "end": "2:18.5",
+        "text": "FOLKS, I SPEND A\nLOT OF TIME\nRIGHT OVER THERE, COMBING\nTHROUGH THE DAY'S NEWS AND\nCAREFULLY SELECTING THE MOST\nPRISTINE OPALESCENT GLASS\nSTORIES, ORNATELY FUSING THE\nPIECES USING ONLY THE MOST\nTOPICAL COPPER WIRE AND LEAD\nCASING BEFORE COLORING THEM WITH\nTHE MOST PIGMENT-RICH JOKES\nAVAILABLE TO CONSTRUCT FOR YOU AND YOU ALONE\nTHE ELEGANT STAINED-GLASS\nTIFFANY DOME THAT IS MY NIGHTLY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES FOLKS, I JOLT AWAKE\nBEHIND THE WHEEL OF A '79 BUICK\nREGAL LOWRIDER WHILE DOING\nDONUTS IN THE PARKING LOT OF A\nBOARDED UP JOANNE FABRICS, WHEN\nI CLIP A BARREL FIRE AND I'M\nTHROWN FROM THE CAR INTO THE\nDUMPSTERS.\nTHERE, I RUMMAGE THROUGH THE\nBITS OF BROKEN FANTA BOTTLES,\nAND GLUE THEM TOGETHER WITH\nSTILL-WARM CHEWING GUM, AND\nSTAIN THEM WITH WHATEVER\nREMNANTS I CAN SCRAPE FROM OLD\nKETCHUP AND FUN-DIP PACKETS.\nTHEN I DOUSE MY PANTS IN\nKEROSENE AND LET HER BLAZE TO\nPROJECT THE DEMENTED NIGHTMARE\nKALEIDOSCOPE OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE"
+    },
+    "u9oMwS3I12s": {
+        "begin": "0:53.0",
+        "end": "1:47.0",
+        "text": "FOLKS, YOU KNOW, IF YOU WATCH THE SHOW, YOU KNOW I SPEND A LOT OF\nMY TIME RIGHT OVER THERE,\nWORKING THE OLD MIRTH KILN,\nMELTING DOWN THE DAY'S MOST\nIMPORTANT GOLDEN STORY INGOTS\nTO MAKE AN ORNATE SET OF CUSTOM\nNEWS VAMBRACES.\nTHEN CARVE A MOULDED CUIRASS\nINTO THE MOST TOPICAL\nANIMAL-THEMED CHEST PLATE THAT I\nDECORATE WITH FINE FILIGREE AND\nORNATE PRE-COLUMBIAN PATTERNS TO\nCREATE THE BESPOKE SET OF GOLD\nMUISCA ARMOR THAT IS MY\nMONOLOGUE.\nBUT SOMETIMES, JUST SOMETIMES,\nFOLKS, I WAKE UP IN THE BASEMENT\nOF A DERELICT ROW HOUSE DURING A\nFULL MOON, RIFLE THROUGH A\nDISCARDED BOX OF ELBOW PASTA AND\nSTRING SOME NOODLES TOGETHER\nINTO CRUDE SHIN GUARDS WITH\nCOPPER WIRE I STRIPPED OUT OF\nTHE FUNERAL HOME I ROBBED\nEARLIER.\nTHEN, I FASHION A HAT BY\nSTAPLING OLD NEWSPAPER CLIPPINGS\nTO A BIKE HELMET AND WRAP MYSELF\nIN A TARP I SWIPED FROM THE\nRETIREMENT HOME'S HOT TUB TO\nFROLIC BEFORE YOU IN THE\nMADMAN'S HAZMAT SUIT OF NEWS\nTHAT IS MY SEGMENT:\n\"MEANWHILE!\""
+    },
+    "z2dPp5yM-NA": {
+        "begin": "1:34.3",
+        "end": "2:34.3",
+        "text": "YOU KNOW, FOLKS, IF YOU WATCH\nTHIS SHOW, AND I HOPE YOU DO...\nTHEN YOU KNOW I SPEND MOST OF MY TIME\nRIGHT OVER THERE, CAREFULLY\nUNWRAPPING THE DAY'S NEWS,\nPLACING THE FINEST, MOST TOPICAL\nORNAMENTS UPON THE HAND-CUT\nDOUGLAS FIR OF THE DAY'S TOP\nSTORIES, SPRINKLING\nBIODEGRADABLE TINSEL ON THE\nBOUGHS WITH PRECISION AND\nDECADES OF TRAINING THAT COMES\nACROSS AS EFFORTLESS, CHECKING\nEACH JOKE BULB FOR THE OPTIMAL\nTWINKLE, AND FINALLY TOPPING IT\nOFF WITH A FAMILY HEIRLOOM STAR\nTO CREATE THE MAGICAL CHRISTMAS\nMEMORY THAT IS MY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI BREAK INTO A HOME DEPOT, HUFF\nA BOTTLE OF GOO GONE, STEAL A\nPALLET OF TWO BY FOURS, A PILE\nOF RUSTY NAILS, A BUCKET OF\nDISCONTINUED FAUCET PARTS, SLAP\nTHEM TOGETHER WITH A RECEIPT PAPER\nAND INDUSTRIAL ADHESIVE, PUT IT\nOUTSIDE THE LIVING ROOM WINDOW\nOF THE RETIREMENT HOME I WAS\nKICKED OUT OF FOR A VERY GOOD\nREASON, AND THROW IN A COUPLE OF\nMANNEQUINS STOLEN FROM A BURNED\nOUT FOREVER 21 TO CREATE THE\nHELLSCAPE CRECHE OF NEWS THAT IS\nMY SEGMENT:\n\"MEANWHILE\"!"
+    },
+    "zFRXCwdPD-M": {
+        "begin": "1:31.6",
+        "end": "2:10.6",
+        "text": "YOU KNOW, FOLKS, I SPEND A LOT\nOF MY TIME ON THE SHOW RIGHT OVER THERE PRECISELY\nMEASURING THE NEWS' INSEAM,\nSELECTING THE FINEST, MOST\nTOPICAL IMPORTED MERINO STORY\nWOOL, THEN HAND-STITCHING IT\nWITH JOKES TO CREATE FOR YOU THE\nBESPOKE, DOUBLE-BREASTED SAVILE\nROW CURRENT EVENT SUIT THAT IS\nMY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES,\nI LIKE TO GATHER UP SOME USED\nBURLAP FROM BEHIND THE\nMEAT-PACKING PLANT, DRAPE IT\nOVER AN ABANDONED MANNEQUIN AT\nOLD MAN JENKINS' BURNED-DOWN\nDRESS FACTORY, AND SEW IT\nTOGETHER WITH SHOESTRINGS AND A\nSTAPLE GUN, TO CREATE FOR YOU\nTHE HAUNTED POTATO-SACK\nSCARECROW OF NEWS THAT IS MY\nSEGMENT:\n\"MEANWHILE!\""
+    },
+    "zIS1lp9CS-E": {
+        "begin": "1:16.5",
+        "end": "2:04.5",
+        "text": "FOLKS, I SPEND A\nLOT OF TIME RIGHT OVER THERE,\nSELECTING THE FINEST GRAINS OF\nNEWS, HAND SIFTING THROUGH\nBARRELS OF STEELCUT JOKE OATS,\nSELECTING THE RIPEST SEASONAL\nSTORY BERRIES, AND\nHOME-FERMENTING MACROBIOTIC\nALMOND MILK INTO THE UPSCALE ORGANIC\nYOGURT TO LOVINGLY FOLD TOGETHER\nTHE BUZZWORTHY BREAKFAST PARFAIT\nTHAT IS MY NIGHTLY MONOLOGUE.\nBUT SOMETIMES, SOMETIMES I SWEAT MYSELF\nAWAKE IN THE MIDDLE OF THE\nNIGHT, CREEP OUT TO AN ABANDONED\nSCHOOLYARD, SCRAPE A BUNCH OF\nSPILLED CHEERIOS AND DISCARDED\nGOGURT WRAPPERS INTO A BIG GULP\nTRAVEL MUG I WON IN THE GREAT\nTRUCKER-DRIFTER WARS OF 2019,\nADD SOME FERMENTED CRABAPPLES\nFROM BEHIND THE SWING SET, AND\nCHUG BACK THE FETID PRISON\nPORRIDGE OF NEWS THAT IS MY\nSEGMENT:\nMEANWHILE!"
+    }
+}

whisper/language-breakdown.svg ADDED Viewed

whisper/model-card.md ADDED Viewed

	@@ -0,0 +1,69 @@

+# Model Card: Whisper
+This is the official codebase for running the automatic speech recognition (ASR) models (Whisper models) trained and released by OpenAI.
+Following [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993), we're providing some information about the automatic speech recognition model. More information on how these models were trained and evaluated can be found [in the paper](https://arxiv.org/abs/2212.04356).
+## Model Details
+The Whisper models are trained for speech recognition and translation tasks, capable of transcribing speech audio into text in the language in which it is spoken (ASR) as well as translating it into English (speech translation). Researchers at OpenAI developed the models to study the robustness of speech processing systems trained under large-scale weak supervision. There are 9 models of different sizes and capabilities, summarized in the following table.
+|  Size  | Parameters | English-only model | Multilingual model |
+|:------:|:----------:|:------------------:|:------------------:|
+|  tiny  |    39 M    |         ✓          |         ✓          |
+|  base  |    74 M    |         ✓          |         ✓          |
+| small  |   244 M    |         ✓          |         ✓          |
+| medium |   769 M    |         ✓          |         ✓          |
+| large  |   1550 M   |                    |         ✓          |
+In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661).
+### Release date
+September 2022 (original series) and December 2022 (`large-v2`)
+### Model type
+Sequence-to-sequence ASR (automatic speech recognition) and speech translation model
+### Paper & samples
+[Paper](https://arxiv.org/abs/2212.04356) / [Blog](https://openai.com/blog/whisper)
+## Model Use
+### Evaluated Use
+The primary intended users of these models are AI researchers studying the robustness, generalization, capabilities, biases, and constraints of the current model. However, Whisper is also potentially quite useful as an ASR solution for developers, especially for English speech recognition. We recognize that once models are released, it is impossible to restrict access to only “intended” uses or to draw reasonable guidelines around what is or is not research.
+The models are primarily trained and evaluated on ASR and speech translation to English tasks. They show strong ASR results in ~10 languages. They may exhibit additional capabilities, particularly if fine-tuned on certain tasks like voice activity detection, speaker classification, or speaker diarization but have not been robustly evaluated in these areas. We strongly recommend that users perform robust evaluations of the models in a particular context and domain before deploying them.
+In particular, we caution against using Whisper models to transcribe recordings of individuals taken without their consent or purporting to use these models for any kind of subjective classification. We recommend against use in high-risk domains like decision-making contexts, where flaws in accuracy can lead to pronounced flaws in outcomes. The models are intended to transcribe and translate speech; use of the model for classification is not only not evaluated but also not appropriate, particularly to infer human attributes.
+## Training Data
+The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
+As discussed in [the accompanying paper](https://arxiv.org/abs/2212.04356), we see that performance on transcription in a given language is directly correlated with the amount of training data we employ in that language.
+## Performance and Limitations
+Our studies show that, compared with many existing ASR systems, the models exhibit improved robustness to accents, background noise, and technical language, as well as zero-shot translation from multiple languages into English; and that accuracy on speech recognition and translation is near the state-of-the-art level.
+However, because the models are trained in a weakly supervised manner using large-scale noisy data, the predictions may include texts that are not actually spoken in the audio input (i.e. hallucination). We hypothesize that this happens because, given their general knowledge of language, the models combine trying to predict the next word in audio with trying to transcribe the audio itself.
+Our models perform unevenly across languages, and we observe lower accuracy on low-resource and/or low-discoverability languages or languages where we have less training data. The models also exhibit disparate performance on different accents and dialects of particular languages, which may include a higher word error rate across speakers of different genders, races, ages, or other demographic criteria. Our full evaluation results are presented in [the paper accompanying this release](https://arxiv.org/abs/2212.04356).
+In addition, the sequence-to-sequence architecture of the model makes it prone to generating repetitive texts, which can be mitigated to some degree by beam search and temperature scheduling but not perfectly. Further analysis of these limitations is provided in [the paper](https://arxiv.org/abs/2212.04356). This behavior and hallucinations are likely to be worse in lower-resource and/or lower-discoverability languages.
+## Broader Implications
+We anticipate that Whisper models’ transcription capabilities may be used for improving accessibility tools. While Whisper models cannot be used for real-time transcription out of the box, their speed and size suggest that others may be able to build applications on top of them that allow for near-real-time speech recognition and translation. The real value of beneficial applications built on top of Whisper models suggests that the disparate performance of these models may have real economic implications.
+There are also potential dual-use concerns that come with releasing Whisper. While we hope the technology will be used primarily for beneficial purposes, making ASR technology more accessible could enable more actors to build capable surveillance technologies or scale up existing surveillance efforts, as the speed and accuracy allow for affordable automatic transcription and translation of large volumes of audio communication. Moreover, these models may have some capabilities to recognize specific individuals out of the box, which in turn presents safety concerns related both to dual use and disparate performance. In practice, we expect that the cost of transcription is not the limiting factor of scaling up surveillance projects.

whisper/notebooks/LibriSpeech.ipynb ADDED Viewed

	@@ -0,0 +1,958 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "v5hvo8QWN-a9"
+   },
+   "source": [
+    "# Installing Whisper\n",
+    "\n",
+    "The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "ZsJUxc0aRsAf"
+   },
+   "outputs": [],
+   "source": [
+    "! pip install git+https://github.com/openai/whisper.git\n",
+    "! pip install jiwer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1IMEkgyagYto"
+   },
+   "source": [
+    "# Loading the LibriSpeech dataset\n",
+    "\n",
+    "The following will load the test-clean split of the LibriSpeech corpus using torchaudio."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "3CqtR2Fi5-vP"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "\n",
+    "try:\n",
+    "    import tensorflow  # required in Colab to avoid protobuf compatibility issues\n",
+    "except ImportError:\n",
+    "    pass\n",
+    "\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "import whisper\n",
+    "import torchaudio\n",
+    "\n",
+    "from tqdm.notebook import tqdm\n",
+    "\n",
+    "\n",
+    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "GuCCB2KYOJCE"
+   },
+   "outputs": [],
+   "source": [
+    "class LibriSpeech(torch.utils.data.Dataset):\n",
+    "    \"\"\"\n",
+    "    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.\n",
+    "    It will drop the last few seconds of a very small portion of the utterances.\n",
+    "    \"\"\"\n",
+    "    def __init__(self, split=\"test-clean\", device=DEVICE):\n",
+    "        self.dataset = torchaudio.datasets.LIBRISPEECH(\n",
+    "            root=os.path.expanduser(\"~/.cache\"),\n",
+    "            url=split,\n",
+    "            download=True,\n",
+    "        )\n",
+    "        self.device = device\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.dataset)\n",
+    "\n",
+    "    def __getitem__(self, item):\n",
+    "        audio, sample_rate, text, _, _, _ = self.dataset[item]\n",
+    "        assert sample_rate == 16000\n",
+    "        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)\n",
+    "        mel = whisper.log_mel_spectrogram(audio)\n",
+    "        \n",
+    "        return (mel, text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "id": "-YcRU5jqNqo2"
+   },
+   "outputs": [],
+   "source": [
+    "dataset = LibriSpeech(\"test-clean\")\n",
+    "loader = torch.utils.data.DataLoader(dataset, batch_size=16)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0ljocCNuUAde"
+   },
+   "source": [
+    "# Running inference on the dataset using a base Whisper model\n",
+    "\n",
+    "The following will take a few minutes to transcribe all utterances in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "_PokfNJtOYNu",
+    "outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model is English-only and has 71,825,408 parameters.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = whisper.load_model(\"base.en\")\n",
+    "print(\n",
+    "    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
+    "    f\"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters.\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# predict without timestamps for short-form transcription\n",
+    "options = whisper.DecodingOptions(language=\"en\", without_timestamps=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 49,
+     "referenced_widgets": [
+      "09a29a91f58d4462942505a3cc415801",
+      "83391f98a240490987c397048fc1a0d4",
+      "06b9aa5f49fa44ba8c93b647dc7db224",
+      "da9c231ee67047fb89073c95326b72a5",
+      "48da931ebe7f4fd299f8c98c7d2460ff",
+      "7a901f447c1d477bb49f954e0feacedd",
+      "39f5a6ae8ba74c8598f9c6d5b8ad2d65",
+      "a0d10a42c753453283e5219c22239337",
+      "09f4cb79ff86465aaf48b0de24869af9",
+      "1b9cecf5b3584fba8258a81d4279a25b",
+      "039b53f2702c4179af7e0548018d0588"
+     ]
+    },
+    "id": "7OWTn_KvNk59",
+    "outputId": "a813a792-3c91-4144-f11f-054fd6778023"
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9df048b46f764cf68cbe0045b8ff73a8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/164 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "hypotheses = []\n",
+    "references = []\n",
+    "\n",
+    "for mels, texts in tqdm(loader):\n",
+    "    results = model.decode(mels, options)\n",
+    "    hypotheses.extend([result.text for result in results])\n",
+    "    references.extend(texts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 424
+    },
+    "id": "4nTyynELQ42j",
+    "outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>hypothesis</th>\n",
+       "      <th>reference</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>He hoped there would be stew for dinner, turni...</td>\n",
+       "      <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Stuffered into you, his belly counseled him.</td>\n",
+       "      <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>After early nightfall the yellow lamps would l...</td>\n",
+       "      <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Hello Bertie, any good in your mind?</td>\n",
+       "      <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
+       "      <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2615</th>\n",
+       "      <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
+       "      <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2616</th>\n",
+       "      <td>Then I, long tried by natural ills, received t...</td>\n",
+       "      <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2617</th>\n",
+       "      <td>I love thee freely as men strive for right. I ...</td>\n",
+       "      <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2618</th>\n",
+       "      <td>I love thee with the passion put to use, in my...</td>\n",
+       "      <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2619</th>\n",
+       "      <td>I love thee with the love I seemed to lose wit...</td>\n",
+       "      <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2620 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             hypothesis  \\\n",
+       "0     He hoped there would be stew for dinner, turni...   \n",
+       "1          Stuffered into you, his belly counseled him.   \n",
+       "2     After early nightfall the yellow lamps would l...   \n",
+       "3                  Hello Bertie, any good in your mind?   \n",
+       "4     Number 10. Fresh Nelly is waiting on you. Good...   \n",
+       "...                                                 ...   \n",
+       "2615  Oh, to shoot my soul's full meaning into futur...   \n",
+       "2616  Then I, long tried by natural ills, received t...   \n",
+       "2617  I love thee freely as men strive for right. I ...   \n",
+       "2618  I love thee with the passion put to use, in my...   \n",
+       "2619  I love thee with the love I seemed to lose wit...   \n",
+       "\n",
+       "                                              reference  \n",
+       "0     HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...  \n",
+       "1            STUFF IT INTO YOU HIS BELLY COUNSELLED HIM  \n",
+       "2     AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...  \n",
+       "3                    HELLO BERTIE ANY GOOD IN YOUR MIND  \n",
+       "4     NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...  \n",
+       "...                                                 ...  \n",
+       "2615  OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...  \n",
+       "2616  THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...  \n",
+       "2617  I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...  \n",
+       "2618  I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...  \n",
+       "2619  I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...  \n",
+       "\n",
+       "[2620 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))\n",
+    "data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "HPppEJRXX4ox"
+   },
+   "source": [
+    "# Calculating the word error rate\n",
+    "\n",
+    "Now, we use our English normalizer implementation to standardize the transcription and calculate the WER."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "id": "dl-KBDflMhrg"
+   },
+   "outputs": [],
+   "source": [
+    "import jiwer\n",
+    "from whisper.normalizers import EnglishTextNormalizer\n",
+    "\n",
+    "normalizer = EnglishTextNormalizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 641
+    },
+    "id": "6-O048q4WI4o",
+    "outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>hypothesis</th>\n",
+       "      <th>reference</th>\n",
+       "      <th>hypothesis_clean</th>\n",
+       "      <th>reference_clean</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>He hoped there would be stew for dinner, turni...</td>\n",
+       "      <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
+       "      <td>he hoped there would be stew for dinner turnip...</td>\n",
+       "      <td>he hoped there would be stew for dinner turnip...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Stuffered into you, his belly counseled him.</td>\n",
+       "      <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
+       "      <td>stuffered into you his belly counseled him</td>\n",
+       "      <td>stuff it into you his belly counseled him</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>After early nightfall the yellow lamps would l...</td>\n",
+       "      <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
+       "      <td>after early nightfall the yellow lamps would l...</td>\n",
+       "      <td>after early nightfall the yellow lamps would l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Hello Bertie, any good in your mind?</td>\n",
+       "      <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
+       "      <td>hello bertie any good in your mind</td>\n",
+       "      <td>hello bertie any good in your mind</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
+       "      <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
+       "      <td>number 10 fresh nelly is waiting on you good n...</td>\n",
+       "      <td>number 10 fresh nelly is waiting on you good n...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2615</th>\n",
+       "      <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
+       "      <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
+       "      <td>0 to shoot my soul is full meaning into future...</td>\n",
+       "      <td>0 to shoot my soul is full meaning into future...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2616</th>\n",
+       "      <td>Then I, long tried by natural ills, received t...</td>\n",
+       "      <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
+       "      <td>then i long tried by natural ills received the...</td>\n",
+       "      <td>then i long tried by natural ills received the...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2617</th>\n",
+       "      <td>I love thee freely as men strive for right. I ...</td>\n",
+       "      <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
+       "      <td>i love thee freely as men strive for right i l...</td>\n",
+       "      <td>i love thee freely as men strive for right i l...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2618</th>\n",
+       "      <td>I love thee with the passion put to use, in my...</td>\n",
+       "      <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
+       "      <td>i love thee with the passion put to use in my ...</td>\n",
+       "      <td>i love thee with the passion put to use in my ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2619</th>\n",
+       "      <td>I love thee with the love I seemed to lose wit...</td>\n",
+       "      <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
+       "      <td>i love thee with the love i seemed to lose wit...</td>\n",
+       "      <td>i love thee with a love i seemed to lose with ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2620 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                             hypothesis  \\\n",
+       "0     He hoped there would be stew for dinner, turni...   \n",
+       "1          Stuffered into you, his belly counseled him.   \n",
+       "2     After early nightfall the yellow lamps would l...   \n",
+       "3                  Hello Bertie, any good in your mind?   \n",
+       "4     Number 10. Fresh Nelly is waiting on you. Good...   \n",
+       "...                                                 ...   \n",
+       "2615  Oh, to shoot my soul's full meaning into futur...   \n",
+       "2616  Then I, long tried by natural ills, received t...   \n",
+       "2617  I love thee freely as men strive for right. I ...   \n",
+       "2618  I love thee with the passion put to use, in my...   \n",
+       "2619  I love thee with the love I seemed to lose wit...   \n",
+       "\n",
+       "                                              reference  \\\n",
+       "0     HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...   \n",
+       "1            STUFF IT INTO YOU HIS BELLY COUNSELLED HIM   \n",
+       "2     AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...   \n",
+       "3                    HELLO BERTIE ANY GOOD IN YOUR MIND   \n",
+       "4     NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...   \n",
+       "...                                                 ...   \n",
+       "2615  OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...   \n",
+       "2616  THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...   \n",
+       "2617  I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...   \n",
+       "2618  I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...   \n",
+       "2619  I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...   \n",
+       "\n",
+       "                                       hypothesis_clean  \\\n",
+       "0     he hoped there would be stew for dinner turnip...   \n",
+       "1            stuffered into you his belly counseled him   \n",
+       "2     after early nightfall the yellow lamps would l...   \n",
+       "3                    hello bertie any good in your mind   \n",
+       "4     number 10 fresh nelly is waiting on you good n...   \n",
+       "...                                                 ...   \n",
+       "2615  0 to shoot my soul is full meaning into future...   \n",
+       "2616  then i long tried by natural ills received the...   \n",
+       "2617  i love thee freely as men strive for right i l...   \n",
+       "2618  i love thee with the passion put to use in my ...   \n",
+       "2619  i love thee with the love i seemed to lose wit...   \n",
+       "\n",
+       "                                        reference_clean  \n",
+       "0     he hoped there would be stew for dinner turnip...  \n",
+       "1             stuff it into you his belly counseled him  \n",
+       "2     after early nightfall the yellow lamps would l...  \n",
+       "3                    hello bertie any good in your mind  \n",
+       "4     number 10 fresh nelly is waiting on you good n...  \n",
+       "...                                                 ...  \n",
+       "2615  0 to shoot my soul is full meaning into future...  \n",
+       "2616  then i long tried by natural ills received the...  \n",
+       "2617  i love thee freely as men strive for right i l...  \n",
+       "2618  i love thee with the passion put to use in my ...  \n",
+       "2619  i love thee with a love i seemed to lose with ...  \n",
+       "\n",
+       "[2620 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[\"hypothesis_clean\"] = [normalizer(text) for text in data[\"hypothesis\"]]\n",
+    "data[\"reference_clean\"] = [normalizer(text) for text in data[\"reference\"]]\n",
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "EBGSITeBYPTT",
+    "outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WER: 4.26 %\n"
+     ]
+    }
+   ],
+   "source": [
+    "wer = jiwer.wer(list(data[\"reference_clean\"]), list(data[\"hypothesis_clean\"]))\n",
+    "\n",
+    "print(f\"WER: {wer * 100:.2f} %\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "provenance": []
+  },
+  "gpuClass": "standard",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.9"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "039b53f2702c4179af7e0548018d0588": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "DescriptionStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "DescriptionStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "description_width": ""
+     }
+    },
+    "06b9aa5f49fa44ba8c93b647dc7db224": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "FloatProgressModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "FloatProgressModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "ProgressView",
+      "bar_style": "success",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337",
+      "max": 164,
+      "min": 0,
+      "orientation": "horizontal",
+      "style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9",
+      "value": 164
+     }
+    },
+    "09a29a91f58d4462942505a3cc415801": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HBoxModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HBoxModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HBoxView",
+      "box_style": "",
+      "children": [
+       "IPY_MODEL_83391f98a240490987c397048fc1a0d4",
+       "IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224",
+       "IPY_MODEL_da9c231ee67047fb89073c95326b72a5"
+      ],
+      "layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff"
+     }
+    },
+    "09f4cb79ff86465aaf48b0de24869af9": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "ProgressStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "ProgressStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "bar_color": null,
+      "description_width": ""
+     }
+    },
+    "1b9cecf5b3584fba8258a81d4279a25b": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
+    },
+    "39f5a6ae8ba74c8598f9c6d5b8ad2d65": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "DescriptionStyleModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "DescriptionStyleModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "StyleView",
+      "description_width": ""
+     }
+    },
+    "48da931ebe7f4fd299f8c98c7d2460ff": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
+    },
+    "7a901f447c1d477bb49f954e0feacedd": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
+    },
+    "83391f98a240490987c397048fc1a0d4": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HTMLModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HTMLModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HTMLView",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd",
+      "placeholder": "",
+      "style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65",
+      "value": "100%"
+     }
+    },
+    "a0d10a42c753453283e5219c22239337": {
+     "model_module": "@jupyter-widgets/base",
+     "model_module_version": "1.2.0",
+     "model_name": "LayoutModel",
+     "state": {
+      "_model_module": "@jupyter-widgets/base",
+      "_model_module_version": "1.2.0",
+      "_model_name": "LayoutModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/base",
+      "_view_module_version": "1.2.0",
+      "_view_name": "LayoutView",
+      "align_content": null,
+      "align_items": null,
+      "align_self": null,
+      "border": null,
+      "bottom": null,
+      "display": null,
+      "flex": null,
+      "flex_flow": null,
+      "grid_area": null,
+      "grid_auto_columns": null,
+      "grid_auto_flow": null,
+      "grid_auto_rows": null,
+      "grid_column": null,
+      "grid_gap": null,
+      "grid_row": null,
+      "grid_template_areas": null,
+      "grid_template_columns": null,
+      "grid_template_rows": null,
+      "height": null,
+      "justify_content": null,
+      "justify_items": null,
+      "left": null,
+      "margin": null,
+      "max_height": null,
+      "max_width": null,
+      "min_height": null,
+      "min_width": null,
+      "object_fit": null,
+      "object_position": null,
+      "order": null,
+      "overflow": null,
+      "overflow_x": null,
+      "overflow_y": null,
+      "padding": null,
+      "right": null,
+      "top": null,
+      "visibility": null,
+      "width": null
+     }
+    },
+    "da9c231ee67047fb89073c95326b72a5": {
+     "model_module": "@jupyter-widgets/controls",
+     "model_module_version": "1.5.0",
+     "model_name": "HTMLModel",
+     "state": {
+      "_dom_classes": [],
+      "_model_module": "@jupyter-widgets/controls",
+      "_model_module_version": "1.5.0",
+      "_model_name": "HTMLModel",
+      "_view_count": null,
+      "_view_module": "@jupyter-widgets/controls",
+      "_view_module_version": "1.5.0",
+      "_view_name": "HTMLView",
+      "description": "",
+      "description_tooltip": null,
+      "layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b",
+      "placeholder": "",
+      "style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
+      "value": " 164/164 [05:08&lt;00:00,  1.86s/it]"
+     }
+    }
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

whisper/notebooks/Multilingual_ASR.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

whisper/pyproject.toml ADDED Viewed

	@@ -0,0 +1,8 @@

+[tool.black]
+[tool.isort]
+profile = "black"
+include_trailing_comma = true
+line_length = 88
+multi_line_output = 3

whisper/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+numba
+numpy
+torch
+tqdm
+more-itertools
+tiktoken==0.3.3

whisper/setup.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import os
+import platform
+import sys
+import pkg_resources
+from setuptools import find_packages, setup
+def read_version(fname="whisper/version.py"):
+    exec(compile(open(fname, encoding="utf-8").read(), fname, "exec"))
+    return locals()["__version__"]
+requirements = []
+if sys.platform.startswith("linux") and platform.machine() == "x86_64":
+    requirements.append("triton==2.0.0")
+setup(
+    name="openai-whisper",
+    py_modules=["whisper"],
+    version=read_version(),
+    description="Robust Speech Recognition via Large-Scale Weak Supervision",
+    long_description=open("README.md", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    readme="README.md",
+    python_requires=">=3.8",
+    author="OpenAI",
+    url="https://github.com/openai/whisper",
+    license="MIT",
+    packages=find_packages(exclude=["tests*"]),
+    install_requires=requirements
+    + [
+        str(r)
+        for r in pkg_resources.parse_requirements(
+            open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
+        )
+    ],
+    entry_points={
+        "console_scripts": ["whisper=whisper.transcribe:cli"],
+    },
+    include_package_data=True,
+    extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
+)

whisper/tests/conftest.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import random as rand
+import numpy
+import pytest
+def pytest_configure(config):
+    config.addinivalue_line("markers", "requires_cuda")
+@pytest.fixture
+def random():
+    rand.seed(42)
+    numpy.random.seed(42)

whisper/tests/jfk.flac ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63a4b1e4c1dc655ac70961ffbf518acd249df237e5a0152faae9a4a836949715
+size 1152693

whisper/tests/test_audio.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import os.path
+import numpy as np
+from whisper.audio import SAMPLE_RATE, load_audio, log_mel_spectrogram
+def test_audio():
+    audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
+    audio = load_audio(audio_path)
+    assert audio.ndim == 1
+    assert SAMPLE_RATE * 10 < audio.shape[0] < SAMPLE_RATE * 12
+    assert 0 < audio.std() < 1
+    mel_from_audio = log_mel_spectrogram(audio)
+    mel_from_file = log_mel_spectrogram(audio_path)
+    assert np.allclose(mel_from_audio, mel_from_file)
+    assert mel_from_audio.max() - mel_from_audio.min() <= 2.0

whisper/tests/test_normalizer.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import pytest
+from whisper.normalizers import EnglishTextNormalizer
+from whisper.normalizers.english import (
+    EnglishNumberNormalizer,
+    EnglishSpellingNormalizer,
+)
+@pytest.mark.parametrize("std", [EnglishNumberNormalizer(), EnglishTextNormalizer()])
+def test_number_normalizer(std):
+    assert std("two") == "2"
+    assert std("thirty one") == "31"
+    assert std("five twenty four") == "524"
+    assert std("nineteen ninety nine") == "1999"
+    assert std("twenty nineteen") == "2019"
+    assert std("two point five million") == "2500000"
+    assert std("four point two billions") == "4200000000s"
+    assert std("200 thousand") == "200000"
+    assert std("200 thousand dollars") == "$200000"
+    assert std("$20 million") == "$20000000"
+    assert std("€52.4 million") == "€52400000"
+    assert std("£77 thousands") == "£77000s"
+    assert std("two double o eight") == "2008"
+    assert std("three thousand twenty nine") == "3029"
+    assert std("forty three thousand two hundred sixty") == "43260"
+    assert std("forty three thousand two hundred and sixty") == "43260"
+    assert std("nineteen fifties") == "1950s"
+    assert std("thirty first") == "31st"
+    assert std("thirty three thousand and three hundred and thirty third") == "33333rd"
+    assert std("three billion") == "3000000000"
+    assert std("millions") == "1000000s"
+    assert std("july third twenty twenty") == "july 3rd 2020"
+    assert std("august twenty sixth twenty twenty one") == "august 26th 2021"
+    assert std("3 14") == "3 14"
+    assert std("3.14") == "3.14"
+    assert std("3 point 2") == "3.2"
+    assert std("3 point 14") == "3.14"
+    assert std("fourteen point 4") == "14.4"
+    assert std("two point two five dollars") == "$2.25"
+    assert std("two hundred million dollars") == "$200000000"
+    assert std("$20.1 million") == "$20100000"
+    assert std("ninety percent") == "90%"
+    assert std("seventy six per cent") == "76%"
+    assert std("double oh seven") == "007"
+    assert std("double zero seven") == "007"
+    assert std("nine one one") == "911"
+    assert std("nine double one") == "911"
+    assert std("one triple oh one") == "10001"
+    assert std("two thousandth") == "2000th"
+    assert std("thirty two thousandth") == "32000th"
+    assert std("minus 500") == "-500"
+    assert std("positive twenty thousand") == "+20000"
+    assert std("two dollars and seventy cents") == "$2.70"
+    assert std("3 cents") == "¢3"
+    assert std("$0.36") == "¢36"
+    assert std("three euros and sixty five cents") == "€3.65"
+    assert std("three and a half million") == "3500000"
+    assert std("forty eight and a half dollars") == "$48.5"
+    assert std("b747") == "b 747"
+    assert std("10 th") == "10th"
+    assert std("10th") == "10th"
+def test_spelling_normalizer():
+    std = EnglishSpellingNormalizer()
+    assert std("mobilisation") == "mobilization"
+    assert std("cancelation") == "cancellation"
+def test_text_normalizer():
+    std = EnglishTextNormalizer()
+    assert std("Let's") == "let us"
+    assert std("he's like") == "he is like"
+    assert std("she's been like") == "she has been like"
+    assert std("10km") == "10 km"
+    assert std("10mm") == "10 mm"
+    assert std("RC232") == "rc 232"
+    assert (
+        std("Mr. Park visited Assoc. Prof. Kim Jr.")
+        == "mister park visited associate professor kim junior"
+    )

whisper/tests/test_timing.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import numpy as np
+import pytest
+import scipy.ndimage
+import torch
+from whisper.timing import dtw_cpu, dtw_cuda, median_filter
+# (N, M) matrix sizes used to parametrize the DTW tests below
+sizes = [
+    (10, 20),
+    (32, 16),
+    (123, 1500),
+    (234, 189),
+]
+# tensor shapes (1-D up to 4-D) used to parametrize the median filter tests below
+shapes = [
+    (10,),
+    (1, 15),
+    (4, 5, 345),
+    (6, 12, 240, 512),
+]
+@pytest.mark.parametrize("N, M", sizes)
+def test_dtw(N: int, M: int):
+    """
+    Build an (N, M) cost matrix containing a known low-cost path from (0, 0) to
+    (N - 1, M - 1) and check that dtw_cpu returns exactly that path.
+    """
+    # N - 1 moves along the first axis (0) and M - 1 moves along the second (1), shuffled
+    steps = np.concatenate([np.zeros(N - 1), np.ones(M - 1)])
+    np.random.shuffle(steps)
+    # random costs; every cell on the reference path is lowered by 1 in the loop below
+    x = np.random.random((N, M)).astype(np.float32)
+    i, j, k = 0, 0, 0
+    trace = []
+    while True:
+        # mark the current cell as part of the expected path
+        x[i, j] -= 1
+        trace.append((i, j))
+        # all steps consumed: the walk has reached its final cell
+        if k == len(steps):
+            break
+        # two consecutive steps in different directions are merged into one diagonal move
+        if k + 1 < len(steps) and steps[k] != steps[k + 1]:
+            i += 1
+            j += 1
+            k += 2
+            continue
+        # otherwise take a single step along the axis selected by steps[k]
+        if steps[k] == 0:
+            i += 1
+        if steps[k] == 1:
+            j += 1
+        k += 1
+    # transpose so that the first row holds the i indices and the second the j indices
+    trace = np.array(trace).T
+    dtw_trace = dtw_cpu(x)
+    assert np.allclose(trace, dtw_trace)
+@pytest.mark.requires_cuda
+@pytest.mark.parametrize("N, M", sizes)
+def test_dtw_cuda_equivalence(N: int, M: int):
+    x_numpy = np.random.randn(N, M).astype(np.float32)
+    x_cuda = torch.from_numpy(x_numpy).cuda()
+    trace_cpu = dtw_cpu(x_numpy)
+    trace_cuda = dtw_cuda(x_cuda)
+    assert np.allclose(trace_cpu, trace_cuda)
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter(shape):
+    x = torch.randn(*shape)
+    for filter_width in [3, 5, 7, 13]:
+        filtered = median_filter(x, filter_width)
+        # using np.pad to reflect-pad, because Scipy's behavior is different near the edges.
+        pad_width = filter_width // 2
+        padded_x = np.pad(
+            x, [(0, 0)] * (x.ndim - 1) + [(pad_width, pad_width)], mode="reflect"
+        )
+        scipy_filtered = scipy.ndimage.median_filter(
+            padded_x, [1] * (x.ndim - 1) + [filter_width]
+        )
+        scipy_filtered = scipy_filtered[..., pad_width:-pad_width]
+        assert np.allclose(filtered, scipy_filtered)
+@pytest.mark.requires_cuda
+@pytest.mark.parametrize("shape", shapes)
+def test_median_filter_equivalence(shape):
+    x = torch.randn(*shape)
+    for filter_width in [3, 5, 7, 13]:
+        filtered_cpu = median_filter(x, filter_width)
+        filtered_gpu = median_filter(x.cuda(), filter_width).cpu()
+        assert np.allclose(filtered_cpu, filtered_gpu)

whisper/tests/test_tokenizer.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from whisper.tokenizer import get_tokenizer
+def test_tokenizer():
+    gpt2_tokenizer = get_tokenizer(multilingual=False)
+    multilingual_tokenizer = get_tokenizer(multilingual=True)
+    text = "다람쥐 헌 쳇바퀴에 타고파"
+    gpt2_tokens = gpt2_tokenizer.encode(text)
+    multilingual_tokens = multilingual_tokenizer.encode(text)
+    assert gpt2_tokenizer.decode(gpt2_tokens) == text
+    assert multilingual_tokenizer.decode(multilingual_tokens) == text
+    assert len(gpt2_tokens) > len(multilingual_tokens)
+def test_split_on_unicode():
+    multilingual_tokenizer = get_tokenizer(multilingual=True)
+    tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
+    words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
+    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]

whisper/tests/test_transcribe.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import os
+import pytest
+import torch
+import whisper
+from whisper.tokenizer import get_tokenizer
+@pytest.mark.parametrize("model_name", whisper.available_models())
+def test_transcribe(model_name: str):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = whisper.load_model(model_name).to(device)
+    audio_path = os.path.join(os.path.dirname(__file__), "jfk.flac")
+    language = "en" if model_name.endswith(".en") else None
+    result = model.transcribe(
+        audio_path, language=language, temperature=0.0, word_timestamps=True
+    )
+    assert result["language"] == "en"
+    assert result["text"] == "".join([s["text"] for s in result["segments"]])
+    transcription = result["text"].lower()
+    assert "my fellow americans" in transcription
+    assert "your country" in transcription
+    assert "do for you" in transcription
+    tokenizer = get_tokenizer(model.is_multilingual)
+    all_tokens = [t for s in result["segments"] for t in s["tokens"]]
+    assert tokenizer.decode(all_tokens) == result["text"]
+    assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>")
+    timing_checked = False
+    for segment in result["segments"]:
+        for timing in segment["words"]:
+            assert timing["start"] < timing["end"]
+            if timing["word"].strip(" ,") == "Americans":
+                assert timing["start"] <= 1.8
+                assert timing["end"] >= 1.8
+                timing_checked = True
+    assert timing_checked

whisper/whisper/__init__.py ADDED Viewed

	@@ -0,0 +1,154 @@

+import hashlib
+import io
+import os
+import urllib
+import warnings
+from typing import List, Optional, Union
+import torch
+from tqdm import tqdm
+from .audio import load_audio, log_mel_spectrogram, pad_or_trim
+from .decoding import DecodingOptions, DecodingResult, decode, detect_language
+from .model import ModelDimensions, Whisper
+from .transcribe import transcribe
+from .version import __version__
+# official model name -> checkpoint download URL; the second-to-last path component
+# of each URL is the SHA256 digest that `_download` verifies the file against.
+# "large" points to the same checkpoint as "large-v2".
+_MODELS = {
+    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
+    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
+    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
+    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
+    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
+    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
+    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
+    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
+    "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt",
+    "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+    "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+}
+# base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
+# highly correlated to the word-level timing, i.e. the alignment between audio and text tokens.
+# Keyed by the same model names as `_MODELS`; `load_model` passes the entry for an
+# official model to `model.set_alignment_heads`.
+_ALIGNMENT_HEADS = {
+    "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00",
+    "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO",
+    "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00",
+    "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-<FaQ7m",
+    "small.en": b"ABzY8>?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00",
+    "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P<N0000",
+    "medium.en": b"ABzY8usPae0{>%R7<zz_OvQ{)4kMa0BMw6u5rT}kRKX;$NfYBv00*Hl@qhsU00",
+    "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
+    "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
+    "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
+    "large": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
+}
+def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
+    os.makedirs(root, exist_ok=True)
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, os.path.basename(url))
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+    if os.path.isfile(download_target):
+        with open(download_target, "rb") as f:
+            model_bytes = f.read()
+        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+            return model_bytes if in_memory else download_target
+        else:
+            warnings.warn(
+                f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
+            )
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(
+            total=int(source.info().get("Content-Length")),
+            ncols=80,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+                output.write(buffer)
+                loop.update(len(buffer))
+    model_bytes = open(download_target, "rb").read()
+    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
+        raise RuntimeError(
+            "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
+        )
+    return model_bytes if in_memory else download_target
+def available_models() -> List[str]:
+    """Returns the names of available models"""
+    return list(_MODELS.keys())
+def load_model(
+    name: str,
+    device: Optional[Union[str, torch.device]] = None,
+    download_root: str = None,
+    in_memory: bool = False,
+) -> Whisper:
+    """
+    Load a Whisper ASR model
+    Parameters
+    ----------
+    name : str
+        one of the official model names listed by `whisper.available_models()`, or
+        path to a model checkpoint containing the model dimensions and the model state_dict.
+    device : Union[str, torch.device]
+        the PyTorch device to put the model into
+    download_root: str
+        path to download the model files; by default, it uses "~/.cache/whisper"
+    in_memory: bool
+        whether to preload the model weights into host memory
+    Returns
+    -------
+    model : Whisper
+        The Whisper ASR model instance
+    """
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    if download_root is None:
+        default = os.path.join(os.path.expanduser("~"), ".cache")
+        download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
+    if name in _MODELS:
+        checkpoint_file = _download(_MODELS[name], download_root, in_memory)
+        alignment_heads = _ALIGNMENT_HEADS[name]
+    elif os.path.isfile(name):
+        checkpoint_file = open(name, "rb").read() if in_memory else name
+        alignment_heads = None
+    else:
+        raise RuntimeError(
+            f"Model {name} not found; available models = {available_models()}"
+        )
+    with (
+        io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
+    ) as fp:
+        checkpoint = torch.load(fp, map_location=device)
+    del checkpoint_file
+    dims = ModelDimensions(**checkpoint["dims"])
+    model = Whisper(dims)
+    model.load_state_dict(checkpoint["model_state_dict"])
+    if alignment_heads is not None:
+        model.set_alignment_heads(alignment_heads)
+    return model.to(device)

whisper/whisper/__main__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from .transcribe import cli
2	+
3	+ cli()

whisper/whisper/assets/gpt2.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

whisper/whisper/assets/mel_filters.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd2cc75e70e36fcbdd8ffbc2499062f30094093e6bf2cbafa9859f59972b420b
+size 2048

whisper/whisper/assets/multilingual.tiktoken ADDED Viewed

The diff for this file is too large to render. See raw diff

whisper/whisper/audio.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import os
+from functools import lru_cache
+from subprocess import CalledProcessError, run
+from typing import Optional, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from .utils import exact_div
+# hard-coded audio hyperparameters
+SAMPLE_RATE = 16000
+N_FFT = 400
+N_MELS = 80
+HOP_LENGTH = 160
+CHUNK_LENGTH = 30
+N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
+N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input
+# derived quantities
+N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions have stride 2
+FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
+TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
+def load_audio(file: str, sr: int = SAMPLE_RATE):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+    sr: int
+        The sample rate to resample the audio if necessary
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    # This launches a subprocess to decode audio while down-mixing
+    # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sr),
+        "-"
+    ]
+    # fmt: on
+    try:
+        out = run(cmd, capture_output=True, check=True).stdout
+    except CalledProcessError as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
+    """
+    if torch.is_tensor(array):
+        if array.shape[axis] > length:
+            array = array.index_select(
+                dim=axis, index=torch.arange(length, device=array.device)
+            )
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
+    else:
+        if array.shape[axis] > length:
+            array = array.take(indices=range(length), axis=axis)
+        if array.shape[axis] < length:
+            pad_widths = [(0, 0)] * array.ndim
+            pad_widths[axis] = (0, length - array.shape[axis])
+            array = np.pad(array, pad_widths)
+    return array
+@lru_cache(maxsize=None)
+def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+    """
+    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
+    Allows decoupling librosa dependency; saved using:
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
+        )
+    """
+    assert n_mels == 80, f"Unsupported n_mels: {n_mels}"
+    with np.load(
+        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
+    ) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+def log_mel_spectrogram(
+    audio: Union[str, np.ndarray, torch.Tensor],
+    n_mels: int = N_MELS,
+    padding: int = 0,
+    device: Optional[Union[str, torch.device]] = None,
+):
+    """
+    Compute the log-Mel spectrogram of
+    Parameters
+    ----------
+    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
+        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
+    n_mels: int
+        The number of Mel-frequency filters, only 80 is supported
+    padding: int
+        Number of zero samples to pad to the right
+    device: Optional[Union[str, torch.device]]
+        If given, the audio tensor is moved to this device before STFT
+    Returns
+    -------
+    torch.Tensor, shape = (80, n_frames)
+        A Tensor that contains the Mel spectrogram
+    """
+    if not torch.is_tensor(audio):
+        if isinstance(audio, str):
+            audio = load_audio(audio)
+        audio = torch.from_numpy(audio)
+    if device is not None:
+        audio = audio.to(device)
+    if padding > 0:
+        audio = F.pad(audio, (0, padding))
+    window = torch.hann_window(N_FFT).to(audio.device)
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+    filters = mel_filters(audio.device, n_mels)
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    return log_spec

whisper/whisper/decoding.py ADDED Viewed

	@@ -0,0 +1,821 @@

+from dataclasses import dataclass, field, replace
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.distributions import Categorical
+from .audio import CHUNK_LENGTH
+from .tokenizer import Tokenizer, get_tokenizer
+from .utils import compression_ratio
+if TYPE_CHECKING:
+    from .model import Whisper
+@torch.no_grad()
+def detect_language(
+    model: "Whisper", mel: Tensor, tokenizer: Tokenizer = None
+) -> Tuple[Tensor, List[dict]]:
+    """
+    Detect the spoken language in the audio, and return them as list of strings, along with the ids
+    of the most probable language tokens and the probability distribution over all language tokens.
+    This is performed outside the main decode loop in order to not interfere with kv-caching.
+    Returns
+    -------
+    language_tokens : Tensor, shape = (n_audio,)
+        ids of the most probable language tokens, which appears after the startoftranscript token.
+    language_probs : List[Dict[str, float]], length = n_audio
+        list of dictionaries containing the probability distribution over all languages.
+    """
+    if tokenizer is None:
+        tokenizer = get_tokenizer(model.is_multilingual)
+    if (
+        tokenizer.language is None
+        or tokenizer.language_token not in tokenizer.sot_sequence
+    ):
+        raise ValueError(
+            "This model doesn't have language tokens so it can't perform lang id"
+        )
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+    # skip encoder forward pass if already-encoded audio features were given
+    if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state):
+        mel = model.encoder(mel)
+    # forward pass using a single token, startoftranscript
+    n_audio = mel.shape[0]
+    x = torch.tensor([[tokenizer.sot]] * n_audio).to(mel.device)  # [n_audio, 1]
+    logits = model.logits(x, mel)[:, 0]
+    # collect detected languages; suppress all non-language tokens
+    mask = torch.ones(logits.shape[-1], dtype=torch.bool)
+    mask[list(tokenizer.all_language_tokens)] = False
+    logits[:, mask] = -np.inf
+    language_tokens = logits.argmax(dim=-1)
+    language_token_probs = logits.softmax(dim=-1).cpu()
+    language_probs = [
+        {
+            c: language_token_probs[i, j].item()
+            for j, c in zip(tokenizer.all_language_tokens, tokenizer.all_language_codes)
+        }
+        for i in range(n_audio)
+    ]
+    if single:
+        language_tokens = language_tokens[0]
+        language_probs = language_probs[0]
+    return language_tokens, language_probs
+@dataclass(frozen=True)
+class DecodingOptions:
+    """Immutable set of options read by `DecodingTask` to configure one decoding run."""
+    # whether to perform X->X "transcribe" or X->English "translate";
+    # `DecodingTask` additionally treats "lang_id" as "only detect the language"
+    task: str = "transcribe"
+    # language that the audio is in; uses detected language if None
+    language: Optional[str] = None
+    # sampling-related options
+    # temperature 0 selects the argmax token; any other value samples from the
+    # temperature-scaled distribution (see `GreedyDecoder.update`)
+    temperature: float = 0.0
+    sample_len: Optional[int] = None  # maximum number of tokens to sample
+    best_of: Optional[int] = None  # number of independent sample trajectories, if t > 0
+    beam_size: Optional[int] = None  # number of beams in beam search, if t == 0
+    patience: Optional[float] = None  # patience in beam search (arxiv:2204.05424)
+    # "alpha" in Google NMT, or None for length norm, when ranking generations
+    # to select which to return among the beams or best-of-N samples
+    length_penalty: Optional[float] = None
+    # text or tokens to feed as the prompt or the prefix; for more info:
+    # https://github.com/openai/whisper/discussions/117#discussioncomment-3727051
+    prompt: Optional[Union[str, List[int]]] = None  # for the previous context
+    prefix: Optional[Union[str, List[int]]] = None  # to prefix the current context
+    # list of tokens ids (or comma-separated token ids) to suppress
+    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
+    suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1"
+    suppress_blank: bool = True  # this will suppress blank outputs
+    # timestamp sampling options
+    without_timestamps: bool = False  # use <|notimestamps|> to sample text tokens only
+    # upper bound for the first sampled timestamp; `DecodingTask` converts it to a token
+    # index, and a falsy value (None or 0) leaves the first timestamp unbounded
+    max_initial_timestamp: Optional[float] = 1.0
+    # implementation details
+    # when True the input is cast to half precision before encoding and the audio
+    # features are expected to be float16 (float32 otherwise)
+    fp16: bool = True  # use fp16 for most of the calculation
+@dataclass(frozen=True)
+class DecodingResult:
+    """Immutable record of what `DecodingTask.run` produced for one audio input."""
+    # encoder output for this input
+    audio_features: Tensor
+    # language given in the options, or the most probable detected language
+    language: str
+    # per-language probabilities; only filled in by the "lang_id" task
+    language_probs: Optional[Dict[str, float]] = None
+    # sampled token ids between the initial tokens and the first EOT
+    tokens: List[int] = field(default_factory=list)
+    # `tokens` decoded by the tokenizer and stripped of surrounding whitespace
+    text: str = ""
+    # cumulative log probability divided by (number of tokens + 1)
+    avg_logprob: float = np.nan
+    # probability of the no-speech token read at the start-of-transcript position
+    no_speech_prob: float = np.nan
+    # sampling temperature the result was produced with
+    temperature: float = np.nan
+    # value returned by `compression_ratio(text)`
+    compression_ratio: float = np.nan
+class Inference:
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        """Perform a forward pass on the decoder and return per-token logits"""
+        raise NotImplementedError
+    def rearrange_kv_cache(self, source_indices) -> None:
+        """Update the key-value cache according to the updated beams"""
+        raise NotImplementedError
+    def cleanup_caching(self) -> None:
+        """Clean up any resources or hooks after decoding is finished"""
+        pass
+class PyTorchInference(Inference):
+    def __init__(self, model: "Whisper", initial_token_length: int):
+        self.model: "Whisper" = model
+        self.initial_token_length = initial_token_length
+        self.kv_cache = {}
+        self.hooks = []
+        key_modules = [block.attn.key for block in self.model.decoder.blocks]
+        value_modules = [block.attn.value for block in self.model.decoder.blocks]
+        self.kv_modules = key_modules + value_modules
+    def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
+        if not self.kv_cache:
+            self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
+        if tokens.shape[-1] > self.initial_token_length:
+            # only need to use the last token except in the first forward pass
+            tokens = tokens[:, -1:]
+        return self.model.decoder(tokens, audio_features, kv_cache=self.kv_cache)
+    def cleanup_caching(self):
+        for hook in self.hooks:
+            hook.remove()
+        self.kv_cache = {}
+        self.hooks = []
+    def rearrange_kv_cache(self, source_indices):
+        if source_indices != list(range(len(source_indices))):
+            for module in self.kv_modules:
+                # update the key/value cache to contain the selected sequences
+                self.kv_cache[module] = self.kv_cache[module][source_indices].detach()
+class SequenceRanker:
+    def rank(
+        self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]
+    ) -> List[int]:
+        """
+        Given a list of groups of samples and their cumulative log probabilities,
+        return the indices of the samples in each group to select as the final result
+        """
+        raise NotImplementedError
+class MaximumLikelihoodRanker(SequenceRanker):
+    """
+    Select the sample with the highest log probabilities, penalized using either
+    a simple length normalization or Google NMT paper's length penalty
+    """
+    def __init__(self, length_penalty: Optional[float]):
+        self.length_penalty = length_penalty
+    def rank(self, tokens: List[List[Tensor]], sum_logprobs: List[List[float]]):
+        def scores(logprobs, lengths):
+            result = []
+            for logprob, length in zip(logprobs, lengths):
+                if self.length_penalty is None:
+                    penalty = length
+                else:
+                    # from the Google NMT paper
+                    penalty = ((5 + length) / 6) ** self.length_penalty
+                result.append(logprob / penalty)
+            return result
+        # get the sequence with the highest score
+        lengths = [[len(t) for t in s] for s in tokens]
+        return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)]
+class TokenDecoder:
+    def reset(self):
+        """Initialize any stateful variables for decoding a new sequence"""
+    def update(
+        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Tensor, bool]:
+        """Specify how to select the next token, based on the current trace and logits
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        sum_logprobs : Tensor, shape = (n_batch)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
+            the tokens, appended with the selected next token
+        completed : bool
+            True if all sequences has reached the end of text
+        """
+        raise NotImplementedError
+    def finalize(
+        self, tokens: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
+        """Finalize search and return the final candidate sequences
+        Parameters
+        ----------
+        tokens : Tensor, shape = (n_audio, n_group, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence
+        sum_logprobs : Tensor, shape = (n_audio, n_group)
+            cumulative log probabilities for each sequence
+        Returns
+        -------
+        tokens : Sequence[Sequence[Tensor]], length = n_audio
+            sequence of Tensors containing candidate token sequences, for each audio input
+        sum_logprobs : List[List[float]], length = n_audio
+            sequence of cumulative log probabilities corresponding to the above
+        """
+        raise NotImplementedError
+class GreedyDecoder(TokenDecoder):
+    def __init__(self, temperature: float, eot: int):
+        self.temperature = temperature
+        self.eot = eot
+    def update(
+        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Tensor, bool]:
+        if self.temperature == 0:
+            next_tokens = logits.argmax(dim=-1)
+        else:
+            next_tokens = Categorical(logits=logits / self.temperature).sample()
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
+        sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
+        next_tokens[tokens[:, -1] == self.eot] = self.eot
+        tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
+        completed = (tokens[:, -1] == self.eot).all()
+        return tokens, completed
+    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
+        # make sure each sequence has at least one EOT token at the end
+        tokens = F.pad(tokens, (0, 1), value=self.eot)
+        return tokens, sum_logprobs.tolist()
+class BeamSearchDecoder(TokenDecoder):
+    """
+    Token decoder that keeps `beam_size` unfinished hypotheses per audio input and collects
+    hypotheses ending in EOT until `max_candidates` (round(beam_size * patience)) are stored.
+    """
+    def __init__(
+        self,
+        beam_size: int,
+        eot: int,
+        inference: Inference,
+        patience: Optional[float] = None,
+    ):
+        self.beam_size = beam_size
+        self.eot = eot
+        # kept so the kv cache can be reordered whenever the surviving beams change
+        self.inference = inference
+        # a falsy patience (None or 0) falls back to 1.0
+        self.patience = patience or 1.0
+        self.max_candidates: int = round(beam_size * self.patience)
+        # one dict per audio input mapping finished token tuples to their cumulative
+        # log probability; created lazily on the first update()
+        self.finished_sequences = None
+        assert (
+            self.max_candidates > 0
+        ), f"Invalid beam size ({beam_size}) or patience ({patience})"
+    def reset(self):
+        """Forget the finished sequences collected by a previous run."""
+        self.finished_sequences = None
+    def update(
+        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
+    ) -> Tuple[Tensor, bool]:
+        """
+        Advance every beam by one token.
+        Rows of `tokens` are grouped per audio input (`beam_size` consecutive rows each).
+        `sum_logprobs` is overwritten in-place with the scores of the surviving beams and
+        the inference kv cache is reordered to match them. Returns the new token tensor
+        and whether every audio input has `max_candidates` finished sequences.
+        Raises ValueError if the number of rows is not a multiple of `beam_size`.
+        """
+        if tokens.shape[0] % self.beam_size != 0:
+            raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")
+        n_audio = tokens.shape[0] // self.beam_size
+        if self.finished_sequences is None:  # for the first update
+            self.finished_sequences = [{} for _ in range(n_audio)]
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        next_tokens, source_indices, finished_sequences = [], [], []
+        for i in range(n_audio):
+            # keyed by the candidate token tuple: its score and the row it extends
+            scores, sources, finished = {}, {}, {}
+            # STEP 1: calculate the cumulative log probabilities for possible candidates
+            for j in range(self.beam_size):
+                idx = i * self.beam_size + j
+                prefix = tokens[idx].tolist()
+                # topk returns (values, indices); zip(*...) pairs each logprob with its token
+                for logprob, token in zip(*logprobs[idx].topk(self.beam_size + 1)):
+                    new_logprob = (sum_logprobs[idx] + logprob).item()
+                    sequence = tuple(prefix + [token.item()])
+                    scores[sequence] = new_logprob
+                    sources[sequence] = idx
+            # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
+            saved = 0
+            for sequence in sorted(scores, key=scores.get, reverse=True):
+                if sequence[-1] == self.eot:
+                    # candidates ending in EOT are set aside and do not use a beam slot
+                    finished[sequence] = scores[sequence]
+                else:
+                    # len(next_tokens) is the row this candidate will occupy in the new batch
+                    sum_logprobs[len(next_tokens)] = scores[sequence]
+                    next_tokens.append(sequence)
+                    source_indices.append(sources[sequence])
+                    saved += 1
+                    if saved == self.beam_size:
+                        break
+            finished_sequences.append(finished)
+        tokens = torch.tensor(next_tokens, device=tokens.device)
+        # make the cached keys/values follow the rows the new beams were extended from
+        self.inference.rearrange_kv_cache(source_indices)
+        # add newly finished sequences to self.finished_sequences
+        assert len(self.finished_sequences) == len(finished_sequences)
+        for previously_finished, newly_finished in zip(
+            self.finished_sequences, finished_sequences
+        ):
+            # best-scoring newly finished sequences are stored first
+            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
+                if len(previously_finished) >= self.max_candidates:
+                    break  # the candidate list is full
+                previously_finished[seq] = newly_finished[seq]
+        # mark as completed if all audio has enough number of samples
+        completed = all(
+            len(sequences) >= self.max_candidates
+            for sequences in self.finished_sequences
+        )
+        return tokens, completed
+    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
+        """
+        Return the finished sequences and their scores for every audio input.
+        Inputs with fewer than `beam_size` finished sequences are topped up with their
+        best unfinished beams, each terminated with an EOT token. `preceding_tokens` is
+        indexed as [audio, beam] and `sum_logprobs` as [audio][beam].
+        """
+        # collect all finished sequences, including patience, and add unfinished ones if not enough
+        sum_logprobs = sum_logprobs.cpu()
+        for i, sequences in enumerate(self.finished_sequences):
+            if (
+                len(sequences) < self.beam_size
+            ):  # when not enough sequences are finished
+                # visit the unfinished beams from highest to lowest cumulative log probability
+                for j in list(np.argsort(sum_logprobs[i]))[::-1]:
+                    sequence = preceding_tokens[i, j].tolist() + [self.eot]
+                    sequences[tuple(sequence)] = sum_logprobs[i][j].item()
+                    if len(sequences) >= self.beam_size:
+                        break
+        tokens: List[List[Tensor]] = [
+            [torch.tensor(seq) for seq in sequences.keys()]
+            for sequences in self.finished_sequences
+        ]
+        sum_logprobs: List[List[float]] = [
+            list(sequences.values()) for sequences in self.finished_sequences
+        ]
+        return tokens, sum_logprobs
+class LogitFilter:
+    def apply(self, logits: Tensor, tokens: Tensor) -> None:
+        """Apply any filtering or masking to logits in-place
+        Parameters
+        ----------
+        logits : Tensor, shape = (n_batch, vocab_size)
+            per-token logits of the probability distribution at the current step
+        tokens : Tensor, shape = (n_batch, current_sequence_length)
+            all tokens in the context so far, including the prefix and sot_sequence tokens
+        """
+        raise NotImplementedError
+class SuppressBlank(LogitFilter):
+    def __init__(self, tokenizer: Tokenizer, sample_begin: int):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+    def apply(self, logits: Tensor, tokens: Tensor):
+        if tokens.shape[1] == self.sample_begin:
+            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
+class SuppressTokens(LogitFilter):
+    def __init__(self, suppress_tokens: Sequence[int]):
+        self.suppress_tokens = list(suppress_tokens)
+    def apply(self, logits: Tensor, tokens: Tensor):
+        logits[:, self.suppress_tokens] = -np.inf
+class ApplyTimestampRules(LogitFilter):
+    """
+    Logit filter enforcing the timestamp-token rules during sampling: pairing of
+    timestamps, non-decreasing timestamps, a timestamp as the first sampled token
+    (optionally bounded), and preferring timestamps when their total probability wins.
+    """
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        sample_begin: int,
+        max_initial_timestamp_index: Optional[int],
+    ):
+        self.tokenizer = tokenizer
+        # column in `tokens` where sampled tokens start (everything before is the context)
+        self.sample_begin = sample_begin
+        # largest offset from `timestamp_begin` allowed for the first timestamp; None = no limit
+        self.max_initial_timestamp_index = max_initial_timestamp_index
+    def apply(self, logits: Tensor, tokens: Tensor):
+        """Mask `logits` in-place so that only tokens allowed by the timestamp rules remain."""
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        if self.tokenizer.no_timestamps is not None:
+            logits[:, self.tokenizer.no_timestamps] = -np.inf
+        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
+        for k in range(tokens.shape[0]):
+            sampled_tokens = tokens[k, self.sample_begin :]
+            seq = [t for t in sampled_tokens.tolist()]
+            # token ids >= timestamp_begin are treated as timestamp tokens
+            last_was_timestamp = (
+                len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
+            )
+            # with fewer than two sampled tokens the missing one counts as a timestamp
+            penultimate_was_timestamp = (
+                len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
+            )
+            if last_was_timestamp:
+                if penultimate_was_timestamp:  # has to be non-timestamp
+                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
+                else:  # cannot be normal text tokens
+                    logits[k, : self.tokenizer.eot] = -np.inf
+            # every timestamp token sampled so far in this sequence, in order
+            timestamps = sampled_tokens[
+                sampled_tokens.ge(self.tokenizer.timestamp_begin)
+            ]
+            if timestamps.numel() > 0:
+                # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last
+                # also force each segment to have a nonzero length, to prevent infinite looping
+                if last_was_timestamp and not penultimate_was_timestamp:
+                    # the last timestamp itself stays allowed (slice end is exclusive)
+                    timestamp_last = timestamps[-1]
+                else:
+                    # the last timestamp is masked too, so the next one must be larger
+                    timestamp_last = timestamps[-1] + 1
+                logits[k, self.tokenizer.timestamp_begin : timestamp_last] = -np.inf
+        # nothing has been sampled yet: the first sampled token must be a timestamp
+        if tokens.shape[1] == self.sample_begin:
+            # suppress generating non-timestamp tokens at the beginning
+            logits[:, : self.tokenizer.timestamp_begin] = -np.inf
+            # apply the `max_initial_timestamp` option
+            if self.max_initial_timestamp_index is not None:
+                last_allowed = (
+                    self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
+                )
+                logits[:, last_allowed + 1 :] = -np.inf
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        # (computed on the logits as masked above)
+        logprobs = F.log_softmax(logits.float(), dim=-1)
+        for k in range(tokens.shape[0]):
+            # log of the total probability mass on timestamp tokens
+            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(
+                dim=-1
+            )
+            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
+            if timestamp_logprob > max_text_token_logprob:
+                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
+class DecodingTask:
+    inference: Inference
+    sequence_ranker: SequenceRanker
+    decoder: TokenDecoder
+    logit_filters: List[LogitFilter]
+    def __init__(self, model: "Whisper", options: DecodingOptions):
+        self.model = model
+        language = options.language or "en"
+        tokenizer = get_tokenizer(
+            model.is_multilingual, language=language, task=options.task
+        )
+        self.tokenizer: Tokenizer = tokenizer
+        self.options: DecodingOptions = self._verify_options(options)
+        self.n_group: int = options.beam_size or options.best_of or 1
+        self.n_ctx: int = model.dims.n_text_ctx
+        self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2
+        self.sot_sequence: Tuple[int] = tokenizer.sot_sequence
+        if self.options.without_timestamps:
+            self.sot_sequence = tokenizer.sot_sequence_including_notimestamps
+        self.initial_tokens: Tuple[int] = self._get_initial_tokens()
+        self.sample_begin: int = len(self.initial_tokens)
+        self.sot_index: int = self.initial_tokens.index(tokenizer.sot)
+        # inference: implements the forward pass through the decoder, including kv caching
+        self.inference = PyTorchInference(model, len(self.initial_tokens))
+        # sequence ranker: implements how to rank a group of sampled sequences
+        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)
+        # decoder: implements how to select the next tokens, given the autoregressive distribution
+        if options.beam_size is not None:
+            self.decoder = BeamSearchDecoder(
+                options.beam_size, tokenizer.eot, self.inference, options.patience
+            )
+        else:
+            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)
+        # logit filters: applies various rules to suppress or penalize certain tokens
+        self.logit_filters = []
+        if self.options.suppress_blank:
+            self.logit_filters.append(SuppressBlank(self.tokenizer, self.sample_begin))
+        if self.options.suppress_tokens:
+            self.logit_filters.append(SuppressTokens(self._get_suppress_tokens()))
+        if not options.without_timestamps:
+            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
+            max_initial_timestamp_index = None
+            if options.max_initial_timestamp:
+                max_initial_timestamp_index = round(
+                    self.options.max_initial_timestamp / precision
+                )
+            self.logit_filters.append(
+                ApplyTimestampRules(
+                    tokenizer, self.sample_begin, max_initial_timestamp_index
+                )
+            )
+    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
+        if options.beam_size is not None and options.best_of is not None:
+            raise ValueError("beam_size and best_of can't be given together")
+        if options.temperature == 0:
+            if options.best_of is not None:
+                raise ValueError("best_of with greedy sampling (T=0) is not compatible")
+        if options.patience is not None and options.beam_size is None:
+            raise ValueError("patience requires beam_size to be given")
+        if options.length_penalty is not None and not (
+            0 <= options.length_penalty <= 1
+        ):
+            raise ValueError("length_penalty (alpha) should be a value between 0 and 1")
+        return options
+    def _get_initial_tokens(self) -> Tuple[int]:
+        tokens = list(self.sot_sequence)
+        if prefix := self.options.prefix:
+            prefix_tokens = (
+                self.tokenizer.encode(" " + prefix.strip())
+                if isinstance(prefix, str)
+                else prefix
+            )
+            if self.sample_len is not None:
+                max_prefix_len = self.n_ctx // 2 - self.sample_len
+                prefix_tokens = prefix_tokens[-max_prefix_len:]
+            tokens = tokens + prefix_tokens
+        if prompt := self.options.prompt:
+            prompt_tokens = (
+                self.tokenizer.encode(" " + prompt.strip())
+                if isinstance(prompt, str)
+                else prompt
+            )
+            tokens = (
+                [self.tokenizer.sot_prev]
+                + prompt_tokens[-(self.n_ctx // 2 - 1) :]
+                + tokens
+            )
+        return tuple(tokens)
+    def _get_suppress_tokens(self) -> Tuple[int]:
+        suppress_tokens = self.options.suppress_tokens
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
+        suppress_tokens.extend(
+            [
+                self.tokenizer.transcribe,
+                self.tokenizer.translate,
+                self.tokenizer.sot,
+                self.tokenizer.sot_prev,
+                self.tokenizer.sot_lm,
+            ]
+        )
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+        return tuple(sorted(set(suppress_tokens)))
+    def _get_audio_features(self, mel: Tensor):
+        if self.options.fp16:
+            mel = mel.half()
+        if mel.shape[-2:] == (
+            self.model.dims.n_audio_ctx,
+            self.model.dims.n_audio_state,
+        ):
+            # encoded audio features are given; skip audio encoding
+            audio_features = mel
+        else:
+            audio_features = self.model.encoder(mel)
+        if audio_features.dtype != (
+            torch.float16 if self.options.fp16 else torch.float32
+        ):
+            return TypeError(
+                f"audio_features has an incorrect dtype: {audio_features.dtype}"
+            )
+        return audio_features
+    def _detect_language(self, audio_features: Tensor, tokens: Tensor):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(
+                audio_features, self.tokenizer
+            )
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index + 1] = lang_tokens  # write language tokens
+        return languages, lang_probs
+    def _main_loop(self, audio_features: Tensor, tokens: Tensor):
+        n_batch = tokens.shape[0]
+        sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
+        no_speech_probs = [np.nan] * n_batch
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+                if (
+                    i == 0 and self.tokenizer.no_speech is not None
+                ):  # save no_speech_probs
+                    probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+                # apply the logit filters, e.g. for suppressing or applying penalty to
+                for logit_filter in self.logit_filters:
+                    logit_filter.apply(logits, tokens)
+                # expand the tokens tensor with the selected next tokens
+                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
+                if completed or tokens.shape[-1] > self.n_ctx:
+                    break
+        finally:
+            self.inference.cleanup_caching()
+        return tokens, sum_logprobs, no_speech_probs
+    @torch.no_grad()
+    def run(self, mel: Tensor) -> List[DecodingResult]:
+        self.decoder.reset()
+        tokenizer: Tokenizer = self.tokenizer
+        n_audio: int = mel.shape[0]
+        audio_features: Tensor = self._get_audio_features(mel)  # encoder forward pass
+        tokens: Tensor = torch.tensor([self.initial_tokens]).repeat(n_audio, 1)
+        # detect language if requested, overwriting the language token
+        languages, language_probs = self._detect_language(audio_features, tokens)
+        if self.options.task == "lang_id":
+            return [
+                DecodingResult(
+                    audio_features=features, language=language, language_probs=probs
+                )
+                for features, language, probs in zip(
+                    audio_features, languages, language_probs
+                )
+            ]
+        # repeat text tensors by the group size, for beam search or best-of-n sampling
+        tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
+        # call the main sampling loop
+        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
+        # reshape the tensors to have (n_audio, n_group) as the first two dimensions
+        audio_features = audio_features[:: self.n_group]
+        no_speech_probs = no_speech_probs[:: self.n_group]
+        assert audio_features.shape[0] == len(no_speech_probs) == n_audio
+        tokens = tokens.reshape(n_audio, self.n_group, -1)
+        sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)
+        # get the final candidates for each group, and slice between the first sampled token and EOT
+        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
+        tokens: List[List[Tensor]] = [
+            [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s]
+            for s in tokens
+        ]
+        # select the top-ranked sample in each group
+        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
+        tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
+        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]
+        sum_logprobs: List[float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
+        avg_logprobs: List[float] = [
+            lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)
+        ]
+        fields = (
+            texts,
+            languages,
+            tokens,
+            audio_features,
+            avg_logprobs,
+            no_speech_probs,
+        )
+        if len(set(map(len, fields))) != 1:
+            raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")
+        return [
+            DecodingResult(
+                audio_features=features,
+                language=language,
+                tokens=tokens,
+                text=text,
+                avg_logprob=avg_logprob,
+                no_speech_prob=no_speech_prob,
+                temperature=self.options.temperature,
+                compression_ratio=compression_ratio(text),
+            )
+            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(
+                *fields
+            )
+        ]
+@torch.no_grad()
+def decode(
+    model: "Whisper",
+    mel: Tensor,
+    options: DecodingOptions = DecodingOptions(),
+    **kwargs,
+) -> Union[DecodingResult, List[DecodingResult]]:
+    """
+    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
+    Parameters
+    ----------
+    model: Whisper
+        the Whisper model instance
+    mel: torch.Tensor, shape = (80, 3000) or (*, 80, 3000)
+        A tensor containing the Mel spectrogram(s)
+    options: DecodingOptions
+        A dataclass that contains all necessary options for decoding 30-second segments
+    Returns
+    -------
+    result: Union[DecodingResult, List[DecodingResult]]
+        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
+    """
+    if single := mel.ndim == 2:
+        mel = mel.unsqueeze(0)
+    if kwargs:
+        options = replace(options, **kwargs)
+    result = DecodingTask(model, options).run(mel)
+    return result[0] if single else result

whisper/whisper/model.py ADDED Viewed

	@@ -0,0 +1,309 @@

+import base64
+import gzip
+from dataclasses import dataclass
+from typing import Dict, Iterable, Optional
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from .decoding import decode as decode_function
+from .decoding import detect_language as detect_language_function
+from .transcribe import transcribe as transcribe_function
+@dataclass
+class ModelDimensions:
+    """Size hyper-parameters of a model (audio side and text side)."""
+    # NOTE(review): the per-field notes below are inferred from the field names
+    # only; confirm against the code that consumes these dimensions
+    n_mels: int  # presumably the number of Mel bins of the input spectrogram
+    n_audio_ctx: int  # presumably the number of positions in the encoded audio
+    n_audio_state: int  # presumably the width of the audio encoder
+    n_audio_head: int  # presumably the number of attention heads in the encoder
+    n_audio_layer: int  # presumably the number of encoder blocks
+    n_vocab: int  # presumably the vocabulary size
+    n_text_ctx: int  # presumably the maximum number of text positions
+    n_text_state: int  # presumably the width of the text decoder
+    n_text_head: int  # presumably the number of attention heads in the decoder
+    n_text_layer: int  # presumably the number of decoder blocks
+class LayerNorm(nn.LayerNorm):
+    def forward(self, x: Tensor) -> Tensor:
+        return super().forward(x.float()).type(x.dtype)
+class Linear(nn.Linear):
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(
+            x,
+            self.weight.to(x.dtype),
+            None if self.bias is None else self.bias.to(x.dtype),
+        )
+class Conv1d(nn.Conv1d):
+    def _conv_forward(
+        self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
+    ) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
+    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention used for self-attention (`xa` is None) or cross-attention."""
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        # the key projection is the only one created without a bias term
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        """
+        Attend from `x` to itself, or to `xa` when `xa` is given.
+        `kv_cache`, when given, is a dict keyed by the `key` / `value` projection
+        modules. Cached tensors are read directly only when a cache is supplied,
+        `xa` is supplied, and `self.key` is already present in the cache; in every
+        other case the projections are computed here.
+        Returns a tuple of the output-projected attention result and the detached
+        pre-softmax attention scores.
+        """
+        q = self.query(x)
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors;
+            # otherwise, perform key/value projections for self- or cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+        else:
+            # for cross-attention, calculate keys and values once and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+        wv, qk = self.qkv_attention(q, k, v, mask)
+        return self.out(wv), qk
+    def qkv_attention(
+        self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
+    ):
+        """
+        Compute softmax attention over already-projected `q`, `k` and `v`.
+        Returns the weighted values with the heads merged back into the last
+        dimension, and the float32 pre-softmax scores (detached).
+        """
+        n_batch, n_ctx, n_state = q.shape
+        # head_dim ** -0.25 is applied to both q and k, so q @ k is scaled by head_dim ** -0.5
+        scale = (n_state // self.n_head) ** -0.25
+        # split the last dimension into heads; k is laid out pre-transposed for the matmul
+        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+        qk = q @ k
+        if mask is not None:
+            # only the top-left n_ctx x n_ctx corner of the mask is added
+            qk = qk + mask[:n_ctx, :n_ctx]
+        # softmax runs in float32; the weights are cast back to q's dtype afterwards
+        qk = qk.float()
+        w = F.softmax(qk, dim=-1).to(q.dtype)
+        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+class ResidualAttentionBlock(nn.Module):
+    """
+    Transformer block: self-attention, optional cross-attention, then an MLP.
+    Each sub-layer is applied to a layer-normalized input and added back to `x`.
+    """
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__()
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.attn_ln = LayerNorm(n_state)
+        # the cross-attention sub-layer and its norm exist only when requested
+        self.cross_attn = (
+            MultiHeadAttention(n_state, n_head) if cross_attention else None
+        )
+        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+        # the MLP hidden width is four times the model width
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(
+            Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+        )
+        self.mlp_ln = LayerNorm(n_state)
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        """
+        Apply the block to `x` and return the updated tensor.
+        `mask` is passed to the self-attention only; `xa` is passed to the
+        cross-attention only. `kv_cache` is forwarded to both.
+        """
+        # [0] keeps the attention output and drops the returned scores
+        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
+        if self.cross_attn:
+            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0]
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+class AudioEncoder(nn.Module):
+    """Two-convolution stem followed by a stack of residual self-attention blocks."""
+    def __init__(
+        self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+    ):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        # stride 2 shortens the time axis before the attention blocks
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+        # sinusoidal table registered as a buffer, so it is not a trainable parameter
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+        # encoder blocks are built without cross-attention
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+        )
+        self.ln_post = LayerNorm(n_state)
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+            the mel spectrogram of the audio
+        NOTE(review): because `conv2` has stride 2, the input's last dimension is
+        not this encoder's `n_ctx`; it is the length *after* the convolutions that
+        must equal the positional embedding's length (asserted below).
+        """
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+        # move channels last: (batch, channels, time) -> (batch, time, channels)
+        x = x.permute(0, 2, 1)
+        assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"
+        # keep x's dtype after adding the positional table
+        x = (x + self.positional_embedding).to(x.dtype)
+        for block in self.blocks:
+            x = block(x)
+        x = self.ln_post(x)
+        return x
+class TextDecoder(nn.Module):
+    """Token decoder: embeddings, cross-attending residual blocks, tied output projection."""
+    def __init__(
+        self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+    ):
+        super().__init__()
+        self.token_embedding = nn.Embedding(n_vocab, n_state)
+        # learned positional table; created uninitialized here
+        # (presumably filled when weights are loaded -- confirm against the loader)
+        self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state))
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [
+                ResidualAttentionBlock(n_state, n_head, cross_attention=True)
+                for _ in range(n_layer)
+            ]
+        )
+        self.ln = LayerNorm(n_state)
+        # causal mask: -inf strictly above the diagonal, 0 elsewhere
+        mask = torch.empty(n_ctx, n_ctx).fill_(-np.inf).triu_(1)
+        # persistent=False keeps the mask out of the module's state_dict
+        self.register_buffer("mask", mask, persistent=False)
+    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
+        """
+        x : torch.LongTensor, shape = (batch_size, <= n_ctx)
+            the text tokens
+        xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state)
+            the encoded audio features to be attended on
+        kv_cache : optional dict forwarded to every block; when it is non-empty,
+            the positional embedding is read starting at the cached length
+        Returns the float32 logits over the vocabulary for each input position.
+        """
+        # number of positions already cached, read from dim 1 of the first cached tensor
+        offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0
+        x = (
+            self.token_embedding(x)
+            + self.positional_embedding[offset : offset + x.shape[-1]]
+        )
+        # run the blocks in the dtype of the audio features
+        x = x.to(xa.dtype)
+        for block in self.blocks:
+            x = block(x, xa, mask=self.mask, kv_cache=kv_cache)
+        x = self.ln(x)
+        # output projection reuses (transposed) token embedding weights
+        logits = (
+            x @ torch.transpose(self.token_embedding.weight.to(x.dtype), 0, 1)
+        ).float()
+        return logits
+class Whisper(nn.Module):
+    """Encoder-decoder model combining an `AudioEncoder` and a `TextDecoder`."""
+    def __init__(self, dims: ModelDimensions):
+        super().__init__()
+        self.dims = dims
+        self.encoder = AudioEncoder(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer,
+        )
+        self.decoder = TextDecoder(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer,
+        )
+        # use the last half layers for alignment by default; see `set_alignment_heads()` below
+        all_heads = torch.zeros(
+            self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool
+        )
+        all_heads[self.dims.n_text_layer // 2 :] = True
+        # stored sparse and non-persistent, so it is not part of the state_dict
+        self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False)
+    def set_alignment_heads(self, dump: bytes):
+        """
+        Replace the `alignment_heads` buffer from a serialized mask.
+        `dump` is base85 text wrapping gzip-compressed bytes that are read as a
+        boolean array and reshaped to (n_text_layer, n_text_head).
+        """
+        # .copy() because np.frombuffer returns a read-only view of the bytes
+        array = np.frombuffer(
+            gzip.decompress(base64.b85decode(dump)), dtype=bool
+        ).copy()
+        mask = torch.from_numpy(array).reshape(
+            self.dims.n_text_layer, self.dims.n_text_head
+        )
+        self.register_buffer("alignment_heads", mask.to_sparse(), persistent=False)
+    def embed_audio(self, mel: torch.Tensor):
+        """Run only the encoder on `mel` and return its output."""
+        return self.encoder(mel)
+    def logits(self, tokens: torch.Tensor, audio_features: torch.Tensor):
+        """Run only the decoder on `tokens`, attending to already-encoded audio."""
+        return self.decoder(tokens, audio_features)
+    # NOTE(review): the return annotation says Dict[str, Tensor], but the decoder's
+    # forward returns its logits tensor directly -- the annotation looks stale.
+    def forward(
+        self, mel: torch.Tensor, tokens: torch.Tensor
+    ) -> Dict[str, torch.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+    @property
+    def device(self):
+        # device of the first registered parameter
+        return next(self.parameters()).device
+    @property
+    def is_multilingual(self):
+        # True only for this exact vocabulary size
+        return self.dims.n_vocab == 51865
+    def install_kv_cache_hooks(self, cache: Optional[dict] = None):
+        """
+        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+        tensors calculated for the previous positions. This method returns a dictionary that stores
+        all caches, and the necessary hooks for the key and value projection modules that save the
+        intermediate tensors to be reused during later calculations.
+        Returns
+        -------
+        cache : Dict[nn.Module, torch.Tensor]
+            A dictionary object mapping the key/value projection modules to its cache
+        hooks : List[RemovableHandle]
+            List of PyTorch RemovableHandle objects to stop the hooks to be called
+        """
+        # shallow copy, so the caller's dict object is never mutated
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[1] > self.dims.n_text_ctx:
+                # save as-is, for the first token or cross attention
+                cache[module] = output
+            else:
+                # append the new positions along dim 1 to what is already cached
+                cache[module] = torch.cat([cache[module], output], dim=1).detach()
+            # a forward hook's non-None return value replaces the module's output
+            return cache[module]
+        def install_hooks(layer: nn.Module):
+            # hook only the key and value projections of attention layers
+            if isinstance(layer, MultiHeadAttention):
+                hooks.append(layer.key.register_forward_hook(save_to_cache))
+                hooks.append(layer.value.register_forward_hook(save_to_cache))
+        # hooks are installed on the decoder only; the encoder is left untouched
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+    # functions imported from sibling modules, exposed as methods of the model
+    detect_language = detect_language_function
+    transcribe = transcribe_function
+    decode = decode_function

whisper/whisper/normalizers/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .basic import BasicTextNormalizer as BasicTextNormalizer
2	+ from .english import EnglishTextNormalizer as EnglishTextNormalizer

whisper/whisper/normalizers/basic.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import re
+import unicodedata
+import regex
+# Manual ASCII replacements for non-ASCII letters that "NFKD" normalization does
+# not split into a base letter plus a combining mark; consulted by
+# `remove_symbols_and_diacritics`.
+# NOTE(review): "Þ" maps to lowercase "th" while every other uppercase key maps
+# to an uppercase replacement -- confirm whether that is intended.
+ADDITIONAL_DIACRITICS = {
+    "œ": "oe",
+    "Œ": "OE",
+    "ø": "o",
+    "Ø": "O",
+    "æ": "ae",
+    "Æ": "AE",
+    "ß": "ss",
+    "ẞ": "SS",
+    "đ": "d",
+    "Đ": "D",
+    "ð": "d",
+    "Ð": "D",
+    "þ": "th",
+    "Þ": "th",
+    "ł": "l",
+    "Ł": "L",
+}
+def remove_symbols_and_diacritics(s: str, keep=""):
+    """
+    Replace any other markers, symbols, and punctuations with a space,
+    and drop any diacritics (category 'Mn' and some manual mappings)
+    The input is NFKD-normalized first. Each resulting character is then handled
+    by the first rule that matches, in this order: characters found in `keep`
+    pass through unchanged; keys of `ADDITIONAL_DIACRITICS` are replaced by their
+    mapped text; category 'Mn' characters are removed; characters whose category
+    starts with 'M', 'S' or 'P' become a space; everything else is unchanged.
+    """
+    # chained conditional expressions: the rules are tried top to bottom
+    return "".join(
+        c
+        if c in keep
+        else ADDITIONAL_DIACRITICS[c]
+        if c in ADDITIONAL_DIACRITICS
+        else ""
+        if unicodedata.category(c) == "Mn"
+        else " "
+        if unicodedata.category(c)[0] in "MSP"
+        else c
+        for c in unicodedata.normalize("NFKD", s)
+    )
+def remove_symbols(s: str):
+    """
+    Replace any other markers, symbols, punctuations with a space, keeping diacritics
+    The input is NFKC-normalized first; every character whose Unicode category
+    starts with 'M', 'S' or 'P' is then replaced by a single space.
+    """
+    return "".join(
+        " " if unicodedata.category(c)[0] in "MSP" else c
+        for c in unicodedata.normalize("NFKC", s)
+    )
+class BasicTextNormalizer:
+    """
+    Callable text normalizer: lowercases, drops bracketed and parenthesized spans,
+    replaces symbols with spaces and collapses runs of whitespace.
+    """
+    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
+        # pick the cleaning function once, at construction time
+        self.clean = (
+            remove_symbols_and_diacritics if remove_diacritics else remove_symbols
+        )
+        self.split_letters = split_letters
+    def __call__(self, s: str):
+        """
+        Return the normalized form of `s`.
+        The result is not stripped, so a single leading or trailing space can remain.
+        """
+        s = s.lower()
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        # lowercased again because the cleaning step re-normalizes the text
+        s = self.clean(s).lower()
+        if self.split_letters:
+            # \X (third-party `regex` module) matches one extended grapheme cluster,
+            # so every cluster ends up separated from its neighbours by a space
+            s = " ".join(regex.findall(r"\X", s, regex.U))
+        s = re.sub(
+            r"\s+", " ", s
+        )  # replace any successive whitespace characters with a space
+        return s

whisper/whisper/normalizers/english.json ADDED Viewed

	@@ -0,0 +1,1741 @@

+{
+    "accessorise": "accessorize",
+    "accessorised": "accessorized",
+    "accessorises": "accessorizes",
+    "accessorising": "accessorizing",
+    "acclimatisation": "acclimatization",
+    "acclimatise": "acclimatize",
+    "acclimatised": "acclimatized",
+    "acclimatises": "acclimatizes",
+    "acclimatising": "acclimatizing",
+    "accoutrements": "accouterments",
+    "aeon": "eon",
+    "aeons": "eons",
+    "aerogramme": "aerogram",
+    "aerogrammes": "aerograms",
+    "aeroplane": "airplane",
+    "aeroplanes": "airplanes",
+    "aesthete": "esthete",
+    "aesthetes": "esthetes",
+    "aesthetic": "esthetic",
+    "aesthetically": "esthetically",
+    "aesthetics": "esthetics",
+    "aetiology": "etiology",
+    "ageing": "aging",
+    "aggrandisement": "aggrandizement",
+    "agonise": "agonize",
+    "agonised": "agonized",
+    "agonises": "agonizes",
+    "agonising": "agonizing",
+    "agonisingly": "agonizingly",
+    "almanack": "almanac",
+    "almanacks": "almanacs",
+    "aluminium": "aluminum",
+    "amortisable": "amortizable",
+    "amortisation": "amortization",
+    "amortisations": "amortizations",
+    "amortise": "amortize",
+    "amortised": "amortized",
+    "amortises": "amortizes",
+    "amortising": "amortizing",
+    "amphitheatre": "amphitheater",
+    "amphitheatres": "amphitheaters",
+    "anaemia": "anemia",
+    "anaemic": "anemic",
+    "anaesthesia": "anesthesia",
+    "anaesthetic": "anesthetic",
+    "anaesthetics": "anesthetics",
+    "anaesthetise": "anesthetize",
+    "anaesthetised": "anesthetized",
+    "anaesthetises": "anesthetizes",
+    "anaesthetising": "anesthetizing",
+    "anaesthetist": "anesthetist",
+    "anaesthetists": "anesthetists",
+    "anaesthetize": "anesthetize",
+    "anaesthetized": "anesthetized",
+    "anaesthetizes": "anesthetizes",
+    "anaesthetizing": "anesthetizing",
+    "analogue": "analog",
+    "analogues": "analogs",
+    "analyse": "analyze",
+    "analysed": "analyzed",
+    "analyses": "analyzes",
+    "analysing": "analyzing",
+    "anglicise": "anglicize",
+    "anglicised": "anglicized",
+    "anglicises": "anglicizes",
+    "anglicising": "anglicizing",
+    "annualised": "annualized",
+    "antagonise": "antagonize",
+    "antagonised": "antagonized",
+    "antagonises": "antagonizes",
+    "antagonising": "antagonizing",
+    "apologise": "apologize",
+    "apologised": "apologized",
+    "apologises": "apologizes",
+    "apologising": "apologizing",
+    "appal": "appall",
+    "appals": "appalls",
+    "appetiser": "appetizer",
+    "appetisers": "appetizers",
+    "appetising": "appetizing",
+    "appetisingly": "appetizingly",
+    "arbour": "arbor",
+    "arbours": "arbors",
+    "archeological": "archaeological",
+    "archaeologically": "archeologically",
+    "archaeologist": "archeologist",
+    "archaeologists": "archeologists",
+    "archaeology": "archeology",
+    "ardour": "ardor",
+    "armour": "armor",
+    "armoured": "armored",
+    "armourer": "armorer",
+    "armourers": "armorers",
+    "armouries": "armories",
+    "armoury": "armory",
+    "artefact": "artifact",
+    "artefacts": "artifacts",
+    "authorise": "authorize",
+    "authorised": "authorized",
+    "authorises": "authorizes",
+    "authorising": "authorizing",
+    "axe": "ax",
+    "backpedalled": "backpedaled",
+    "backpedalling": "backpedaling",
+    "bannister": "banister",
+    "bannisters": "banisters",
+    "baptise": "baptize",
+    "baptised": "baptized",
+    "baptises": "baptizes",
+    "baptising": "baptizing",
+    "bastardise": "bastardize",
+    "bastardised": "bastardized",
+    "bastardises": "bastardizes",
+    "bastardising": "bastardizing",
+    "battleax": "battleaxe",
+    "baulk": "balk",
+    "baulked": "balked",
+    "baulking": "balking",
+    "baulks": "balks",
+    "bedevilled": "bedeviled",
+    "bedevilling": "bedeviling",
+    "behaviour": "behavior",
+    "behavioural": "behavioral",
+    "behaviourism": "behaviorism",
+    "behaviourist": "behaviorist",
+    "behaviourists": "behaviorists",
+    "behaviours": "behaviors",
+    "behove": "behoove",
+    "behoved": "behooved",
+    "behoves": "behooves",
+    "bejewelled": "bejeweled",
+    "belabour": "belabor",
+    "belaboured": "belabored",
+    "belabouring": "belaboring",
+    "belabours": "belabors",
+    "bevelled": "beveled",
+    "bevvies": "bevies",
+    "bevvy": "bevy",
+    "biassed": "biased",
+    "biassing": "biasing",
+    "bingeing": "binging",
+    "bougainvillaea": "bougainvillea",
+    "bougainvillaeas": "bougainvilleas",
+    "bowdlerise": "bowdlerize",
+    "bowdlerised": "bowdlerized",
+    "bowdlerises": "bowdlerizes",
+    "bowdlerising": "bowdlerizing",
+    "breathalyse": "breathalyze",
+    "breathalysed": "breathalyzed",
+    "breathalyser": "breathalyzer",
+    "breathalysers": "breathalyzers",
+    "breathalyses": "breathalyzes",
+    "breathalysing": "breathalyzing",
+    "brutalise": "brutalize",
+    "brutalised": "brutalized",
+    "brutalises": "brutalizes",
+    "brutalising": "brutalizing",
+    "busses": "buses",
+    "bussing": "busing",
+    "caesarean": "cesarean",
+    "caesareans": "cesareans",
+    "calibre": "caliber",
+    "calibres": "calibers",
+    "calliper": "caliper",
+    "callipers": "calipers",
+    "callisthenics": "calisthenics",
+    "canalise": "canalize",
+    "canalised": "canalized",
+    "canalises": "canalizes",
+    "canalising": "canalizing",
+    "cancelation": "cancellation",
+    "cancelations": "cancellations",
+    "cancelled": "canceled",
+    "cancelling": "canceling",
+    "candour": "candor",
+    "cannibalise": "cannibalize",
+    "cannibalised": "cannibalized",
+    "cannibalises": "cannibalizes",
+    "cannibalising": "cannibalizing",
+    "canonise": "canonize",
+    "canonised": "canonized",
+    "canonises": "canonizes",
+    "canonising": "canonizing",
+    "capitalise": "capitalize",
+    "capitalised": "capitalized",
+    "capitalises": "capitalizes",
+    "capitalising": "capitalizing",
+    "caramelise": "caramelize",
+    "caramelised": "caramelized",
+    "caramelises": "caramelizes",
+    "caramelising": "caramelizing",
+    "carbonise": "carbonize",
+    "carbonised": "carbonized",
+    "carbonises": "carbonizes",
+    "carbonising": "carbonizing",
+    "carolled": "caroled",
+    "carolling": "caroling",
+    "catalogue": "catalog",
+    "catalogued": "cataloged",
+    "catalogues": "catalogs",
+    "cataloguing": "cataloging",
+    "catalyse": "catalyze",
+    "catalysed": "catalyzed",
+    "catalyses": "catalyzes",
+    "catalysing": "catalyzing",
+    "categorise": "categorize",
+    "categorised": "categorized",
+    "categorises": "categorizes",
+    "categorising": "categorizing",
+    "cauterise": "cauterize",
+    "cauterised": "cauterized",
+    "cauterises": "cauterizes",
+    "cauterising": "cauterizing",
+    "cavilled": "caviled",
+    "cavilling": "caviling",
+    "centigramme": "centigram",
+    "centigrammes": "centigrams",
+    "centilitre": "centiliter",
+    "centilitres": "centiliters",
+    "centimetre": "centimeter",
+    "centimetres": "centimeters",
+    "centralise": "centralize",
+    "centralised": "centralized",
+    "centralises": "centralizes",
+    "centralising": "centralizing",
+    "centre": "center",
+    "centred": "centered",
+    "centrefold": "centerfold",
+    "centrefolds": "centerfolds",
+    "centrepiece": "centerpiece",
+    "centrepieces": "centerpieces",
+    "centres": "centers",
+    "channelled": "channeled",
+    "channelling": "channeling",
+    "characterise": "characterize",
+    "characterised": "characterized",
+    "characterises": "characterizes",
+    "characterising": "characterizing",
+    "cheque": "check",
+    "chequebook": "checkbook",
+    "chequebooks": "checkbooks",
+    "chequered": "checkered",
+    "cheques": "checks",
+    "chilli": "chili",
+    "chimaera": "chimera",
+    "chimaeras": "chimeras",
+    "chiselled": "chiseled",
+    "chiselling": "chiseling",
+    "circularise": "circularize",
+    "circularised": "circularized",
+    "circularises": "circularizes",
+    "circularising": "circularizing",
+    "civilise": "civilize",
+    "civilised": "civilized",
+    "civilises": "civilizes",
+    "civilising": "civilizing",
+    "clamour": "clamor",
+    "clamoured": "clamored",
+    "clamouring": "clamoring",
+    "clamours": "clamors",
+    "clangour": "clangor",
+    "clarinettist": "clarinetist",
+    "clarinettists": "clarinetists",
+    "collectivise": "collectivize",
+    "collectivised": "collectivized",
+    "collectivises": "collectivizes",
+    "collectivising": "collectivizing",
+    "colonisation": "colonization",
+    "colonise": "colonize",
+    "colonised": "colonized",
+    "coloniser": "colonizer",
+    "colonisers": "colonizers",
+    "colonises": "colonizes",
+    "colonising": "colonizing",
+    "colour": "color",
+    "colourant": "colorant",
+    "colourants": "colorants",
+    "coloured": "colored",
+    "coloureds": "coloreds",
+    "colourful": "colorful",
+    "colourfully": "colorfully",
+    "colouring": "coloring",
+    "colourize": "colorize",
+    "colourized": "colorized",
+    "colourizes": "colorizes",
+    "colourizing": "colorizing",
+    "colourless": "colorless",
+    "colours": "colors",
+    "commercialise": "commercialize",
+    "commercialised": "commercialized",
+    "commercialises": "commercializes",
+    "commercialising": "commercializing",
+    "compartmentalise": "compartmentalize",
+    "compartmentalised": "compartmentalized",
+    "compartmentalises": "compartmentalizes",
+    "compartmentalising": "compartmentalizing",
+    "computerise": "computerize",
+    "computerised": "computerized",
+    "computerises": "computerizes",
+    "computerising": "computerizing",
+    "conceptualise": "conceptualize",
+    "conceptualised": "conceptualized",
+    "conceptualises": "conceptualizes",
+    "conceptualising": "conceptualizing",
+    "connexion": "connection",
+    "connexions": "connections",
+    "contextualise": "contextualize",
+    "contextualised": "contextualized",
+    "contextualises": "contextualizes",
+    "contextualising": "contextualizing",
+    "cosier": "cozier",
+    "cosies": "cozies",
+    "cosiest": "coziest",
+    "cosily": "cozily",
+    "cosiness": "coziness",
+    "cosy": "cozy",
+    "councillor": "councilor",
+    "councillors": "councilors",
+    "counselled": "counseled",
+    "counselling": "counseling",
+    "counsellor": "counselor",
+    "counsellors": "counselors",
+    "crenelated": "crenellated",
+    "criminalise": "criminalize",
+    "criminalised": "criminalized",
+    "criminalises": "criminalizes",
+    "criminalising": "criminalizing",
+    "criticise": "criticize",
+    "criticised": "criticized",
+    "criticises": "criticizes",
+    "criticising": "criticizing",
+    "crueller": "crueler",
+    "cruellest": "cruelest",
+    "crystallisation": "crystallization",
+    "crystallise": "crystallize",
+    "crystallised": "crystallized",
+    "crystallises": "crystallizes",
+    "crystallising": "crystallizing",
+    "cudgelled": "cudgeled",
+    "cudgelling": "cudgeling",
+    "customise": "customize",
+    "customised": "customized",
+    "customises": "customizes",
+    "customising": "customizing",
+    "cypher": "cipher",
+    "cyphers": "ciphers",
+    "decentralisation": "decentralization",
+    "decentralise": "decentralize",
+    "decentralised": "decentralized",
+    "decentralises": "decentralizes",
+    "decentralising": "decentralizing",
+    "decriminalisation": "decriminalization",
+    "decriminalise": "decriminalize",
+    "decriminalised": "decriminalized",
+    "decriminalises": "decriminalizes",
+    "decriminalising": "decriminalizing",
+    "defence": "defense",
+    "defenceless": "defenseless",
+    "defences": "defenses",
+    "dehumanisation": "dehumanization",
+    "dehumanise": "dehumanize",
+    "dehumanised": "dehumanized",
+    "dehumanises": "dehumanizes",
+    "dehumanising": "dehumanizing",
+    "demeanour": "demeanor",
+    "demilitarisation": "demilitarization",
+    "demilitarise": "demilitarize",
+    "demilitarised": "demilitarized",
+    "demilitarises": "demilitarizes",
+    "demilitarising": "demilitarizing",
+    "demobilisation": "demobilization",
+    "demobilise": "demobilize",
+    "demobilised": "demobilized",
+    "demobilises": "demobilizes",
+    "demobilising": "demobilizing",
+    "democratisation": "democratization",
+    "democratise": "democratize",
+    "democratised": "democratized",
+    "democratises": "democratizes",
+    "democratising": "democratizing",
+    "demonise": "demonize",
+    "demonised": "demonized",
+    "demonises": "demonizes",
+    "demonising": "demonizing",
+    "demoralisation": "demoralization",
+    "demoralise": "demoralize",
+    "demoralised": "demoralized",
+    "demoralises": "demoralizes",
+    "demoralising": "demoralizing",
+    "denationalisation": "denationalization",
+    "denationalise": "denationalize",
+    "denationalised": "denationalized",
+    "denationalises": "denationalizes",
+    "denationalising": "denationalizing",
+    "deodorise": "deodorize",
+    "deodorised": "deodorized",
+    "deodorises": "deodorizes",
+    "deodorising": "deodorizing",
+    "depersonalise": "depersonalize",
+    "depersonalised": "depersonalized",
+    "depersonalises": "depersonalizes",
+    "depersonalising": "depersonalizing",
+    "deputise": "deputize",
+    "deputised": "deputized",
+    "deputises": "deputizes",
+    "deputising": "deputizing",
+    "desensitisation": "desensitization",
+    "desensitise": "desensitize",
+    "desensitised": "desensitized",
+    "desensitises": "desensitizes",
+    "desensitising": "desensitizing",
+    "destabilisation": "destabilization",
+    "destabilise": "destabilize",
+    "destabilised": "destabilized",
+    "destabilises": "destabilizes",
+    "destabilising": "destabilizing",
+    "dialled": "dialed",
+    "dialling": "dialing",
+    "dialogue": "dialog",
+    "dialogues": "dialogs",
+    "diarrhoea": "diarrhea",
+    "digitise": "digitize",
+    "digitised": "digitized",
+    "digitises": "digitizes",
+    "digitising": "digitizing",
+    "disc": "disk",
+    "discolour": "discolor",
+    "discoloured": "discolored",
+    "discolouring": "discoloring",
+    "discolours": "discolors",
+    "discs": "disks",
+    "disembowelled": "disemboweled",
+    "disembowelling": "disemboweling",
+    "disfavour": "disfavor",
+    "dishevelled": "disheveled",
+    "dishonour": "dishonor",
+    "dishonourable": "dishonorable",
+    "dishonourably": "dishonorably",
+    "dishonoured": "dishonored",
+    "dishonouring": "dishonoring",
+    "dishonours": "dishonors",
+    "disorganisation": "disorganization",
+    "disorganised": "disorganized",
+    "distil": "distill",
+    "distils": "distills",
+    "dramatisation": "dramatization",
+    "dramatisations": "dramatizations",
+    "dramatise": "dramatize",
+    "dramatised": "dramatized",
+    "dramatises": "dramatizes",
+    "dramatising": "dramatizing",
+    "draught": "draft",
+    "draughtboard": "draftboard",
+    "draughtboards": "draftboards",
+    "draughtier": "draftier",
+    "draughtiest": "draftiest",
+    "draughts": "drafts",
+    "draughtsman": "draftsman",
+    "draughtsmanship": "draftsmanship",
+    "draughtsmen": "draftsmen",
+    "draughtswoman": "draftswoman",
+    "draughtswomen": "draftswomen",
+    "draughty": "drafty",
+    "drivelled": "driveled",
+    "drivelling": "driveling",
+    "duelled": "dueled",
+    "duelling": "dueling",
+    "economise": "economize",
+    "economised": "economized",
+    "economises": "economizes",
+    "economising": "economizing",
+    "edoema": "edema",
+    "editorialise": "editorialize",
+    "editorialised": "editorialized",
+    "editorialises": "editorializes",
+    "editorialising": "editorializing",
+    "empathise": "empathize",
+    "empathised": "empathized",
+    "empathises": "empathizes",
+    "empathising": "empathizing",
+    "emphasise": "emphasize",
+    "emphasised": "emphasized",
+    "emphasises": "emphasizes",
+    "emphasising": "emphasizing",
+    "enamelled": "enameled",
+    "enamelling": "enameling",
+    "enamoured": "enamored",
+    "encyclopaedia": "encyclopedia",
+    "encyclopaedias": "encyclopedias",
+    "encyclopaedic": "encyclopedic",
+    "endeavour": "endeavor",
+    "endeavoured": "endeavored",
+    "endeavouring": "endeavoring",
+    "endeavours": "endeavors",
+    "energise": "energize",
+    "energised": "energized",
+    "energises": "energizes",
+    "energising": "energizing",
+    "enrol": "enroll",
+    "enrols": "enrolls",
+    "enthral": "enthrall",
+    "enthrals": "enthralls",
+    "epaulette": "epaulet",
+    "epaulettes": "epaulets",
+    "epicentre": "epicenter",
+    "epicentres": "epicenters",
+    "epilogue": "epilog",
+    "epilogues": "epilogs",
+    "epitomise": "epitomize",
+    "epitomised": "epitomized",
+    "epitomises": "epitomizes",
+    "epitomising": "epitomizing",
+    "equalisation": "equalization",
+    "equalise": "equalize",
+    "equalised": "equalized",
+    "equaliser": "equalizer",
+    "equalisers": "equalizers",
+    "equalises": "equalizes",
+    "equalising": "equalizing",
+    "eulogise": "eulogize",
+    "eulogised": "eulogized",
+    "eulogises": "eulogizes",
+    "eulogising": "eulogizing",
+    "evangelise": "evangelize",
+    "evangelised": "evangelized",
+    "evangelises": "evangelizes",
+    "evangelising": "evangelizing",
+    "exorcise": "exorcize",
+    "exorcised": "exorcized",
+    "exorcises": "exorcizes",
+    "exorcising": "exorcizing",
+    "extemporisation": "extemporization",
+    "extemporise": "extemporize",
+    "extemporised": "extemporized",
+    "extemporises": "extemporizes",
+    "extemporising": "extemporizing",
+    "externalisation": "externalization",
+    "externalisations": "externalizations",
+    "externalise": "externalize",
+    "externalised": "externalized",
+    "externalises": "externalizes",
+    "externalising": "externalizing",
+    "factorise": "factorize",
+    "factorised": "factorized",
+    "factorises": "factorizes",
+    "factorising": "factorizing",
+    "faecal": "fecal",
+    "faeces": "feces",
+    "familiarisation": "familiarization",
+    "familiarise": "familiarize",
+    "familiarised": "familiarized",
+    "familiarises": "familiarizes",
+    "familiarising": "familiarizing",
+    "fantasise": "fantasize",
+    "fantasised": "fantasized",
+    "fantasises": "fantasizes",
+    "fantasising": "fantasizing",
+    "favour": "favor",
+    "favourable": "favorable",
+    "favourably": "favorably",
+    "favoured": "favored",
+    "favouring": "favoring",
+    "favourite": "favorite",
+    "favourites": "favorites",
+    "favouritism": "favoritism",
+    "favours": "favors",
+    "feminise": "feminize",
+    "feminised": "feminized",
+    "feminises": "feminizes",
+    "feminising": "feminizing",
+    "fertilisation": "fertilization",
+    "fertilise": "fertilize",
+    "fertilised": "fertilized",
+    "fertiliser": "fertilizer",
+    "fertilisers": "fertilizers",
+    "fertilises": "fertilizes",
+    "fertilising": "fertilizing",
+    "fervour": "fervor",
+    "fibre": "fiber",
+    "fibreglass": "fiberglass",
+    "fibres": "fibers",
+    "fictionalisation": "fictionalization",
+    "fictionalisations": "fictionalizations",
+    "fictionalise": "fictionalize",
+    "fictionalised": "fictionalized",
+    "fictionalises": "fictionalizes",
+    "fictionalising": "fictionalizing",
+    "fillet": "filet",
+    "filleted": "fileted",
+    "filleting": "fileting",
+    "fillets": "filets",
+    "finalisation": "finalization",
+    "finalise": "finalize",
+    "finalised": "finalized",
+    "finalises": "finalizes",
+    "finalising": "finalizing",
+    "flautist": "flutist",
+    "flautists": "flutists",
+    "flavour": "flavor",
+    "flavoured": "flavored",
+    "flavouring": "flavoring",
+    "flavourings": "flavorings",
+    "flavourless": "flavorless",
+    "flavours": "flavors",
+    "flavoursome": "flavorsome",
+    "flyer / flier": "flier / flyer",
+    "foetal": "fetal",
+    "foetid": "fetid",
+    "foetus": "fetus",
+    "foetuses": "fetuses",
+    "formalisation": "formalization",
+    "formalise": "formalize",
+    "formalised": "formalized",
+    "formalises": "formalizes",
+    "formalising": "formalizing",
+    "fossilisation": "fossilization",
+    "fossilise": "fossilize",
+    "fossilised": "fossilized",
+    "fossilises": "fossilizes",
+    "fossilising": "fossilizing",
+    "fraternisation": "fraternization",
+    "fraternise": "fraternize",
+    "fraternised": "fraternized",
+    "fraternises": "fraternizes",
+    "fraternising": "fraternizing",
+    "fulfil": "fulfill",
+    "fulfilment": "fulfillment",
+    "fulfils": "fulfills",
+    "funnelled": "funneled",
+    "funnelling": "funneling",
+    "galvanise": "galvanize",
+    "galvanised": "galvanized",
+    "galvanises": "galvanizes",
+    "galvanising": "galvanizing",
+    "gambolled": "gamboled",
+    "gambolling": "gamboling",
+    "gaol": "jail",
+    "gaolbird": "jailbird",
+    "gaolbirds": "jailbirds",
+    "gaolbreak": "jailbreak",
+    "gaolbreaks": "jailbreaks",
+    "gaoled": "jailed",
+    "gaoler": "jailer",
+    "gaolers": "jailers",
+    "gaoling": "jailing",
+    "gaols": "jails",
+    "gasses": "gases",
+    "gage": "gauge",
+    "gaged": "gauged",
+    "gages": "gauges",
+    "gaging": "gauging",
+    "generalisation": "generalization",
+    "generalisations": "generalizations",
+    "generalise": "generalize",
+    "generalised": "generalized",
+    "generalises": "generalizes",
+    "generalising": "generalizing",
+    "ghettoise": "ghettoize",
+    "ghettoised": "ghettoized",
+    "ghettoises": "ghettoizes",
+    "ghettoising": "ghettoizing",
+    "gipsies": "gypsies",
+    "glamorise": "glamorize",
+    "glamorised": "glamorized",
+    "glamorises": "glamorizes",
+    "glamorising": "glamorizing",
+    "glamor": "glamour",
+    "globalisation": "globalization",
+    "globalise": "globalize",
+    "globalised": "globalized",
+    "globalises": "globalizes",
+    "globalising": "globalizing",
+    "glueing": "gluing",
+    "goitre": "goiter",
+    "goitres": "goiters",
+    "gonorrhoea": "gonorrhea",
+    "gramme": "gram",
+    "grammes": "grams",
+    "gravelled": "graveled",
+    "grey": "gray",
+    "greyed": "grayed",
+    "greying": "graying",
+    "greyish": "grayish",
+    "greyness": "grayness",
+    "greys": "grays",
+    "grovelled": "groveled",
+    "grovelling": "groveling",
+    "groyne": "groin",
+    "groynes": "groins",
+    "gruelling": "grueling",
+    "gruellingly": "gruelingly",
+    "gryphon": "griffin",
+    "gryphons": "griffins",
+    "gynaecological": "gynecological",
+    "gynaecologist": "gynecologist",
+    "gynaecologists": "gynecologists",
+    "gynaecology": "gynecology",
+    "haematological": "hematological",
+    "haematologist": "hematologist",
+    "haematologists": "hematologists",
+    "haematology": "hematology",
+    "haemoglobin": "hemoglobin",
+    "haemophilia": "hemophilia",
+    "haemophiliac": "hemophiliac",
+    "haemophiliacs": "hemophiliacs",
+    "haemorrhage": "hemorrhage",
+    "haemorrhaged": "hemorrhaged",
+    "haemorrhages": "hemorrhages",
+    "haemorrhaging": "hemorrhaging",
+    "haemorrhoids": "hemorrhoids",
+    "harbour": "harbor",
+    "harboured": "harbored",
+    "harbouring": "harboring",
+    "harbours": "harbors",
+    "harmonisation": "harmonization",
+    "harmonise": "harmonize",
+    "harmonised": "harmonized",
+    "harmonises": "harmonizes",
+    "harmonising": "harmonizing",
+    "homoeopath": "homeopath",
+    "homoeopathic": "homeopathic",
+    "homoeopaths": "homeopaths",
+    "homoeopathy": "homeopathy",
+    "homogenise": "homogenize",
+    "homogenised": "homogenized",
+    "homogenises": "homogenizes",
+    "homogenising": "homogenizing",
+    "honour": "honor",
+    "honourable": "honorable",
+    "honourably": "honorably",
+    "honoured": "honored",
+    "honouring": "honoring",
+    "honours": "honors",
+    "hospitalisation": "hospitalization",
+    "hospitalise": "hospitalize",
+    "hospitalised": "hospitalized",
+    "hospitalises": "hospitalizes",
+    "hospitalising": "hospitalizing",
+    "humanise": "humanize",
+    "humanised": "humanized",
+    "humanises": "humanizes",
+    "humanising": "humanizing",
+    "humour": "humor",
+    "humoured": "humored",
+    "humouring": "humoring",
+    "humourless": "humorless",
+    "humours": "humors",
+    "hybridise": "hybridize",
+    "hybridised": "hybridized",
+    "hybridises": "hybridizes",
+    "hybridising": "hybridizing",
+    "hypnotise": "hypnotize",
+    "hypnotised": "hypnotized",
+    "hypnotises": "hypnotizes",
+    "hypnotising": "hypnotizing",
+    "hypothesise": "hypothesize",
+    "hypothesised": "hypothesized",
+    "hypothesises": "hypothesizes",
+    "hypothesising": "hypothesizing",
+    "idealisation": "idealization",
+    "idealise": "idealize",
+    "idealised": "idealized",
+    "idealises": "idealizes",
+    "idealising": "idealizing",
+    "idolise": "idolize",
+    "idolised": "idolized",
+    "idolises": "idolizes",
+    "idolising": "idolizing",
+    "immobilisation": "immobilization",
+    "immobilise": "immobilize",
+    "immobilised": "immobilized",
+    "immobiliser": "immobilizer",
+    "immobilisers": "immobilizers",
+    "immobilises": "immobilizes",
+    "immobilising": "immobilizing",
+    "immortalise": "immortalize",
+    "immortalised": "immortalized",
+    "immortalises": "immortalizes",
+    "immortalising": "immortalizing",
+    "immunisation": "immunization",
+    "immunise": "immunize",
+    "immunised": "immunized",
+    "immunises": "immunizes",
+    "immunising": "immunizing",
+    "impanelled": "impaneled",
+    "impanelling": "impaneling",
+    "imperilled": "imperiled",
+    "imperilling": "imperiling",
+    "individualise": "individualize",
+    "individualised": "individualized",
+    "individualises": "individualizes",
+    "individualising": "individualizing",
+    "industrialise": "industrialize",
+    "industrialised": "industrialized",
+    "industrialises": "industrializes",
+    "industrialising": "industrializing",
+    "inflexion": "inflection",
+    "inflexions": "inflections",
+    "initialise": "initialize",
+    "initialised": "initialized",
+    "initialises": "initializes",
+    "initialising": "initializing",
+    "initialled": "initialed",
+    "initialling": "initialing",
+    "instal": "install",
+    "instalment": "installment",
+    "instalments": "installments",
+    "instals": "installs",
+    "instil": "instill",
+    "instils": "instills",
+    "institutionalisation": "institutionalization",
+    "institutionalise": "institutionalize",
+    "institutionalised": "institutionalized",
+    "institutionalises": "institutionalizes",
+    "institutionalising": "institutionalizing",
+    "intellectualise": "intellectualize",
+    "intellectualised": "intellectualized",
+    "intellectualises": "intellectualizes",
+    "intellectualising": "intellectualizing",
+    "internalisation": "internalization",
+    "internalise": "internalize",
+    "internalised": "internalized",
+    "internalises": "internalizes",
+    "internalising": "internalizing",
+    "internationalisation": "internationalization",
+    "internationalise": "internationalize",
+    "internationalised": "internationalized",
+    "internationalises": "internationalizes",
+    "internationalising": "internationalizing",
+    "ionisation": "ionization",
+    "ionise": "ionize",
+    "ionised": "ionized",
+    "ioniser": "ionizer",
+    "ionisers": "ionizers",
+    "ionises": "ionizes",
+    "ionising": "ionizing",
+    "italicise": "italicize",
+    "italicised": "italicized",
+    "italicises": "italicizes",
+    "italicising": "italicizing",
+    "itemise": "itemize",
+    "itemised": "itemized",
+    "itemises": "itemizes",
+    "itemising": "itemizing",
+    "jeopardise": "jeopardize",
+    "jeopardised": "jeopardized",
+    "jeopardises": "jeopardizes",
+    "jeopardising": "jeopardizing",
+    "jewelled": "jeweled",
+    "jeweller": "jeweler",
+    "jewellers": "jewelers",
+    "jewellery": "jewelry",
+    "judgement": "judgment",
+    "kilogramme": "kilogram",
+    "kilogrammes": "kilograms",
+    "kilometre": "kilometer",
+    "kilometres": "kilometers",
+    "labelled": "labeled",
+    "labelling": "labeling",
+    "labour": "labor",
+    "laboured": "labored",
+    "labourer": "laborer",
+    "labourers": "laborers",
+    "labouring": "laboring",
+    "labours": "labors",
+    "lacklustre": "lackluster",
+    "legalisation": "legalization",
+    "legalise": "legalize",
+    "legalised": "legalized",
+    "legalises": "legalizes",
+    "legalising": "legalizing",
+    "legitimise": "legitimize",
+    "legitimised": "legitimized",
+    "legitimises": "legitimizes",
+    "legitimising": "legitimizing",
+    "leukaemia": "leukemia",
+    "levelled": "leveled",
+    "leveller": "leveler",
+    "levellers": "levelers",
+    "levelling": "leveling",
+    "libelled": "libeled",
+    "libelling": "libeling",
+    "libellous": "libelous",
+    "liberalisation": "liberalization",
+    "liberalise": "liberalize",
+    "liberalised": "liberalized",
+    "liberalises": "liberalizes",
+    "liberalising": "liberalizing",
+    "licence": "license",
+    "licenced": "licensed",
+    "licences": "licenses",
+    "licencing": "licensing",
+    "likeable": "likable",
+    "lionisation": "lionization",
+    "lionise": "lionize",
+    "lionised": "lionized",
+    "lionises": "lionizes",
+    "lionising": "lionizing",
+    "liquidise": "liquidize",
+    "liquidised": "liquidized",
+    "liquidiser": "liquidizer",
+    "liquidisers": "liquidizers",
+    "liquidises": "liquidizes",
+    "liquidising": "liquidizing",
+    "litre": "liter",
+    "litres": "liters",
+    "localise": "localize",
+    "localised": "localized",
+    "localises": "localizes",
+    "localising": "localizing",
+    "louvre": "louver",
+    "louvred": "louvered",
+    "louvres": "louvers",
+    "lustre": "luster",
+    "magnetise": "magnetize",
+    "magnetised": "magnetized",
+    "magnetises": "magnetizes",
+    "magnetising": "magnetizing",
+    "manoeuvrability": "maneuverability",
+    "manoeuvrable": "maneuverable",
+    "manoeuvre": "maneuver",
+    "manoeuvred": "maneuvered",
+    "manoeuvres": "maneuvers",
+    "manoeuvring": "maneuvering",
+    "manoeuvrings": "maneuverings",
+    "marginalisation": "marginalization",
+    "marginalise": "marginalize",
+    "marginalised": "marginalized",
+    "marginalises": "marginalizes",
+    "marginalising": "marginalizing",
+    "marshalled": "marshaled",
+    "marshalling": "marshaling",
+    "marvelled": "marveled",
+    "marvelling": "marveling",
+    "marvellous": "marvelous",
+    "marvellously": "marvelously",
+    "materialisation": "materialization",
+    "materialise": "materialize",
+    "materialised": "materialized",
+    "materialises": "materializes",
+    "materialising": "materializing",
+    "maximisation": "maximization",
+    "maximise": "maximize",
+    "maximised": "maximized",
+    "maximises": "maximizes",
+    "maximising": "maximizing",
+    "meagre": "meager",
+    "mechanisation": "mechanization",
+    "mechanise": "mechanize",
+    "mechanised": "mechanized",
+    "mechanises": "mechanizes",
+    "mechanising": "mechanizing",
+    "mediaeval": "medieval",
+    "memorialise": "memorialize",
+    "memorialised": "memorialized",
+    "memorialises": "memorializes",
+    "memorialising": "memorializing",
+    "memorise": "memorize",
+    "memorised": "memorized",
+    "memorises": "memorizes",
+    "memorising": "memorizing",
+    "mesmerise": "mesmerize",
+    "mesmerised": "mesmerized",
+    "mesmerises": "mesmerizes",
+    "mesmerising": "mesmerizing",
+    "metabolise": "metabolize",
+    "metabolised": "metabolized",
+    "metabolises": "metabolizes",
+    "metabolising": "metabolizing",
+    "metre": "meter",
+    "metres": "meters",
+    "micrometre": "micrometer",
+    "micrometres": "micrometers",
+    "militarise": "militarize",
+    "militarised": "militarized",
+    "militarises": "militarizes",
+    "militarising": "militarizing",
+    "milligramme": "milligram",
+    "milligrammes": "milligrams",
+    "millilitre": "milliliter",
+    "millilitres": "milliliters",
+    "millimetre": "millimeter",
+    "millimetres": "millimeters",
+    "miniaturisation": "miniaturization",
+    "miniaturise": "miniaturize",
+    "miniaturised": "miniaturized",
+    "miniaturises": "miniaturizes",
+    "miniaturising": "miniaturizing",
+    "minibusses": "minibuses",
+    "minimise": "minimize",
+    "minimised": "minimized",
+    "minimises": "minimizes",
+    "minimising": "minimizing",
+    "misbehaviour": "misbehavior",
+    "misdemeanour": "misdemeanor",
+    "misdemeanours": "misdemeanors",
+    "misspelt": "misspelled",
+    "mitre": "miter",
+    "mitres": "miters",
+    "mobilisation": "mobilization",
+    "mobilise": "mobilize",
+    "mobilised": "mobilized",
+    "mobilises": "mobilizes",
+    "mobilising": "mobilizing",
+    "modelled": "modeled",
+    "modeller": "modeler",
+    "modellers": "modelers",
+    "modelling": "modeling",
+    "modernise": "modernize",
+    "modernised": "modernized",
+    "modernises": "modernizes",
+    "modernising": "modernizing",
+    "moisturise": "moisturize",
+    "moisturised": "moisturized",
+    "moisturiser": "moisturizer",
+    "moisturisers": "moisturizers",
+    "moisturises": "moisturizes",
+    "moisturising": "moisturizing",
+    "monologue": "monolog",
+    "monologues": "monologs",
+    "monopolisation": "monopolization",
+    "monopolise": "monopolize",
+    "monopolised": "monopolized",
+    "monopolises": "monopolizes",
+    "monopolising": "monopolizing",
+    "moralise": "moralize",
+    "moralised": "moralized",
+    "moralises": "moralizes",
+    "moralising": "moralizing",
+    "motorised": "motorized",
+    "mould": "mold",
+    "moulded": "molded",
+    "moulder": "molder",
+    "mouldered": "moldered",
+    "mouldering": "moldering",
+    "moulders": "molders",
+    "mouldier": "moldier",
+    "mouldiest": "moldiest",
+    "moulding": "molding",
+    "mouldings": "moldings",
+    "moulds": "molds",
+    "mouldy": "moldy",
+    "moult": "molt",
+    "moulted": "molted",
+    "moulting": "molting",
+    "moults": "molts",
+    "moustache": "mustache",
+    "moustached": "mustached",
+    "moustaches": "mustaches",
+    "moustachioed": "mustachioed",
+    "multicoloured": "multicolored",
+    "nationalisation": "nationalization",
+    "nationalisations": "nationalizations",
+    "nationalise": "nationalize",
+    "nationalised": "nationalized",
+    "nationalises": "nationalizes",
+    "nationalising": "nationalizing",
+    "naturalisation": "naturalization",
+    "naturalise": "naturalize",
+    "naturalised": "naturalized",
+    "naturalises": "naturalizes",
+    "naturalising": "naturalizing",
+    "neighbour": "neighbor",
+    "neighbourhood": "neighborhood",
+    "neighbourhoods": "neighborhoods",
+    "neighbouring": "neighboring",
+    "neighbourliness": "neighborliness",
+    "neighbourly": "neighborly",
+    "neighbours": "neighbors",
+    "neutralisation": "neutralization",
+    "neutralise": "neutralize",
+    "neutralised": "neutralized",
+    "neutralises": "neutralizes",
+    "neutralising": "neutralizing",
+    "normalisation": "normalization",
+    "normalise": "normalize",
+    "normalised": "normalized",
+    "normalises": "normalizes",
+    "normalising": "normalizing",
+    "odour": "odor",
+    "odourless": "odorless",
+    "odours": "odors",
+    "oesophagus": "esophagus",
+    "oesophaguses": "esophaguses",
+    "oestrogen": "estrogen",
+    "offence": "offense",
+    "offences": "offenses",
+    "omelette": "omelet",
+    "omelettes": "omelets",
+    "optimise": "optimize",
+    "optimised": "optimized",
+    "optimises": "optimizes",
+    "optimising": "optimizing",
+    "organisation": "organization",
+    "organisational": "organizational",
+    "organisations": "organizations",
+    "organise": "organize",
+    "organised": "organized",
+    "organiser": "organizer",
+    "organisers": "organizers",
+    "organises": "organizes",
+    "organising": "organizing",
+    "orthopaedic": "orthopedic",
+    "orthopaedics": "orthopedics",
+    "ostracise": "ostracize",
+    "ostracised": "ostracized",
+    "ostracises": "ostracizes",
+    "ostracising": "ostracizing",
+    "outmanoeuvre": "outmaneuver",
+    "outmanoeuvred": "outmaneuvered",
+    "outmanoeuvres": "outmaneuvers",
+    "outmanoeuvring": "outmaneuvering",
+    "overemphasise": "overemphasize",
+    "overemphasised": "overemphasized",
+    "overemphasises": "overemphasizes",
+    "overemphasising": "overemphasizing",
+    "oxidisation": "oxidization",
+    "oxidise": "oxidize",
+    "oxidised": "oxidized",
+    "oxidises": "oxidizes",
+    "oxidising": "oxidizing",
+    "paederast": "pederast",
+    "paederasts": "pederasts",
+    "paediatric": "pediatric",
+    "paediatrician": "pediatrician",
+    "paediatricians": "pediatricians",
+    "paediatrics": "pediatrics",
+    "paedophile": "pedophile",
+    "paedophiles": "pedophiles",
+    "paedophilia": "pedophilia",
+    "palaeolithic": "paleolithic",
+    "palaeontologist": "paleontologist",
+    "palaeontologists": "paleontologists",
+    "palaeontology": "paleontology",
+    "panelled": "paneled",
+    "panelling": "paneling",
+    "panellist": "panelist",
+    "panellists": "panelists",
+    "paralyse": "paralyze",
+    "paralysed": "paralyzed",
+    "paralyses": "paralyzes",
+    "paralysing": "paralyzing",
+    "parcelled": "parceled",
+    "parcelling": "parceling",
+    "parlour": "parlor",
+    "parlours": "parlors",
+    "particularise": "particularize",
+    "particularised": "particularized",
+    "particularises": "particularizes",
+    "particularising": "particularizing",
+    "passivisation": "passivization",
+    "passivise": "passivize",
+    "passivised": "passivized",
+    "passivises": "passivizes",
+    "passivising": "passivizing",
+    "pasteurisation": "pasteurization",
+    "pasteurise": "pasteurize",
+    "pasteurised": "pasteurized",
+    "pasteurises": "pasteurizes",
+    "pasteurising": "pasteurizing",
+    "patronise": "patronize",
+    "patronised": "patronized",
+    "patronises": "patronizes",
+    "patronising": "patronizing",
+    "patronisingly": "patronizingly",
+    "pedalled": "pedaled",
+    "pedalling": "pedaling",
+    "pedestrianisation": "pedestrianization",
+    "pedestrianise": "pedestrianize",
+    "pedestrianised": "pedestrianized",
+    "pedestrianises": "pedestrianizes",
+    "pedestrianising": "pedestrianizing",
+    "penalise": "penalize",
+    "penalised": "penalized",
+    "penalises": "penalizes",
+    "penalising": "penalizing",
+    "pencilled": "penciled",
+    "pencilling": "penciling",
+    "personalise": "personalize",
+    "personalised": "personalized",
+    "personalises": "personalizes",
+    "personalising": "personalizing",
+    "pharmacopoeia": "pharmacopeia",
+    "pharmacopoeias": "pharmacopeias",
+    "philosophise": "philosophize",
+    "philosophised": "philosophized",
+    "philosophises": "philosophizes",
+    "philosophising": "philosophizing",
+    "philtre": "philter",
+    "philtres": "philters",
+    "phoney": "phony",
+    "plagiarise": "plagiarize",
+    "plagiarised": "plagiarized",
+    "plagiarises": "plagiarizes",
+    "plagiarising": "plagiarizing",
+    "plough": "plow",
+    "ploughed": "plowed",
+    "ploughing": "plowing",
+    "ploughman": "plowman",
+    "ploughmen": "plowmen",
+    "ploughs": "plows",
+    "ploughshare": "plowshare",
+    "ploughshares": "plowshares",
+    "polarisation": "polarization",
+    "polarise": "polarize",
+    "polarised": "polarized",
+    "polarises": "polarizes",
+    "polarising": "polarizing",
+    "politicisation": "politicization",
+    "politicise": "politicize",
+    "politicised": "politicized",
+    "politicises": "politicizes",
+    "politicising": "politicizing",
+    "popularisation": "popularization",
+    "popularise": "popularize",
+    "popularised": "popularized",
+    "popularises": "popularizes",
+    "popularising": "popularizing",
+    "pouffe": "pouf",
+    "pouffes": "poufs",
+    "practise": "practice",
+    "practised": "practiced",
+    "practises": "practices",
+    "practising": "practicing",
+    "praesidium": "presidium",
+    "praesidiums": "presidiums",
+    "pressurisation": "pressurization",
+    "pressurise": "pressurize",
+    "pressurised": "pressurized",
+    "pressurises": "pressurizes",
+    "pressurising": "pressurizing",
+    "pretence": "pretense",
+    "pretences": "pretenses",
+    "primaeval": "primeval",
+    "prioritisation": "prioritization",
+    "prioritise": "prioritize",
+    "prioritised": "prioritized",
+    "prioritises": "prioritizes",
+    "prioritising": "prioritizing",
+    "privatisation": "privatization",
+    "privatisations": "privatizations",
+    "privatise": "privatize",
+    "privatised": "privatized",
+    "privatises": "privatizes",
+    "privatising": "privatizing",
+    "professionalisation": "professionalization",
+    "professionalise": "professionalize",
+    "professionalised": "professionalized",
+    "professionalises": "professionalizes",
+    "professionalising": "professionalizing",
+    "programme": "program",
+    "programmes": "programs",
+    "prologue": "prolog",
+    "prologues": "prologs",
+    "propagandise": "propagandize",
+    "propagandised": "propagandized",
+    "propagandises": "propagandizes",
+    "propagandising": "propagandizing",
+    "proselytise": "proselytize",
+    "proselytised": "proselytized",
+    "proselytiser": "proselytizer",
+    "proselytisers": "proselytizers",
+    "proselytises": "proselytizes",
+    "proselytising": "proselytizing",
+    "psychoanalyse": "psychoanalyze",
+    "psychoanalysed": "psychoanalyzed",
+    "psychoanalyses": "psychoanalyzes",
+    "psychoanalysing": "psychoanalyzing",
+    "publicise": "publicize",
+    "publicised": "publicized",
+    "publicises": "publicizes",
+    "publicising": "publicizing",
+    "pulverisation": "pulverization",
+    "pulverise": "pulverize",
+    "pulverised": "pulverized",
+    "pulverises": "pulverizes",
+    "pulverising": "pulverizing",
+    "pummelled": "pummeled",
+    "pummelling": "pummeling",
+    "pyjama": "pajama",
+    "pyjamas": "pajamas",
+    "pzazz": "pizzazz",
+    "quarrelled": "quarreled",
+    "quarrelling": "quarreling",
+    "radicalise": "radicalize",
+    "radicalised": "radicalized",
+    "radicalises": "radicalizes",
+    "radicalising": "radicalizing",
+    "rancour": "rancor",
+    "randomise": "randomize",
+    "randomised": "randomized",
+    "randomises": "randomizes",
+    "randomising": "randomizing",
+    "rationalisation": "rationalization",
+    "rationalisations": "rationalizations",
+    "rationalise": "rationalize",
+    "rationalised": "rationalized",
+    "rationalises": "rationalizes",
+    "rationalising": "rationalizing",
+    "ravelled": "raveled",
+    "ravelling": "raveling",
+    "realisable": "realizable",
+    "realisation": "realization",
+    "realisations": "realizations",
+    "realise": "realize",
+    "realised": "realized",
+    "realises": "realizes",
+    "realising": "realizing",
+    "recognisable": "recognizable",
+    "recognisably": "recognizably",
+    "recognisance": "recognizance",
+    "recognise": "recognize",
+    "recognised": "recognized",
+    "recognises": "recognizes",
+    "recognising": "recognizing",
+    "reconnoitre": "reconnoiter",
+    "reconnoitred": "reconnoitered",
+    "reconnoitres": "reconnoiters",
+    "reconnoitring": "reconnoitering",
+    "refuelled": "refueled",
+    "refuelling": "refueling",
+    "regularisation": "regularization",
+    "regularise": "regularize",
+    "regularised": "regularized",
+    "regularises": "regularizes",
+    "regularising": "regularizing",
+    "remodelled": "remodeled",
+    "remodelling": "remodeling",
+    "remould": "remold",
+    "remoulded": "remolded",
+    "remoulding": "remolding",
+    "remoulds": "remolds",
+    "reorganisation": "reorganization",
+    "reorganisations": "reorganizations",
+    "reorganise": "reorganize",
+    "reorganised": "reorganized",
+    "reorganises": "reorganizes",
+    "reorganising": "reorganizing",
+    "revelled": "reveled",
+    "reveller": "reveler",
+    "revellers": "revelers",
+    "revelling": "reveling",
+    "revitalise": "revitalize",
+    "revitalised": "revitalized",
+    "revitalises": "revitalizes",
+    "revitalising": "revitalizing",
+    "revolutionise": "revolutionize",
+    "revolutionised": "revolutionized",
+    "revolutionises": "revolutionizes",
+    "revolutionising": "revolutionizing",
+    "rhapsodise": "rhapsodize",
+    "rhapsodised": "rhapsodized",
+    "rhapsodises": "rhapsodizes",
+    "rhapsodising": "rhapsodizing",
+    "rigour": "rigor",
+    "rigours": "rigors",
+    "ritualised": "ritualized",
+    "rivalled": "rivaled",
+    "rivalling": "rivaling",
+    "romanticise": "romanticize",
+    "romanticised": "romanticized",
+    "romanticises": "romanticizes",
+    "romanticising": "romanticizing",
+    "rumour": "rumor",
+    "rumoured": "rumored",
+    "rumours": "rumors",
+    "sabre": "saber",
+    "sabres": "sabers",
+    "saltpetre": "saltpeter",
+    "sanitise": "sanitize",
+    "sanitised": "sanitized",
+    "sanitises": "sanitizes",
+    "sanitising": "sanitizing",
+    "satirise": "satirize",
+    "satirised": "satirized",
+    "satirises": "satirizes",
+    "satirising": "satirizing",
+    "saviour": "savior",
+    "saviours": "saviors",
+    "savour": "savor",
+    "savoured": "savored",
+    "savouries": "savories",
+    "savouring": "savoring",
+    "savours": "savors",
+    "savoury": "savory",
+    "scandalise": "scandalize",
+    "scandalised": "scandalized",
+    "scandalises": "scandalizes",
+    "scandalising": "scandalizing",
+    "sceptic": "skeptic",
+    "sceptical": "skeptical",
+    "sceptically": "skeptically",
+    "scepticism": "skepticism",
+    "sceptics": "skeptics",
+    "sceptre": "scepter",
+    "sceptres": "scepters",
+    "scrutinise": "scrutinize",
+    "scrutinised": "scrutinized",
+    "scrutinises": "scrutinizes",
+    "scrutinising": "scrutinizing",
+    "secularisation": "secularization",
+    "secularise": "secularize",
+    "secularised": "secularized",
+    "secularises": "secularizes",
+    "secularising": "secularizing",
+    "sensationalise": "sensationalize",
+    "sensationalised": "sensationalized",
+    "sensationalises": "sensationalizes",
+    "sensationalising": "sensationalizing",
+    "sensitise": "sensitize",
+    "sensitised": "sensitized",
+    "sensitises": "sensitizes",
+    "sensitising": "sensitizing",
+    "sentimentalise": "sentimentalize",
+    "sentimentalised": "sentimentalized",
+    "sentimentalises": "sentimentalizes",
+    "sentimentalising": "sentimentalizing",
+    "sepulchre": "sepulcher",
+    "sepulchres": "sepulchers",
+    "serialisation": "serialization",
+    "serialisations": "serializations",
+    "serialise": "serialize",
+    "serialised": "serialized",
+    "serialises": "serializes",
+    "serialising": "serializing",
+    "sermonise": "sermonize",
+    "sermonised": "sermonized",
+    "sermonises": "sermonizes",
+    "sermonising": "sermonizing",
+    "sheikh": "sheik",
+    "shovelled": "shoveled",
+    "shovelling": "shoveling",
+    "shrivelled": "shriveled",
+    "shrivelling": "shriveling",
+    "signalise": "signalize",
+    "signalised": "signalized",
+    "signalises": "signalizes",
+    "signalising": "signalizing",
+    "signalled": "signaled",
+    "signalling": "signaling",
+    "smoulder": "smolder",
+    "smouldered": "smoldered",
+    "smouldering": "smoldering",
+    "smoulders": "smolders",
+    "snivelled": "sniveled",
+    "snivelling": "sniveling",
+    "snorkelled": "snorkeled",
+    "snorkelling": "snorkeling",
+    "snowplough": "snowplow",
+    "snowploughs": "snowplows",
+    "socialisation": "socialization",
+    "socialise": "socialize",
+    "socialised": "socialized",
+    "socialises": "socializes",
+    "socialising": "socializing",
+    "sodomise": "sodomize",
+    "sodomised": "sodomized",
+    "sodomises": "sodomizes",
+    "sodomising": "sodomizing",
+    "solemnise": "solemnize",
+    "solemnised": "solemnized",
+    "solemnises": "solemnizes",
+    "solemnising": "solemnizing",
+    "sombre": "somber",
+    "specialisation": "specialization",
+    "specialisations": "specializations",
+    "specialise": "specialize",
+    "specialised": "specialized",
+    "specialises": "specializes",
+    "specialising": "specializing",
+    "spectre": "specter",
+    "spectres": "specters",
+    "spiralled": "spiraled",
+    "spiralling": "spiraling",
+    "splendour": "splendor",
+    "splendours": "splendors",
+    "squirrelled": "squirreled",
+    "squirrelling": "squirreling",
+    "stabilisation": "stabilization",
+    "stabilise": "stabilize",
+    "stabilised": "stabilized",
+    "stabiliser": "stabilizer",
+    "stabilisers": "stabilizers",
+    "stabilises": "stabilizes",
+    "stabilising": "stabilizing",
+    "standardisation": "standardization",
+    "standardise": "standardize",
+    "standardised": "standardized",
+    "standardises": "standardizes",
+    "standardising": "standardizing",
+    "stencilled": "stenciled",
+    "stencilling": "stenciling",
+    "sterilisation": "sterilization",
+    "sterilisations": "sterilizations",
+    "sterilise": "sterilize",
+    "sterilised": "sterilized",
+    "steriliser": "sterilizer",
+    "sterilisers": "sterilizers",
+    "sterilises": "sterilizes",
+    "sterilising": "sterilizing",
+    "stigmatisation": "stigmatization",
+    "stigmatise": "stigmatize",
+    "stigmatised": "stigmatized",
+    "stigmatises": "stigmatizes",
+    "stigmatising": "stigmatizing",
+    "storey": "story",
+    "storeys": "stories",
+    "subsidisation": "subsidization",
+    "subsidise": "subsidize",
+    "subsidised": "subsidized",
+    "subsidiser": "subsidizer",
+    "subsidisers": "subsidizers",
+    "subsidises": "subsidizes",
+    "subsidising": "subsidizing",
+    "succour": "succor",
+    "succoured": "succored",
+    "succouring": "succoring",
+    "succours": "succors",
+    "sulphate": "sulfate",
+    "sulphates": "sulfates",
+    "sulphide": "sulfide",
+    "sulphides": "sulfides",
+    "sulphur": "sulfur",
+    "sulphurous": "sulfurous",
+    "summarise": "summarize",
+    "summarised": "summarized",
+    "summarises": "summarizes",
+    "summarising": "summarizing",
+    "swivelled": "swiveled",
+    "swivelling": "swiveling",
+    "symbolise": "symbolize",
+    "symbolised": "symbolized",
+    "symbolises": "symbolizes",
+    "symbolising": "symbolizing",
+    "sympathise": "sympathize",
+    "sympathised": "sympathized",
+    "sympathiser": "sympathizer",
+    "sympathisers": "sympathizers",
+    "sympathises": "sympathizes",
+    "sympathising": "sympathizing",
+    "synchronisation": "synchronization",
+    "synchronise": "synchronize",
+    "synchronised": "synchronized",
+    "synchronises": "synchronizes",
+    "synchronising": "synchronizing",
+    "synthesise": "synthesize",
+    "synthesised": "synthesized",
+    "synthesiser": "synthesizer",
+    "synthesisers": "synthesizers",
+    "synthesises": "synthesizes",
+    "synthesising": "synthesizing",
+    "syphon": "siphon",
+    "syphoned": "siphoned",
+    "syphoning": "siphoning",
+    "syphons": "siphons",
+    "systematisation": "systematization",
+    "systematise": "systematize",
+    "systematised": "systematized",
+    "systematises": "systematizes",
+    "systematising": "systematizing",
+    "tantalise": "tantalize",
+    "tantalised": "tantalized",
+    "tantalises": "tantalizes",
+    "tantalising": "tantalizing",
+    "tantalisingly": "tantalizingly",
+    "tasselled": "tasseled",
+    "technicolour": "technicolor",
+    "temporise": "temporize",
+    "temporised": "temporized",
+    "temporises": "temporizes",
+    "temporising": "temporizing",
+    "tenderise": "tenderize",
+    "tenderised": "tenderized",
+    "tenderises": "tenderizes",
+    "tenderising": "tenderizing",
+    "terrorise": "terrorize",
+    "terrorised": "terrorized",
+    "terrorises": "terrorizes",
+    "terrorising": "terrorizing",
+    "theatre": "theater",
+    "theatregoer": "theatergoer",
+    "theatregoers": "theatergoers",
+    "theatres": "theaters",
+    "theorise": "theorize",
+    "theorised": "theorized",
+    "theorises": "theorizes",
+    "theorising": "theorizing",
+    "tonne": "ton",
+    "tonnes": "tons",
+    "towelled": "toweled",
+    "towelling": "toweling",
+    "toxaemia": "toxemia",
+    "tranquillise": "tranquilize",
+    "tranquillised": "tranquilized",
+    "tranquilliser": "tranquilizer",
+    "tranquillisers": "tranquilizers",
+    "tranquillises": "tranquilizes",
+    "tranquillising": "tranquilizing",
+    "tranquillity": "tranquility",
+    "tranquillize": "tranquilize",
+    "tranquillized": "tranquilized",
+    "tranquillizer": "tranquilizer",
+    "tranquillizers": "tranquilizers",
+    "tranquillizes": "tranquilizes",
+    "tranquillizing": "tranquilizing",
+    "tranquilly": "tranquility",
+    "transistorised": "transistorized",
+    "traumatise": "traumatize",
+    "traumatised": "traumatized",
+    "traumatises": "traumatizes",
+    "traumatising": "traumatizing",
+    "travelled": "traveled",
+    "traveller": "traveler",
+    "travellers": "travelers",
+    "travelling": "traveling",
+    "travelog": "travelogue",
+    "travelogs": "travelogues",
+    "trialled": "trialed",
+    "trialling": "trialing",
+    "tricolour": "tricolor",
+    "tricolours": "tricolors",
+    "trivialise": "trivialize",
+    "trivialised": "trivialized",
+    "trivialises": "trivializes",
+    "trivialising": "trivializing",
+    "tumour": "tumor",
+    "tumours": "tumors",
+    "tunnelled": "tunneled",
+    "tunnelling": "tunneling",
+    "tyrannise": "tyrannize",
+    "tyrannised": "tyrannized",
+    "tyrannises": "tyrannizes",
+    "tyrannising": "tyrannizing",
+    "tyre": "tire",
+    "tyres": "tires",
+    "unauthorised": "unauthorized",
+    "uncivilised": "uncivilized",
+    "underutilised": "underutilized",
+    "unequalled": "unequaled",
+    "unfavourable": "unfavorable",
+    "unfavourably": "unfavorably",
+    "unionisation": "unionization",
+    "unionise": "unionize",
+    "unionised": "unionized",
+    "unionises": "unionizes",
+    "unionising": "unionizing",
+    "unorganised": "unorganized",
+    "unravelled": "unraveled",
+    "unravelling": "unraveling",
+    "unrecognisable": "unrecognizable",
+    "unrecognised": "unrecognized",
+    "unrivalled": "unrivaled",
+    "unsavoury": "unsavory",
+    "untrammelled": "untrammeled",
+    "urbanisation": "urbanization",
+    "urbanise": "urbanize",
+    "urbanised": "urbanized",
+    "urbanises": "urbanizes",
+    "urbanising": "urbanizing",
+    "utilisable": "utilizable",
+    "utilisation": "utilization",
+    "utilise": "utilize",
+    "utilised": "utilized",
+    "utilises": "utilizes",
+    "utilising": "utilizing",
+    "valour": "valor",
+    "vandalise": "vandalize",
+    "vandalised": "vandalized",
+    "vandalises": "vandalizes",
+    "vandalising": "vandalizing",
+    "vaporisation": "vaporization",
+    "vaporise": "vaporize",
+    "vaporised": "vaporized",
+    "vaporises": "vaporizes",
+    "vaporising": "vaporizing",
+    "vapour": "vapor",
+    "vapours": "vapors",
+    "verbalise": "verbalize",
+    "verbalised": "verbalized",
+    "verbalises": "verbalizes",
+    "verbalising": "verbalizing",
+    "victimisation": "victimization",
+    "victimise": "victimize",
+    "victimised": "victimized",
+    "victimises": "victimizes",
+    "victimising": "victimizing",
+    "videodisc": "videodisk",
+    "videodiscs": "videodisks",
+    "vigour": "vigor",
+    "visualisation": "visualization",
+    "visualisations": "visualizations",
+    "visualise": "visualize",
+    "visualised": "visualized",
+    "visualises": "visualizes",
+    "visualising": "visualizing",
+    "vocalisation": "vocalization",
+    "vocalisations": "vocalizations",
+    "vocalise": "vocalize",
+    "vocalised": "vocalized",
+    "vocalises": "vocalizes",
+    "vocalising": "vocalizing",
+    "vulcanised": "vulcanized",
+    "vulgarisation": "vulgarization",
+    "vulgarise": "vulgarize",
+    "vulgarised": "vulgarized",
+    "vulgarises": "vulgarizes",
+    "vulgarising": "vulgarizing",
+    "waggon": "wagon",
+    "waggons": "wagons",
+    "watercolour": "watercolor",
+    "watercolours": "watercolors",
+    "weaselled": "weaseled",
+    "weaselling": "weaseling",
+    "westernisation": "westernization",
+    "westernise": "westernize",
+    "westernised": "westernized",
+    "westernises": "westernizes",
+    "westernising": "westernizing",
+    "womanise": "womanize",
+    "womanised": "womanized",
+    "womaniser": "womanizer",
+    "womanisers": "womanizers",
+    "womanises": "womanizes",
+    "womanising": "womanizing",
+    "woollen": "woolen",
+    "woollens": "woolens",
+    "woollies": "woolies",
+    "woolly": "wooly",
+    "worshipped": "worshiped",
+    "worshipping": "worshiping",
+    "worshipper": "worshiper",
+    "yodelled": "yodeled",
+    "yodelling": "yodeling",
+    "yoghourt": "yogurt",
+    "yoghourts": "yogurts",
+    "yoghurt": "yogurt",
+    "yoghurts": "yogurts",
+    "mhm": "hmm",
+    "mmm": "hmm"
+}

whisper/whisper/normalizers/english.py ADDED Viewed

	@@ -0,0 +1,550 @@

+import json
+import os
+import re
+from fractions import Fraction
+from typing import Iterator, List, Match, Optional, Union
+from more_itertools import windowed
+from .basic import remove_symbols_and_diacritics
+class EnglishNumberNormalizer:
+    """
+    Convert any spelled-out numbers into arabic numbers, while handling:
+    - remove any commas
+    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
+    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
+    - spell out `one` and `ones`
+    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
+    """
+    def __init__(self):
+        super().__init__()
+        self.zeros = {"o", "oh", "zero"}
+        self.ones = {
+            name: i
+            for i, name in enumerate(
+                [
+                    "one",
+                    "two",
+                    "three",
+                    "four",
+                    "five",
+                    "six",
+                    "seven",
+                    "eight",
+                    "nine",
+                    "ten",
+                    "eleven",
+                    "twelve",
+                    "thirteen",
+                    "fourteen",
+                    "fifteen",
+                    "sixteen",
+                    "seventeen",
+                    "eighteen",
+                    "nineteen",
+                ],
+                start=1,
+            )
+        }
+        self.ones_plural = {
+            "sixes" if name == "six" else name + "s": (value, "s")
+            for name, value in self.ones.items()
+        }
+        self.ones_ordinal = {
+            "zeroth": (0, "th"),
+            "first": (1, "st"),
+            "second": (2, "nd"),
+            "third": (3, "rd"),
+            "fifth": (5, "th"),
+            "twelfth": (12, "th"),
+            **{
+                name + ("h" if name.endswith("t") else "th"): (value, "th")
+                for name, value in self.ones.items()
+                if value > 3 and value != 5 and value != 12
+            },
+        }
+        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
+        self.tens = {
+            "twenty": 20,
+            "thirty": 30,
+            "forty": 40,
+            "fifty": 50,
+            "sixty": 60,
+            "seventy": 70,
+            "eighty": 80,
+            "ninety": 90,
+        }
+        self.tens_plural = {
+            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
+        }
+        self.tens_ordinal = {
+            name.replace("y", "ieth"): (value, "th")
+            for name, value in self.tens.items()
+        }
+        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
+        self.multipliers = {
+            "hundred": 100,
+            "thousand": 1_000,
+            "million": 1_000_000,
+            "billion": 1_000_000_000,
+            "trillion": 1_000_000_000_000,
+            "quadrillion": 1_000_000_000_000_000,
+            "quintillion": 1_000_000_000_000_000_000,
+            "sextillion": 1_000_000_000_000_000_000_000,
+            "septillion": 1_000_000_000_000_000_000_000_000,
+            "octillion": 1_000_000_000_000_000_000_000_000_000,
+            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
+            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
+        }
+        self.multipliers_plural = {
+            name + "s": (value, "s") for name, value in self.multipliers.items()
+        }
+        self.multipliers_ordinal = {
+            name + "th": (value, "th") for name, value in self.multipliers.items()
+        }
+        self.multipliers_suffixed = {
+            **self.multipliers_plural,
+            **self.multipliers_ordinal,
+        }
+        self.decimals = {*self.ones, *self.tens, *self.zeros}
+        self.preceding_prefixers = {
+            "minus": "-",
+            "negative": "-",
+            "plus": "+",
+            "positive": "+",
+        }
+        self.following_prefixers = {
+            "pound": "£",
+            "pounds": "£",
+            "euro": "€",
+            "euros": "€",
+            "dollar": "$",
+            "dollars": "$",
+            "cent": "¢",
+            "cents": "¢",
+        }
+        self.prefixes = set(
+            list(self.preceding_prefixers.values())
+            + list(self.following_prefixers.values())
+        )
+        self.suffixers = {
+            "per": {"cent": "%"},
+            "percent": "%",
+        }
+        self.specials = {"and", "double", "triple", "point"}
+        self.words = set(
+            [
+                key
+                for mapping in [
+                    self.zeros,
+                    self.ones,
+                    self.ones_suffixed,
+                    self.tens,
+                    self.tens_suffixed,
+                    self.multipliers,
+                    self.multipliers_suffixed,
+                    self.preceding_prefixers,
+                    self.following_prefixers,
+                    self.suffixers,
+                    self.specials,
+                ]
+                for key in mapping
+            ]
+        )
+        self.literal_words = {"one", "ones"}
+    def process_words(self, words: List[str]) -> Iterator[str]:
+        prefix: Optional[str] = None
+        value: Optional[Union[str, int]] = None
+        skip = False
+        def to_fraction(s: str):
+            try:
+                return Fraction(s)
+            except ValueError:
+                return None
+        def output(result: Union[str, int]):
+            nonlocal prefix, value
+            result = str(result)
+            if prefix is not None:
+                result = prefix + result
+            value = None
+            prefix = None
+            return result
+        if len(words) == 0:
+            return
+        for prev, current, next in windowed([None] + words + [None], 3):
+            if skip:
+                skip = False
+                continue
+            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
+            has_prefix = current[0] in self.prefixes
+            current_without_prefix = current[1:] if has_prefix else current
+            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
+                # arabic numbers (potentially with signs and fractions)
+                f = to_fraction(current_without_prefix)
+                assert f is not None
+                if value is not None:
+                    if isinstance(value, str) and value.endswith("."):
+                        # concatenate decimals / ip address components
+                        value = str(value) + str(current)
+                        continue
+                    else:
+                        yield output(value)
+                prefix = current[0] if has_prefix else prefix
+                if f.denominator == 1:
+                    value = f.numerator  # store integers as int
+                else:
+                    value = current_without_prefix
+            elif current not in self.words:
+                # non-numeric words
+                if value is not None:
+                    yield output(value)
+                yield output(current)
+            elif current in self.zeros:
+                value = str(value or "") + "0"
+            elif current in self.ones:
+                ones = self.ones[current]
+                if value is None:
+                    value = ones
+                elif isinstance(value, str) or prev in self.ones:
+                    if (
+                        prev in self.tens and ones < 10
+                    ):  # replace the last zero with the digit
+                        assert value[-1] == "0"
+                        value = value[:-1] + str(ones)
+                    else:
+                        value = str(value) + str(ones)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+            elif current in self.ones_suffixed:
+                # ordinal or cardinal; yield the number right away
+                ones, suffix = self.ones_suffixed[current]
+                if value is None:
+                    yield output(str(ones) + suffix)
+                elif isinstance(value, str) or prev in self.ones:
+                    if prev in self.tens and ones < 10:
+                        assert value[-1] == "0"
+                        yield output(value[:-1] + str(ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                value = None
+            elif current in self.tens:
+                tens = self.tens[current]
+                if value is None:
+                    value = tens
+                elif isinstance(value, str):
+                    value = str(value) + str(tens)
+                else:
+                    if value % 100 == 0:
+                        value += tens
+                    else:
+                        value = str(value) + str(tens)
+            elif current in self.tens_suffixed:
+                # ordinal or cardinal; yield the number right away
+                tens, suffix = self.tens_suffixed[current]
+                if value is None:
+                    yield output(str(tens) + suffix)
+                elif isinstance(value, str):
+                    yield output(str(value) + str(tens) + suffix)
+                else:
+                    if value % 100 == 0:
+                        yield output(str(value + tens) + suffix)
+                    else:
+                        yield output(str(value) + str(tens) + suffix)
+            elif current in self.multipliers:
+                multiplier = self.multipliers[current]
+                if value is None:
+                    value = multiplier
+                elif isinstance(value, str) or value == 0:
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        value = p.numerator
+                    else:
+                        yield output(value)
+                        value = multiplier
+                else:
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+            elif current in self.multipliers_suffixed:
+                multiplier, suffix = self.multipliers_suffixed[current]
+                if value is None:
+                    yield output(str(multiplier) + suffix)
+                elif isinstance(value, str):
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        yield output(str(p.numerator) + suffix)
+                    else:
+                        yield output(value)
+                        yield output(str(multiplier) + suffix)
+                else:  # int
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+                    yield output(str(value) + suffix)
+                value = None
+            elif current in self.preceding_prefixers:
+                # apply prefix (positive, minus, etc.) if it precedes a number
+                if value is not None:
+                    yield output(value)
+                if next in self.words or next_is_numeric:
+                    prefix = self.preceding_prefixers[current]
+                else:
+                    yield output(current)
+            elif current in self.following_prefixers:
+                # apply prefix (dollars, cents, etc.) only after a number
+                if value is not None:
+                    prefix = self.following_prefixers[current]
+                    yield output(value)
+                else:
+                    yield output(current)
+            elif current in self.suffixers:
+                # apply suffix symbols (percent -> '%')
+                if value is not None:
+                    suffix = self.suffixers[current]
+                    if isinstance(suffix, dict):
+                        if next in suffix:
+                            yield output(str(value) + suffix[next])
+                            skip = True
+                        else:
+                            yield output(value)
+                            yield output(current)
+                    else:
+                        yield output(str(value) + suffix)
+                else:
+                    yield output(current)
+            elif current in self.specials:
+                if next not in self.words and not next_is_numeric:
+                    # apply special handling only if the next word can be numeric
+                    if value is not None:
+                        yield output(value)
+                    yield output(current)
+                elif current == "and":
+                    # ignore "and" after hundreds, thousands, etc.
+                    if prev not in self.multipliers:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "double" or current == "triple":
+                    if next in self.ones or next in self.zeros:
+                        repeats = 2 if current == "double" else 3
+                        ones = self.ones.get(next, 0)
+                        value = str(value or "") + str(ones) * repeats
+                        skip = True
+                    else:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "point":
+                    if next in self.decimals or next_is_numeric:
+                        value = str(value or "") + "."
+                else:
+                    # should all have been covered at this point
+                    raise ValueError(f"Unexpected token: {current}")
+            else:
+                # all should have been covered at this point
+                raise ValueError(f"Unexpected token: {current}")
+        if value is not None:
+            yield output(value)
+    def preprocess(self, s: str):
+        # replace "<number> and a half" with "<number> point five"
+        results = []
+        segments = re.split(r"\band\s+a\s+half\b", s)
+        for i, segment in enumerate(segments):
+            if len(segment.strip()) == 0:
+                continue
+            if i == len(segments) - 1:
+                results.append(segment)
+            else:
+                results.append(segment)
+                last_word = segment.rsplit(maxsplit=2)[-1]
+                if last_word in self.decimals or last_word in self.multipliers:
+                    results.append("point five")
+                else:
+                    results.append("and a half")
+        s = " ".join(results)
+        # put a space at number/letter boundary
+        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
+        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
+        # but remove spaces which could be a suffix
+        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
+        return s
+    def postprocess(self, s: str):
+        def combine_cents(m: Match):
+            try:
+                currency = m.group(1)
+                integer = m.group(2)
+                cents = int(m.group(3))
+                return f"{currency}{integer}.{cents:02d}"
+            except ValueError:
+                return m.string
+        def extract_cents(m: Match):
+            try:
+                return f"¢{int(m.group(1))}"
+            except ValueError:
+                return m.string
+        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
+        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
+        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
+        # write "one(s)" instead of "1(s)", just for the readability
+        s = re.sub(r"\b1(s?)\b", r"one\1", s)
+        return s
+    def __call__(self, s: str):
+        s = self.preprocess(s)
+        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
+        s = self.postprocess(s)
+        return s
+class EnglishSpellingNormalizer:
+    """
+    Applies British-American spelling mappings as listed in [1].
+    [1] https://www.tysto.com/uk-us-spelling-list.html
+    """
+    def __init__(self):
+        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
+        self.mapping = json.load(open(mapping_path))
+    def __call__(self, s: str):
+        return " ".join(self.mapping.get(word, word) for word in s.split())
+class EnglishTextNormalizer:
+    def __init__(self):
+        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
+        self.replacers = {
+            # common contractions
+            r"\bwon't\b": "will not",
+            r"\bcan't\b": "can not",
+            r"\blet's\b": "let us",
+            r"\bain't\b": "aint",
+            r"\by'all\b": "you all",
+            r"\bwanna\b": "want to",
+            r"\bgotta\b": "got to",
+            r"\bgonna\b": "going to",
+            r"\bi'ma\b": "i am going to",
+            r"\bimma\b": "i am going to",
+            r"\bwoulda\b": "would have",
+            r"\bcoulda\b": "could have",
+            r"\bshoulda\b": "should have",
+            r"\bma'am\b": "madam",
+            # contractions in titles/prefixes
+            r"\bmr\b": "mister ",
+            r"\bmrs\b": "missus ",
+            r"\bst\b": "saint ",
+            r"\bdr\b": "doctor ",
+            r"\bprof\b": "professor ",
+            r"\bcapt\b": "captain ",
+            r"\bgov\b": "governor ",
+            r"\bald\b": "alderman ",
+            r"\bgen\b": "general ",
+            r"\bsen\b": "senator ",
+            r"\brep\b": "representative ",
+            r"\bpres\b": "president ",
+            r"\brev\b": "reverend ",
+            r"\bhon\b": "honorable ",
+            r"\basst\b": "assistant ",
+            r"\bassoc\b": "associate ",
+            r"\blt\b": "lieutenant ",
+            r"\bcol\b": "colonel ",
+            r"\bjr\b": "junior ",
+            r"\bsr\b": "senior ",
+            r"\besq\b": "esquire ",
+            # prefect tenses, ideally it should be any past participles, but it's harder..
+            r"'d been\b": " had been",
+            r"'s been\b": " has been",
+            r"'d gone\b": " had gone",
+            r"'s gone\b": " has gone",
+            r"'d done\b": " had done",  # "'s done" is ambiguous
+            r"'s got\b": " has got",
+            # general contractions
+            r"n't\b": " not",
+            r"'re\b": " are",
+            r"'s\b": " is",
+            r"'d\b": " would",
+            r"'ll\b": " will",
+            r"'t\b": " not",
+            r"'ve\b": " have",
+            r"'m\b": " am",
+        }
+        self.standardize_numbers = EnglishNumberNormalizer()
+        self.standardize_spellings = EnglishSpellingNormalizer()
+    def __call__(self, s: str):
+        s = s.lower()
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        s = re.sub(self.ignore_patterns, "", s)
+        s = re.sub(r"\s+'", "'", s)  # when there's a space before an apostrophe
+        for pattern, replacement in self.replacers.items():
+            s = re.sub(pattern, replacement, s)
+        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
+        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
+        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep numeric symbols
+        s = self.standardize_numbers(s)
+        s = self.standardize_spellings(s)
+        # now remove prefix/suffix symbols that are not preceded/followed by numbers
+        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
+        s = re.sub(r"([^0-9])%", r"\1 ", s)
+        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space
+        return s

whisper/whisper/timing.py ADDED Viewed

	@@ -0,0 +1,385 @@

+import itertools
+import subprocess
+import warnings
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List
+import numba
+import numpy as np
+import torch
+import torch.nn.functional as F
+from .audio import HOP_LENGTH, SAMPLE_RATE, TOKENS_PER_SECOND
+from .tokenizer import Tokenizer
+if TYPE_CHECKING:
+    from .model import Whisper
+def median_filter(x: torch.Tensor, filter_width: int):
+    """Apply a median filter of width `filter_width` along the last dimension of `x`"""
+    pad_width = filter_width // 2
+    if x.shape[-1] <= pad_width:
+        # F.pad requires the padding width to be smaller than the input dimension
+        return x
+    if (ndim := x.ndim) <= 2:
+        # `F.pad` does not support 1D or 2D inputs for reflect padding but supports 3D and 4D
+        x = x[None, None, :]
+    assert (
+        filter_width > 0 and filter_width % 2 == 1
+    ), "`filter_width` should be an odd number"
+    result = None
+    x = F.pad(x, (filter_width // 2, filter_width // 2, 0, 0), mode="reflect")
+    if x.is_cuda:
+        try:
+            from .triton_ops import median_filter_cuda
+            result = median_filter_cuda(x, filter_width)
+        except (RuntimeError, subprocess.CalledProcessError):
+            warnings.warn(
+                "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
+                "falling back to a slower median kernel implementation..."
+            )
+    if result is None:
+        # sort() is faster than torch.median (https://github.com/pytorch/pytorch/issues/51450)
+        result = x.unfold(-1, filter_width, 1).sort()[0][..., filter_width // 2]
+    if ndim <= 2:
+        result = result[0, 0]
+    return result
+@numba.jit(nopython=True)
+def backtrace(trace: np.ndarray):
+    """
+    Walk a DTW trace matrix from its bottom-right corner back to the origin.
+    `trace[i, j]` encodes the step taken to reach cell (i, j): 0 moves diagonally
+    (both indices decrease), 1 decreases `i` only, 2 decreases `j` only. Any other
+    value raises `ValueError`.
+    Note that `trace` is modified in place: its first row and first column are
+    overwritten before the walk starts.
+    Returns an array with two rows: the visited `i - 1` indices and the visited
+    `j - 1` indices, ordered from the start of the path to its end.
+    """
+    i = trace.shape[0] - 1
+    j = trace.shape[1] - 1
+    # force the borders to lead back to (0, 0): along row 0 only j can decrease,
+    # along column 0 only i can decrease
+    trace[0, :] = 2
+    trace[:, 0] = 1
+    result = []
+    while i > 0 or j > 0:
+        # indices are shifted by one because row/column 0 of `trace` is the border
+        result.append((i - 1, j - 1))
+        if trace[i, j] == 0:
+            i -= 1
+            j -= 1
+        elif trace[i, j] == 1:
+            i -= 1
+        elif trace[i, j] == 2:
+            j -= 1
+        else:
+            raise ValueError("Unexpected trace[i, j]")
+    result = np.array(result)
+    # the path was collected end-to-start; reverse it and return it as two rows
+    return result[::-1, :].T
+@numba.jit(nopython=True, parallel=True)
+def dtw_cpu(x: np.ndarray):
+    """
+    Dynamic time warping over the 2-D cost matrix `x`, compiled with numba.
+    Fills an (N + 1) x (M + 1) accumulated-cost table together with a trace table
+    that records which predecessor was chosen for each cell, then returns
+    `backtrace(trace)`, i.e. the two index rows of the minimum-cost path.
+    """
+    N, M = x.shape
+    # accumulated cost; everything is unreachable (inf) except the origin
+    cost = np.ones((N + 1, M + 1), dtype=np.float32) * np.inf
+    # step codes consumed by `backtrace`; -1 marks cells that were never written
+    trace = -np.ones((N + 1, M + 1), dtype=np.float32)
+    cost[0, 0] = 0
+    for j in range(1, M + 1):
+        for i in range(1, N + 1):
+            c0 = cost[i - 1, j - 1]  # diagonal predecessor
+            c1 = cost[i - 1, j]  # predecessor with i - 1
+            c2 = cost[i, j - 1]  # predecessor with j - 1
+            # strict comparisons: any tie falls through to the last branch (code 2)
+            if c0 < c1 and c0 < c2:
+                c, t = c0, 0
+            elif c1 < c0 and c1 < c2:
+                c, t = c1, 1
+            else:
+                c, t = c2, 2
+            cost[i, j] = x[i - 1, j - 1] + c
+            trace[i, j] = t
+    return backtrace(trace)
+def dtw_cuda(x, BLOCK_SIZE=1024):
+    """
+    Dynamic time warping using the Triton `dtw_kernel`, with the backtrace on the CPU.
+    `x` is a 2-D tensor of shape (M, N); `M` must be smaller than `BLOCK_SIZE`
+    (enforced with an assertion). Returns the result of `backtrace` applied to the
+    trace table produced by the kernel.
+    """
+    from .triton_ops import dtw_kernel
+    M, N = x.shape
+    assert M < BLOCK_SIZE, f"M should be smaller than {BLOCK_SIZE=}"
+    # skew the matrix: each row is padded with M + 1 inf values (row length N + M + 1)
+    # and the flattened data is re-read with row length N + M, which shifts row k
+    # to the right by k columns
+    # NOTE(review): presumably this lines up anti-diagonals for the kernel - confirm
+    # against triton_ops.dtw_kernel
+    x_skew = (
+        F.pad(x, (0, M + 1), value=np.inf).flatten()[: M * (N + M)].reshape(M, N + M)
+    )
+    x_skew = x_skew.T.contiguous()
+    # the cost table is created with default placement and then moved to the GPU
+    cost = torch.ones(N + M + 2, M + 2) * np.inf
+    cost[0, 0] = 0
+    cost = cost.cuda()
+    trace = torch.zeros_like(cost, dtype=torch.int32)
+    # launched with a grid of a single program
+    dtw_kernel[(1,)](
+        cost,
+        trace,
+        x_skew,
+        x_skew.stride(0),
+        cost.stride(0),
+        trace.stride(0),
+        N,
+        M,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    # re-read the transposed trace with a different row length and keep the first
+    # N + 1 columns, giving an (M + 1) x (N + 1) table for `backtrace`
+    # NOTE(review): presumably the inverse of the skew above - confirm
+    trace = trace.T.flatten()[: (M + 1) * (M + N + 3)].reshape(M + 1, M + N + 3)[
+        :, : N + 1
+    ]
+    return backtrace(trace.cpu().numpy())
+def dtw(x: torch.Tensor) -> np.ndarray:
+    if x.is_cuda:
+        try:
+            return dtw_cuda(x)
+        except (RuntimeError, subprocess.CalledProcessError):
+            warnings.warn(
+                "Failed to launch Triton kernels, likely due to missing CUDA toolkit; "
+                "falling back to a slower DTW implementation..."
+            )
+    return dtw_cpu(x.double().cpu().numpy())
+@dataclass
+class WordTiming:
+    """Timing and confidence information for one word, as built by `find_alignment`."""
+    # text of the word as returned by the tokenizer's word splitting
+    word: str
+    # token ids that make up the word
+    tokens: List[int]
+    # start time, computed as a DTW time index divided by TOKENS_PER_SECOND
+    # (presumably seconds - confirm against audio.py)
+    start: float
+    # end time, in the same unit as `start`
+    end: float
+    # mean of the predicted probabilities of the word's tokens
+    probability: float
+def find_alignment(
+    model: "Whisper",
+    tokenizer: Tokenizer,
+    text_tokens: List[int],
+    mel: torch.Tensor,
+    num_frames: int,
+    *,
+    medfilt_width: int = 7,
+    qk_scale: float = 1.0,
+) -> List[WordTiming]:
+    if len(text_tokens) == 0:
+        return []
+    tokens = torch.tensor(
+        [
+            *tokenizer.sot_sequence,
+            tokenizer.no_timestamps,
+            *text_tokens,
+            tokenizer.eot,
+        ]
+    ).to(model.device)
+    # install hooks on the cross attention layers to retrieve the attention weights
+    QKs = [None] * model.dims.n_text_layer
+    hooks = [
+        block.cross_attn.register_forward_hook(
+            lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1][0])
+        )
+        for i, block in enumerate(model.decoder.blocks)
+    ]
+    with torch.no_grad():
+        logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0]
+        sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot]
+        token_probs = sampled_logits.softmax(dim=-1)
+        text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens]
+        text_token_probs = text_token_probs.tolist()
+    for hook in hooks:
+        hook.remove()
+    # heads * tokens * frames
+    weights = torch.stack([QKs[_l][_h] for _l, _h in model.alignment_heads.indices().T])
+    weights = weights[:, :, : num_frames // 2]
+    weights = (weights * qk_scale).softmax(dim=-1)
+    std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
+    weights = (weights - mean) / std
+    weights = median_filter(weights, medfilt_width)
+    matrix = weights.mean(axis=0)
+    matrix = matrix[len(tokenizer.sot_sequence) : -1]
+    text_indices, time_indices = dtw(-matrix)
+    words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
+    if len(word_tokens) <= 1:
+        # return on eot only
+        # >>> np.pad([], (1, 0))
+        # array([0.])
+        # This results in crashes when we lookup jump_times with float, like
+        # IndexError: arrays used as indices must be of integer (or boolean) type
+        return []
+    word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
+    jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
+    jump_times = time_indices[jumps] / TOKENS_PER_SECOND
+    start_times = jump_times[word_boundaries[:-1]]
+    end_times = jump_times[word_boundaries[1:]]
+    word_probabilities = [
+        np.mean(text_token_probs[i:j])
+        for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
+    ]
+    return [
+        WordTiming(word, tokens, start, end, probability)
+        for word, tokens, start, end, probability in zip(
+            words, word_tokens, start_times, end_times, word_probabilities
+        )
+    ]
+def merge_punctuations(alignment: List[WordTiming], prepended: str, appended: str):
+    # merge prepended punctuations
+    i = len(alignment) - 2
+    j = len(alignment) - 1
+    while i >= 0:
+        previous = alignment[i]
+        following = alignment[j]
+        if previous.word.startswith(" ") and previous.word.strip() in prepended:
+            # prepend it to the following word
+            following.word = previous.word + following.word
+            following.tokens = previous.tokens + following.tokens
+            previous.word = ""
+            previous.tokens = []
+        else:
+            j = i
+        i -= 1
+    # merge appended punctuations
+    i = 0
+    j = 1
+    while j < len(alignment):
+        previous = alignment[i]
+        following = alignment[j]
+        if not previous.word.endswith(" ") and following.word in appended:
+            # append it to the previous word
+            previous.word = previous.word + following.word
+            previous.tokens = previous.tokens + following.tokens
+            following.word = ""
+            following.tokens = []
+        else:
+            i = j
+        j += 1
+def add_word_timestamps(
+    *,
+    segments: List[dict],
+    model: "Whisper",
+    tokenizer: Tokenizer,
+    mel: torch.Tensor,
+    num_frames: int,
+    prepend_punctuations: str = "\"'“¿([{-",
+    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
+    last_speech_timestamp: float,
+    **kwargs,
+):
+    """
+    Attach word-level timings to `segments`, modifying the segment dicts in place.
+    Parameters
+    ----------
+    segments: List[dict]
+        Segments to annotate. Each dict is read for "tokens", "start" and "end" (and
+        the first one for "seek"); a "words" list is written to each, and "start" /
+        "end" may be overwritten with word-derived values. An empty list is a no-op.
+    model, tokenizer, mel, num_frames
+        Passed through to `find_alignment`.
+    prepend_punctuations: str
+        Punctuation merged with the following word by `merge_punctuations`.
+    append_punctuations: str
+        Punctuation merged with the preceding word by `merge_punctuations`.
+    last_speech_timestamp: float
+        End time of the previous speech, used to detect a pause before a segment's
+        first word; it is updated locally as segments are processed.
+    kwargs
+        Extra keyword arguments forwarded to `find_alignment`.
+    Returns
+    -------
+    None
+    """
+    if len(segments) == 0:
+        return
+    # keep only ids below eot, i.e. drop eot and every special token above it
+    text_tokens_per_segment = [
+        [token for token in segment["tokens"] if token < tokenizer.eot]
+        for segment in segments
+    ]
+    # all segments are aligned in one pass over the concatenated tokens
+    text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment))
+    alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs)
+    word_durations = np.array([t.end - t.start for t in alignment])
+    # zero-length words are excluded from the median
+    word_durations = word_durations[word_durations.nonzero()]
+    median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
+    max_duration = median_duration * 2
+    # hack: truncate long words at sentence boundaries.
+    # a better segmentation algorithm based on VAD should be able to replace this.
+    if len(word_durations) > 0:
+        sentence_end_marks = ".。!！?？"
+        # ensure words at sentence boundaries are not longer than twice the median word duration.
+        for i in range(1, len(alignment)):
+            if alignment[i].end - alignment[i].start > max_duration:
+                if alignment[i].word in sentence_end_marks:
+                    alignment[i].end = alignment[i].start + max_duration
+                elif alignment[i - 1].word in sentence_end_marks:
+                    alignment[i].start = alignment[i].end - max_duration
+    merge_punctuations(alignment, prepend_punctuations, append_punctuations)
+    # alignment times are relative to the window; shift them by the first segment's seek
+    time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE
+    word_index = 0
+    for segment, text_tokens in zip(segments, text_tokens_per_segment):
+        saved_tokens = 0
+        words = []
+        # consume alignment entries until this segment's text tokens are accounted for
+        while word_index < len(alignment) and saved_tokens < len(text_tokens):
+            timing = alignment[word_index]
+            # entries emptied by merge_punctuations add no word but still count their tokens
+            if timing.word:
+                words.append(
+                    dict(
+                        word=timing.word,
+                        start=round(time_offset + timing.start, 2),
+                        end=round(time_offset + timing.end, 2),
+                        probability=timing.probability,
+                    )
+                )
+            saved_tokens += len(timing.tokens)
+            word_index += 1
+        # hack: truncate long words at segment boundaries.
+        # a better segmentation algorithm based on VAD should be able to replace this.
+        if len(words) > 0:
+            # ensure the first and second word after a pause is not longer than
+            # twice the median word duration.
+            if words[0]["end"] - last_speech_timestamp > median_duration * 4 and (
+                words[0]["end"] - words[0]["start"] > max_duration
+                or (
+                    len(words) > 1
+                    and words[1]["end"] - words[0]["start"] > max_duration * 2
+                )
+            ):
+                if (
+                    len(words) > 1
+                    and words[1]["end"] - words[1]["start"] > max_duration
+                ):
+                    # the second word is too long as well: move the shared boundary
+                    boundary = max(words[1]["end"] / 2, words[1]["end"] - max_duration)
+                    words[0]["end"] = words[1]["start"] = boundary
+                words[0]["start"] = max(0, words[0]["end"] - max_duration)
+            # prefer the segment-level start timestamp if the first word is too long.
+            if (
+                segment["start"] < words[0]["end"]
+                and segment["start"] - 0.5 > words[0]["start"]
+            ):
+                words[0]["start"] = max(
+                    0, min(words[0]["end"] - median_duration, segment["start"])
+                )
+            else:
+                segment["start"] = words[0]["start"]
+            # prefer the segment-level end timestamp if the last word is too long.
+            if (
+                segment["end"] > words[-1]["start"]
+                and segment["end"] + 0.5 < words[-1]["end"]
+            ):
+                words[-1]["end"] = max(
+                    words[-1]["start"] + median_duration, segment["end"]
+                )
+            else:
+                segment["end"] = words[-1]["end"]
+            # the next segment measures its leading pause from this segment's end
+            last_speech_timestamp = segment["end"]
+        segment["words"] = words

whisper/whisper/tokenizer.py ADDED Viewed

	@@ -0,0 +1,386 @@

+import base64
+import os
+import string
+from dataclasses import dataclass, field
+from functools import cached_property, lru_cache
+from typing import Dict, List, Optional, Tuple
+import tiktoken
+# Supported languages, keyed by language code with the lower-case language name as value.
+# NOTE: the insertion order is significant. `get_encoding` assigns the language
+# special-token ids in this order, and `Tokenizer.__post_init__` computes a language
+# token as `sot + 1 + index` of the code in this mapping. New entries must therefore
+# not be inserted in the middle of the table.
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+@dataclass
+class Tokenizer:
+    """A thin wrapper around `tiktoken` providing quick access to special tokens"""
+    encoding: tiktoken.Encoding
+    language: Optional[str] = None
+    task: Optional[str] = None
+    sot_sequence: Tuple[int] = ()
+    special_tokens: Dict[str, int] = field(default_factory=dict)
+    def __post_init__(self):
+        for special in self.encoding.special_tokens_set:
+            special_token = self.encoding.encode_single_token(special)
+            self.special_tokens[special] = special_token
+        sot: int = self.special_tokens["<|startoftranscript|>"]
+        translate: int = self.special_tokens["<|translate|>"]
+        transcribe: int = self.special_tokens["<|transcribe|>"]
+        langs = tuple(LANGUAGES.keys())
+        sot_sequence = [sot]
+        if self.language is not None:
+            sot_sequence.append(sot + 1 + langs.index(self.language))
+        if self.task is not None:
+            task_token: int = transcribe if self.task == "transcribe" else translate
+            sot_sequence.append(task_token)
+        self.sot_sequence = tuple(sot_sequence)
+    def encode(self, text, **kwargs):
+        return self.encoding.encode(text, **kwargs)
+    def decode(self, token_ids: List[int], **kwargs) -> str:
+        token_ids = [t for t in token_ids if t < self.timestamp_begin]
+        return self.encoding.decode(token_ids, **kwargs)
+    def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str:
+        """
+        Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
+        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
+        """
+        return self.encoding.decode(token_ids, **kwargs)
+    @cached_property
+    def eot(self) -> int:
+        return self.encoding.eot_token
+    @cached_property
+    def transcribe(self) -> int:
+        return self.special_tokens["<|transcribe|>"]
+    @cached_property
+    def translate(self) -> int:
+        return self.special_tokens["<|translate|>"]
+    @cached_property
+    def sot(self) -> int:
+        return self.special_tokens["<|startoftranscript|>"]
+    @cached_property
+    def sot_lm(self) -> int:
+        return self.special_tokens["<|startoflm|>"]
+    @cached_property
+    def sot_prev(self) -> int:
+        return self.special_tokens["<|startofprev|>"]
+    @cached_property
+    def no_speech(self) -> int:
+        return self.special_tokens["<|nospeech|>"]
+    @cached_property
+    def no_timestamps(self) -> int:
+        return self.special_tokens["<|notimestamps|>"]
+    @cached_property
+    def timestamp_begin(self) -> int:
+        return self.special_tokens["<|0.00|>"]
+    @cached_property
+    def language_token(self) -> int:
+        """Returns the token id corresponding to the value of the `language` field"""
+        if self.language is None:
+            raise ValueError("This tokenizer does not have language token configured")
+        if token := self.special_tokens.get(f"<|{self.language}|>", None):
+            return token
+        raise KeyError(f"Language {self.language} not found in tokenizer.")
+    @cached_property
+    def all_language_tokens(self) -> Tuple[int]:
+        result = []
+        for token, token_id in self.special_tokens.items():
+            if token.strip("<|>") in LANGUAGES:
+                result.append(token_id)
+        return tuple(result)
+    @cached_property
+    def all_language_codes(self) -> Tuple[str]:
+        return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
+    @cached_property
+    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
+        return tuple(list(self.sot_sequence) + [self.no_timestamps])
+    @cached_property
+    def non_speech_tokens(self) -> Tuple[int]:
+        """
+        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
+        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+        - ♪♪♪
+        - ( SPEAKING FOREIGN LANGUAGE )
+        - [DAVID] Hey there,
+        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
+        """
+        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
+        symbols += (
+            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+        )
+        # symbols that may be a single token or multiple tokens depending on the tokenizer.
+        # In case they're multiple tokens, suppress the first token, which is safe because:
+        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
+        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
+        miscellaneous = set("♩♪♫♬♭♮♯")
+        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
+        # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
+        result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]}
+        for symbol in symbols + list(miscellaneous):
+            for tokens in [
+                self.encoding.encode(symbol),
+                self.encoding.encode(" " + symbol),
+            ]:
+                if len(tokens) == 1 or symbol in miscellaneous:
+                    result.add(tokens[0])
+        return tuple(sorted(result))
+    def split_to_word_tokens(self, tokens: List[int]):
+        if self.language in {"zh", "ja", "th", "lo", "my"}:
+            # These languages don't typically use spaces, so it is difficult to split words
+            # without morpheme analysis. Here, we instead split words at any
+            # position where the tokens are decoded as valid unicode points
+            return self.split_tokens_on_unicode(tokens)
+        return self.split_tokens_on_spaces(tokens)
+    def split_tokens_on_unicode(self, tokens: List[int]):
+        decoded_full = self.decode_with_timestamps(tokens)
+        replacement_char = "\ufffd"
+        words = []
+        word_tokens = []
+        current_tokens = []
+        unicode_offset = 0
+        for token in tokens:
+            current_tokens.append(token)
+            decoded = self.decode_with_timestamps(current_tokens)
+            if (
+                replacement_char not in decoded
+                or decoded_full[unicode_offset + decoded.index(replacement_char)]
+                == replacement_char
+            ):
+                words.append(decoded)
+                word_tokens.append(current_tokens)
+                current_tokens = []
+                unicode_offset += len(decoded)
+        return words, word_tokens
+    def split_tokens_on_spaces(self, tokens: List[int]):
+        subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
+        words = []
+        word_tokens = []
+        for subword, subword_tokens in zip(subwords, subword_tokens_list):
+            special = subword_tokens[0] >= self.eot
+            with_space = subword.startswith(" ")
+            punctuation = subword.strip() in string.punctuation
+            if special or with_space or punctuation or len(words) == 0:
+                words.append(subword)
+                word_tokens.append(subword_tokens)
+            else:
+                words[-1] = words[-1] + subword
+                word_tokens[-1].extend(subword_tokens)
+        return words, word_tokens
+@lru_cache(maxsize=None)
+def get_encoding(name: str = "gpt2"):
+    vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+    ranks = {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in open(vocab_path) if line)
+    }
+    n_vocab = len(ranks)
+    special_tokens = {}
+    specials = [
+        "<|endoftext|>",
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+    ]
+    for token in specials:
+        special_tokens[token] = n_vocab
+        n_vocab += 1
+    return tiktoken.Encoding(
+        name=os.path.basename(vocab_path),
+        explicit_n_vocab=n_vocab,
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        mergeable_ranks=ranks,
+        special_tokens=special_tokens,
+    )
+@lru_cache(maxsize=None)
+def get_tokenizer(
+    multilingual: bool,
+    *,
+    language: Optional[str] = None,
+    task: Optional[str] = None,  # Literal["transcribe", "translate", None]
+) -> Tokenizer:
+    if language is not None:
+        language = language.lower()
+        if language not in LANGUAGES:
+            if language in TO_LANGUAGE_CODE:
+                language = TO_LANGUAGE_CODE[language]
+            else:
+                raise ValueError(f"Unsupported language: {language}")
+    if multilingual:
+        encoding_name = "multilingual"
+        language = language or "en"
+        task = task or "transcribe"
+    else:
+        encoding_name = "gpt2"
+        language = None
+        task = None
+    encoding = get_encoding(name=encoding_name)
+    return Tokenizer(encoding=encoding, language=language, task=task)

whisper/whisper/transcribe.py ADDED Viewed

	@@ -0,0 +1,461 @@

+import argparse
+import os
+import warnings
+from typing import TYPE_CHECKING, Optional, Tuple, Union
+import numpy as np
+import torch
+import tqdm
+from .audio import (
+    FRAMES_PER_SECOND,
+    HOP_LENGTH,
+    N_FRAMES,
+    N_SAMPLES,
+    SAMPLE_RATE,
+    log_mel_spectrogram,
+    pad_or_trim,
+)
+from .decoding import DecodingOptions, DecodingResult
+from .timing import add_word_timestamps
+from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
+from .utils import (
+    exact_div,
+    format_timestamp,
+    get_writer,
+    make_safe,
+    optional_float,
+    optional_int,
+    str2bool,
+)
+if TYPE_CHECKING:
+    from .model import Whisper
+def transcribe(
+    model: "Whisper",
+    audio: Union[str, np.ndarray, torch.Tensor],
+    *,
+    verbose: Optional[bool] = None,
+    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
+    compression_ratio_threshold: Optional[float] = 2.4,
+    logprob_threshold: Optional[float] = -1.0,
+    no_speech_threshold: Optional[float] = 0.6,
+    condition_on_previous_text: bool = True,
+    initial_prompt: Optional[str] = None,
+    word_timestamps: bool = False,
+    prepend_punctuations: str = "\"'“¿([{-",
+    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
+    **decode_options,
+):
+    """
+    Transcribe an audio file using Whisper
+    Parameters
+    ----------
+    model: Whisper
+        The Whisper model instance
+    audio: Union[str, np.ndarray, torch.Tensor]
+        The path to the audio file to open, or the audio waveform
+    verbose: bool
+        Whether to display the text being decoded to the console. If True, displays all the details,
+        If False, displays minimal details. If None, does not display anything
+    temperature: Union[float, Tuple[float, ...]]
+        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
+        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
+    compression_ratio_threshold: float
+        If the gzip compression ratio is above this value, treat as failed
+    logprob_threshold: float
+        If the average log probability over sampled tokens is below this value, treat as failed
+    no_speech_threshold: float
+        If the no_speech probability is higher than this value AND the average log probability
+        over sampled tokens is below `logprob_threshold`, consider the segment as silent
+    condition_on_previous_text: bool
+        if True, the previous output of the model is provided as a prompt for the next window;
+        disabling may make the text inconsistent across windows, but the model becomes less prone to
+        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
+    word_timestamps: bool
+        Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
+        and include the timestamps for each word in each segment.
+    prepend_punctuations: str
+        If word_timestamps is True, merge these punctuation symbols with the next word
+    append_punctuations: str
+        If word_timestamps is True, merge these punctuation symbols with the previous word
+    initial_prompt: Optional[str]
+        Optional text to provide as a prompt for the first window. This can be used to provide, or
+        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
+        to make it more likely to predict those word correctly.
+    decode_options: dict
+        Keyword arguments to construct `DecodingOptions` instances
+    Returns
+    -------
+    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
+    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
+    """
+    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
+    if model.device == torch.device("cpu"):
+        if torch.cuda.is_available():
+            warnings.warn("Performing inference on CPU when CUDA is available")
+        if dtype == torch.float16:
+            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
+            dtype = torch.float32
+    if dtype == torch.float32:
+        decode_options["fp16"] = False
+    # Pad 30-seconds of silence to the input audio, for slicing
+    mel = log_mel_spectrogram(audio, padding=N_SAMPLES)
+    content_frames = mel.shape[-1] - N_FRAMES
+    if decode_options.get("language", None) is None:
+        if not model.is_multilingual:
+            decode_options["language"] = "en"
+        else:
+            if verbose:
+                print(
+                    "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
+                )
+            mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
+            _, probs = model.detect_language(mel_segment)
+            decode_options["language"] = max(probs, key=probs.get)
+            if verbose is not None:
+                print(
+                    f"Detected language: {LANGUAGES[decode_options['language']].title()}"
+                )
+    language: str = decode_options["language"]
+    task: str = decode_options.get("task", "transcribe")
+    tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
+    if word_timestamps and task == "translate":
+        warnings.warn("Word-level timestamps on translations may not be reliable.")
+    def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
+        temperatures = (
+            [temperature] if isinstance(temperature, (int, float)) else temperature
+        )
+        decode_result = None
+        for t in temperatures:
+            kwargs = {**decode_options}
+            if t > 0:
+                # disable beam_size and patience when t > 0
+                kwargs.pop("beam_size", None)
+                kwargs.pop("patience", None)
+            else:
+                # disable best_of when t == 0
+                kwargs.pop("best_of", None)
+            options = DecodingOptions(**kwargs, temperature=t)
+            decode_result = model.decode(segment, options)
+            needs_fallback = False
+            if (
+                compression_ratio_threshold is not None
+                and decode_result.compression_ratio > compression_ratio_threshold
+            ):
+                needs_fallback = True  # too repetitive
+            if (
+                logprob_threshold is not None
+                and decode_result.avg_logprob < logprob_threshold
+            ):
+                needs_fallback = True  # average log probability is too low
+            if (
+                no_speech_threshold is not None
+                and decode_result.no_speech_prob > no_speech_threshold
+            ):
+                needs_fallback = False  # silence
+            if not needs_fallback:
+                break
+        return decode_result
+    seek = 0
+    input_stride = exact_div(
+        N_FRAMES, model.dims.n_audio_ctx
+    )  # mel frames per output token: 2
+    time_precision = (
+        input_stride * HOP_LENGTH / SAMPLE_RATE
+    )  # time per output token: 0.02 (seconds)
+    all_tokens = []
+    all_segments = []
+    prompt_reset_since = 0
+    if initial_prompt is not None:
+        initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
+        all_tokens.extend(initial_prompt_tokens)
+    else:
+        initial_prompt_tokens = []
+    def new_segment(
+        *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
+    ):
+        tokens = tokens.tolist()
+        text_tokens = [token for token in tokens if token < tokenizer.eot]
+        return {
+            "seek": seek,
+            "start": start,
+            "end": end,
+            "text": tokenizer.decode(text_tokens),
+            "tokens": tokens,
+            "temperature": result.temperature,
+            "avg_logprob": result.avg_logprob,
+            "compression_ratio": result.compression_ratio,
+            "no_speech_prob": result.no_speech_prob,
+        }
+    # show the progress bar when verbose is False (if True, transcribed text will be printed)
+    with tqdm.tqdm(
+        total=content_frames, unit="frames", disable=verbose is not False
+    ) as pbar:
+        last_speech_timestamp = 0.0
+        while seek < content_frames:
+            time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
+            mel_segment = mel[:, seek : seek + N_FRAMES]
+            segment_size = min(N_FRAMES, content_frames - seek)
+            segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
+            mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
+            decode_options["prompt"] = all_tokens[prompt_reset_since:]
+            result: DecodingResult = decode_with_fallback(mel_segment)
+            tokens = torch.tensor(result.tokens)
+            if no_speech_threshold is not None:
+                # no voice activity check
+                should_skip = result.no_speech_prob > no_speech_threshold
+                if (
+                    logprob_threshold is not None
+                    and result.avg_logprob > logprob_threshold
+                ):
+                    # don't skip if the logprob is high enough, despite the no_speech_prob
+                    should_skip = False
+                if should_skip:
+                    seek += segment_size  # fast-forward to the next segment boundary
+                    continue
+            previous_seek = seek
+            current_segments = []
+            timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
+            single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
+            consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
+            consecutive.add_(1)
+            if len(consecutive) > 0:
+                # if the output contains two consecutive timestamp tokens
+                slices = consecutive.tolist()
+                if single_timestamp_ending:
+                    slices.append(len(tokens))
+                last_slice = 0
+                for current_slice in slices:
+                    sliced_tokens = tokens[last_slice:current_slice]
+                    start_timestamp_pos = (
+                        sliced_tokens[0].item() - tokenizer.timestamp_begin
+                    )
+                    end_timestamp_pos = (
+                        sliced_tokens[-1].item() - tokenizer.timestamp_begin
+                    )
+                    current_segments.append(
+                        new_segment(
+                            start=time_offset + start_timestamp_pos * time_precision,
+                            end=time_offset + end_timestamp_pos * time_precision,
+                            tokens=sliced_tokens,
+                            result=result,
+                        )
+                    )
+                    last_slice = current_slice
+                if single_timestamp_ending:
+                    # single timestamp at the end means no speech after the last timestamp.
+                    seek += segment_size
+                else:
+                    # otherwise, ignore the unfinished segment and seek to the last timestamp
+                    last_timestamp_pos = (
+                        tokens[last_slice - 1].item() - tokenizer.timestamp_begin
+                    )
+                    seek += last_timestamp_pos * input_stride
+            else:
+                duration = segment_duration
+                timestamps = tokens[timestamp_tokens.nonzero().flatten()]
+                if (
+                    len(timestamps) > 0
+                    and timestamps[-1].item() != tokenizer.timestamp_begin
+                ):
+                    # no consecutive timestamps but it has a timestamp; use the last one.
+                    last_timestamp_pos = (
+                        timestamps[-1].item() - tokenizer.timestamp_begin
+                    )
+                    duration = last_timestamp_pos * time_precision
+                current_segments.append(
+                    new_segment(
+                        start=time_offset,
+                        end=time_offset + duration,
+                        tokens=tokens,
+                        result=result,
+                    )
+                )
+                seek += segment_size
+            if word_timestamps:
+                add_word_timestamps(
+                    segments=current_segments,
+                    model=model,
+                    tokenizer=tokenizer,
+                    mel=mel_segment,
+                    num_frames=segment_size,
+                    prepend_punctuations=prepend_punctuations,
+                    append_punctuations=append_punctuations,
+                    last_speech_timestamp=last_speech_timestamp,
+                )
+                word_end_timestamps = [
+                    w["end"] for s in current_segments for w in s["words"]
+                ]
+                if len(word_end_timestamps) > 0:
+                    last_speech_timestamp = word_end_timestamps[-1]
+                if not single_timestamp_ending and len(word_end_timestamps) > 0:
+                    seek_shift = round(
+                        (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
+                    )
+                    if seek_shift > 0:
+                        seek = previous_seek + seek_shift
+            if verbose:
+                for segment in current_segments:
+                    start, end, text = segment["start"], segment["end"], segment["text"]
+                    line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
+                    print(make_safe(line))
+            # if a segment is instantaneous or does not contain text, clear it
+            for i, segment in enumerate(current_segments):
+                if segment["start"] == segment["end"] or segment["text"].strip() == "":
+                    segment["text"] = ""
+                    segment["tokens"] = []
+                    segment["words"] = []
+            all_segments.extend(
+                [
+                    {"id": i, **segment}
+                    for i, segment in enumerate(
+                        current_segments, start=len(all_segments)
+                    )
+                ]
+            )
+            all_tokens.extend(
+                [token for segment in current_segments for token in segment["tokens"]]
+            )
+            if not condition_on_previous_text or result.temperature > 0.5:
+                # do not feed the prompt tokens if a high temperature was used
+                prompt_reset_since = len(all_tokens)
+            # update progress bar
+            pbar.update(min(content_frames, seek) - previous_seek)
+    return dict(
+        text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
+        segments=all_segments,
+        language=language,
+    )
+def cli():
+    from . import available_models
+    # fmt: off
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
+    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
+    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
+    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
+    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
+    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
+    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
+    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
+    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
+    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
+    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
+    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
+    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
+    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
+    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
+    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
+    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
+    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
+    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
+    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
+    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
+    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
+    parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
+    parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
+    parser.add_argument("--append_punctuations", type=str, default="\"\'.。,，!！?？:：”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
+    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
+    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
+    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
+    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
+    # fmt: on
+    args = parser.parse_args().__dict__
+    model_name: str = args.pop("model")
+    model_dir: str = args.pop("model_dir")
+    output_dir: str = args.pop("output_dir")
+    output_format: str = args.pop("output_format")
+    device: str = args.pop("device")
+    os.makedirs(output_dir, exist_ok=True)
+    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
+        if args["language"] is not None:
+            warnings.warn(
+                f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
+            )
+        args["language"] = "en"
+    temperature = args.pop("temperature")
+    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
+        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
+    else:
+        temperature = [temperature]
+    if (threads := args.pop("threads")) > 0:
+        torch.set_num_threads(threads)
+    from . import load_model
+    model = load_model(model_name, device=device, download_root=model_dir)
+    writer = get_writer(output_format, output_dir)
+    word_options = ["highlight_words", "max_line_count", "max_line_width"]
+    if not args["word_timestamps"]:
+        for option in word_options:
+            if args[option]:
+                parser.error(f"--{option} requires --word_timestamps True")
+    if args["max_line_count"] and not args["max_line_width"]:
+        warnings.warn("--max_line_count has no effect without --max_line_width")
+    writer_args = {arg: args.pop(arg) for arg in word_options}
+    for audio_path in args.pop("audio"):
+        result = transcribe(model, audio_path, temperature=temperature, **args)
+        writer(result, audio_path, writer_args)
+if __name__ == "__main__":  # run the command-line interface only when this file is executed as a script
+    cli()

whisper/whisper/triton_ops.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from functools import lru_cache
+import numpy as np
+import torch
+try:
+    import triton
+    import triton.language as tl
+except ImportError:
+    raise RuntimeError("triton import failed; try `pip install --pre triton`")
+@triton.jit
+def dtw_kernel(
+    cost, trace, x, x_stride, cost_stride, trace_stride, N, M, BLOCK_SIZE: tl.constexpr
+):  # fills the `cost` and `trace` buffers row by row from `x`; NOTE(review): rows look like anti-diagonals (k = i + j) of a skewed N x M problem laid out by the caller - confirm
+    offsets = tl.arange(0, BLOCK_SIZE)  # lane indices 0 .. BLOCK_SIZE - 1
+    mask = offsets < M  # only the first M lanes load from and store to the buffers
+    for k in range(1, N + M + 1):  # k = i + j
+        tl.debug_barrier()  # presumably synchronises lanes so that row k, stored by the previous iteration, is visible before it is read below
+        p0 = cost + (k - 1) * cost_stride  # row k - 1, same column
+        p1 = cost + k * cost_stride  # row k, same column
+        p2 = cost + k * cost_stride + 1  # row k, next column
+        c0 = tl.load(p0 + offsets, mask=mask)
+        c1 = tl.load(p1 + offsets, mask=mask)
+        c2 = tl.load(p2 + offsets, mask=mask)
+        x_row = tl.load(x + (k - 1) * x_stride + offsets, mask=mask, other=0)  # masked-off lanes read 0
+        cost_row = x_row + tl.minimum(tl.minimum(c0, c1), c2)  # local value plus the cheapest of the three loaded predecessors
+        cost_ptr = cost + (k + 1) * cost_stride + 1  # destination: row k + 1, shifted by one column
+        tl.store(cost_ptr + offsets, cost_row, mask=mask)
+        trace_ptr = trace + (k + 1) * trace_stride + 1  # matching destination in `trace`
+        tl.store(trace_ptr + offsets, 2, mask=mask & (c2 <= c0) & (c2 <= c1))  # record which predecessor was minimal (2 -> c2)
+        tl.store(trace_ptr + offsets, 1, mask=mask & (c1 <= c0) & (c1 <= c2))  # 1 -> c1; overwrites a 2 written above on ties
+        tl.store(trace_ptr + offsets, 0, mask=mask & (c0 <= c1) & (c0 <= c2))  # 0 -> c0; stored last, so it wins every tie
+@lru_cache(maxsize=None)  # the generated kernel is cached per distinct filter_width
+def median_kernel(filter_width: int):  # returns a Triton kernel specialised for `filter_width`, produced by rewriting the source text of the template below
+    @triton.jit
+    def kernel(
+        y, x, x_stride, y_stride, BLOCK_SIZE: tl.constexpr
+    ):  # x.shape[-1] == filter_width
+        row_idx = tl.program_id(0)
+        offsets = tl.arange(0, BLOCK_SIZE)
+        mask = offsets < y_stride
+        x_ptr = x + row_idx * x_stride  # noqa: F841
+        y_ptr = y + row_idx * y_stride
+        LOAD_ALL_ROWS_HERE  # noqa: F821
+        BUBBLESORT_HERE  # noqa: F821
+        tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask)  # noqa: F821
+    kernel = triton.JITFunction(kernel.fn)  # wrap kernel.fn in a new JITFunction whose `src` text is edited below
+    kernel.src = kernel.src.replace(  # placeholder 1: one masked load per window offset, defining row0 .. row{filter_width - 1}
+        "    LOAD_ALL_ROWS_HERE",
+        "\n".join(
+            [
+                f"    row{i} = tl.load(x_ptr + offsets + {i}, mask=mask)"
+                for i in range(filter_width)
+            ]
+        ),
+    )
+    kernel.src = kernel.src.replace(  # placeholder 2: fully unrolled compare-and-swap passes of a bubble sort over the rows
+        "    BUBBLESORT_HERE",
+        "\n\n".join(
+            [
+                "\n\n".join(
+                    [
+                        "\n".join(
+                            [
+                                f"    smaller = tl.where(row{j} < row{j + 1}, row{j}, row{j + 1})",
+                                f"    larger = tl.where(row{j} > row{j + 1}, row{j}, row{j + 1})",
+                                f"    row{j} = smaller",
+                                f"    row{j + 1} = larger",
+                            ]
+                        )
+                        for j in range(filter_width - i - 1)  # pass i compares adjacent pairs among the first filter_width - i rows
+                    ]
+                )
+                for i in range(filter_width // 2 + 1)  # only as many passes as are needed to settle row{filter_width // 2}, the row that gets stored
+            ]
+        ),
+    )
+    kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")  # placeholder 3: the row written to y
+    return kernel
+def median_filter_cuda(x: torch.Tensor, filter_width: int):
+    """Apply a median filter of given width along the last dimension of x"""
+    slices = x.contiguous().unfold(-1, filter_width, 1)
+    grid = np.prod(slices.shape[:-2])
+    kernel = median_kernel(filter_width)
+    y = torch.empty_like(slices[..., 0])
+    BLOCK_SIZE = 1 << (y.stride(-2) - 1).bit_length()
+    kernel[(grid,)](y, x, x.stride(-2), y.stride(-2), BLOCK_SIZE=BLOCK_SIZE)
+    return y

whisper/whisper/utils.py ADDED Viewed

	@@ -0,0 +1,258 @@

+import json
+import os
+import re
+import sys
+import zlib
+from typing import Callable, Optional, TextIO
+system_encoding = sys.getdefaultencoding()
+if system_encoding != "utf-8":
+    def make_safe(string):
+        # replaces any character not representable using the system default encoding with an '?',
+        # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
+        return string.encode(system_encoding, errors="replace").decode(system_encoding)
+else:
+    def make_safe(string):
+        # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
+        return string
+def exact_div(x, y):
+    assert x % y == 0
+    return x // y
+def str2bool(string):
+    str2val = {"True": True, "False": False}
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
+def optional_int(string):
+    return None if string == "None" else int(string)
+def optional_float(string):
+    return None if string == "None" else float(string)
+def compression_ratio(text) -> float:
+    text_bytes = text.encode("utf-8")
+    return len(text_bytes) / len(zlib.compress(text_bytes))
+def format_timestamp(
+    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
+):
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return (
+        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
+    )
+class ResultWriter:
+    extension: str
+    def __init__(self, output_dir: str):
+        self.output_dir = output_dir
+    def __call__(self, result: dict, audio_path: str, options: dict):
+        audio_basename = os.path.basename(audio_path)
+        audio_basename = os.path.splitext(audio_basename)[0]
+        output_path = os.path.join(
+            self.output_dir, audio_basename + "." + self.extension
+        )
+        with open(output_path, "w", encoding="utf-8") as f:
+            self.write_result(result, file=f, options=options)
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        raise NotImplementedError
+class WriteTXT(ResultWriter):
+    extension: str = "txt"
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        for segment in result["segments"]:
+            print(segment["text"].strip(), file=file, flush=True)
+class SubtitlesWriter(ResultWriter):
+    always_include_hours: bool
+    decimal_marker: str
+    def iterate_result(self, result: dict, options: dict):
+        raw_max_line_width: Optional[int] = options["max_line_width"]
+        max_line_count: Optional[int] = options["max_line_count"]
+        highlight_words: bool = options["highlight_words"]
+        max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
+        preserve_segments = max_line_count is None or raw_max_line_width is None
+        def iterate_subtitles():
+            line_len = 0
+            line_count = 1
+            # the next subtitle to yield (a list of word timings with whitespace)
+            subtitle: list[dict] = []
+            last = result["segments"][0]["words"][0]["start"]
+            for segment in result["segments"]:
+                for i, original_timing in enumerate(segment["words"]):
+                    timing = original_timing.copy()
+                    long_pause = not preserve_segments and timing["start"] - last > 3.0
+                    has_room = line_len + len(timing["word"]) <= max_line_width
+                    seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
+                    if line_len > 0 and has_room and not long_pause and not seg_break:
+                        # line continuation
+                        line_len += len(timing["word"])
+                    else:
+                        # new line
+                        timing["word"] = timing["word"].strip()
+                        if (
+                            len(subtitle) > 0
+                            and max_line_count is not None
+                            and (long_pause or line_count >= max_line_count)
+                            or seg_break
+                        ):
+                            # subtitle break
+                            yield subtitle
+                            subtitle = []
+                            line_count = 1
+                        elif line_len > 0:
+                            # line break
+                            line_count += 1
+                            timing["word"] = "\n" + timing["word"]
+                        line_len = len(timing["word"].strip())
+                    subtitle.append(timing)
+                    last = timing["start"]
+            if len(subtitle) > 0:
+                yield subtitle
+        if "words" in result["segments"][0]:
+            for subtitle in iterate_subtitles():
+                subtitle_start = self.format_timestamp(subtitle[0]["start"])
+                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
+                subtitle_text = "".join([word["word"] for word in subtitle])
+                if highlight_words:
+                    last = subtitle_start
+                    all_words = [timing["word"] for timing in subtitle]
+                    for i, this_word in enumerate(subtitle):
+                        start = self.format_timestamp(this_word["start"])
+                        end = self.format_timestamp(this_word["end"])
+                        if last != start:
+                            yield last, start, subtitle_text
+                        yield start, end, "".join(
+                            [
+                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                if j == i
+                                else word
+                                for j, word in enumerate(all_words)
+                            ]
+                        )
+                        last = end
+                else:
+                    yield subtitle_start, subtitle_end, subtitle_text
+        else:
+            for segment in result["segments"]:
+                segment_start = self.format_timestamp(segment["start"])
+                segment_end = self.format_timestamp(segment["end"])
+                segment_text = segment["text"].strip().replace("-->", "->")
+                yield segment_start, segment_end, segment_text
+    def format_timestamp(self, seconds: float):
+        return format_timestamp(
+            seconds=seconds,
+            always_include_hours=self.always_include_hours,
+            decimal_marker=self.decimal_marker,
+        )
+class WriteVTT(SubtitlesWriter):
+    extension: str = "vtt"
+    always_include_hours: bool = False
+    decimal_marker: str = "."
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        print("WEBVTT\n", file=file)
+        for start, end, text in self.iterate_result(result, options):
+            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
+class WriteSRT(SubtitlesWriter):
+    extension: str = "srt"
+    always_include_hours: bool = True
+    decimal_marker: str = ","
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        for i, (start, end, text) in enumerate(
+            self.iterate_result(result, options), start=1
+        ):
+            print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
+class WriteTSV(ResultWriter):
+    """
+    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
+    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
+    Using integer milliseconds as start and end times means there's no chance of interference from
+    an environment setting a language encoding that causes the decimal in a floating point number
+    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
+    """
+    extension: str = "tsv"
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        print("start", "end", "text", sep="\t", file=file)
+        for segment in result["segments"]:
+            print(round(1000 * segment["start"]), file=file, end="\t")
+            print(round(1000 * segment["end"]), file=file, end="\t")
+            print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
+class WriteJSON(ResultWriter):
+    extension: str = "json"
+    def write_result(self, result: dict, file: TextIO, options: dict):
+        json.dump(result, file)
+def get_writer(
+    output_format: str, output_dir: str
+) -> Callable[[dict, TextIO, dict], None]:
+    writers = {
+        "txt": WriteTXT,
+        "vtt": WriteVTT,
+        "srt": WriteSRT,
+        "tsv": WriteTSV,
+        "json": WriteJSON,
+    }
+    if output_format == "all":
+        all_writers = [writer(output_dir) for writer in writers.values()]
+        def write_all(result: dict, file: TextIO, options: dict):
+            for writer in all_writers:
+                writer(result, file, options)
+        return write_all
+    return writers[output_format](output_dir)

whisper/whisper/version.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "20230918"