patrickvonplaten
/

whisper-32-2-orig-ckpt

Model card Files Files and versions Community

3c5a93a about 1 year ago

7.33 kB

	#!/usr/bin/env python
	"""Converts a Whisper model in Hugging Face format to OpenAI format.
	This script is based on the following script to do the opposite:
	https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/convert_openai_to_hf.py
	Requirements:
	```bash
	pip install -U openai-whisper
	```
	Example:
	```bash
	# Converts the model from Hugging Face to OpenAI format:
	python convert_hf_to_openai.py \
	--checkpoint openai/whisper-tiny \
	--whisper_dump_path whisper-tiny-openai.pt
	```
	```python
	>>> # Disabled doctest because it requires the openai-whisper package.
	>> import whisper
	>> from transformers.models.whisper.convert_hf_to_openai import convert_tfms_to_openai_whisper
	>> # Converts the model from Hugging Face to OpenAI format:
	>> convert_tfms_to_openai_whisper(
	.. "openai/whisper-tiny", "whisper-tiny-openai.pt"
	.. )
	HF model path: openai/whisper-tiny
	OpenAI model path: whisper-tiny-openai.pt
	>> # Select an audio file:
	>> audio_path = "https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav"
	>> # Load the Whisper model in OpenAI format:
	>> model = whisper.load_model("whisper-tiny-openai.pt")
	>> # Transcribe the audio:
	>> prediction = model.transcribe(audio_path)
	>> prediction["text"][:70]
	' chapter 16. I might have told you of the beginning of this liaison in'
	```
	"""
	# Copyright 2023 Xabier de Zuazo and the Aholab team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse

	import torch
	from torch import nn

	from transformers import WhisperConfig, WhisperForConditionalGeneration


	# Create the reverse mapping adapting it from the original `WHISPER_MAPPING` in
	# the `convert_openai_to_hf.py` script:
	#
	# Each key is a substring of a Hugging Face parameter name and each value is the
	# OpenAI Whisper substring that replaces it. `reverse_rename_keys` walks the
	# entries in insertion order and applies every entry whose key occurs in the
	# parameter name, so a single name may be rewritten by several entries.
	REVERSE_WHISPER_MAPPING = {
	# Layer containers and feed-forward sub-layers.
	"layers": "blocks",
	"fc1": "mlp.0",
	"fc2": "mlp.2",
	"final_layer_norm": "mlp_ln",
	# Self-attention projections and their layer norm.
	".self_attn.q_proj": ".attn.query",
	".self_attn.k_proj": ".attn.key",
	".self_attn.v_proj": ".attn.value",
	".self_attn_layer_norm": ".attn_ln",
	".self_attn.out_proj": ".attn.out",
	# `encoder_attn` entries are renamed to their `cross_attn` counterparts.
	".encoder_attn.q_proj": ".cross_attn.query",
	".encoder_attn.k_proj": ".cross_attn.key",
	".encoder_attn.v_proj": ".cross_attn.value",
	".encoder_attn_layer_norm": ".cross_attn_ln",
	".encoder_attn.out_proj": ".cross_attn.out",
	# Top-level layer norms of the decoder and encoder (note the trailing dots,
	# which keep these from matching the per-layer `*_layer_norm` names above).
	"decoder.layer_norm.": "decoder.ln.",
	"encoder.layer_norm.": "encoder.ln_post.",
	# Token and positional embeddings.
	"embed_tokens": "token_embedding",
	"encoder.embed_positions.weight": "encoder.positional_embedding",
	"decoder.embed_positions.weight": "decoder.positional_embedding",
	}


	def reverse_rename_keys(s_dict: dict, mapping: "dict | None" = None) -> dict:
	    """Renames the keys back from Hugging Face to OpenAI Whisper format.

	    By using this function on an HF model's state_dict, we should get the names in the format expected by Whisper.
	    The dictionary is modified in place: every entry is popped and re-inserted under its new name, so the
	    returned object is the very same dictionary that was passed in.

	    Args:
	        s_dict (`dict`): A dictionary with keys in Hugging Face format.
	        mapping (`dict`, *optional*):
	            Substring replacements to apply, as `{hf_substring: openai_substring}`. Entries are applied in
	            insertion order. Defaults to `REVERSE_WHISPER_MAPPING`.

	    Returns:
	        `dict`: The same dictionary but in OpenAI Whisper format.
	    """
	    if mapping is None:
	        mapping = REVERSE_WHISPER_MAPPING

	    # Snapshot the keys first: the dictionary is mutated while we walk over it.
	    for orig_key in list(s_dict):
	        new_key = orig_key
	        for hf_part, openai_part in mapping.items():
	            # Test against the untouched HF key so that text introduced by an
	            # earlier replacement can never trigger a later rule.
	            if hf_part in orig_key:
	                new_key = new_key.replace(hf_part, openai_part)

	        s_dict[new_key] = s_dict.pop(orig_key)
	    return s_dict


	def make_emb_from_linear(linear: nn.Linear) -> nn.Embedding:
	    """Converts a linear layer's weights into an embedding layer.

	    The weight of an `nn.Linear` has shape `(out_features, in_features)`. Its first dimension (`out_features`) is
	    therefore used as the vocabulary size and its second dimension (`in_features`) as the embedding size.

	    Args:
	        linear (`nn.Linear`): The linear layer to be converted.

	    Returns:
	        `nn.Embedding`:
	            An embedding layer with weights set to those of the input linear layer. The weight tensor is handed
	            over as-is; no explicit copy is made here.
	    """
	    vocab_size, emb_size = linear.weight.data.shape
	    return nn.Embedding(vocab_size, emb_size, _weight=linear.weight.data)


	def extract_dims_from_hf(config: WhisperConfig) -> dict:
	    """Extracts necessary dimensions from Hugging Face's WhisperConfig.

	    Reads the size-related attributes of the Hugging Face configuration and returns them under the key names used
	    by the `dims` entry of an OpenAI Whisper checkpoint. Both `n_audio_state` and `n_text_state` are taken from
	    `config.d_model`.

	    Args:
	        config (`WhisperConfig`): Configuration of the Hugging Face's model.

	    Returns:
	        `dict`: The `dims` of the OpenAI Whisper model.
	    """
	    return {
	        "n_vocab": config.vocab_size,
	        "n_mels": config.num_mel_bins,
	        "n_audio_state": config.d_model,
	        "n_text_ctx": config.max_target_positions,
	        "n_audio_layer": config.encoder_layers,
	        "n_audio_head": config.encoder_attention_heads,
	        "n_text_layer": config.decoder_layers,
	        "n_text_head": config.decoder_attention_heads,
	        "n_text_state": config.d_model,
	        "n_audio_ctx": config.max_source_positions,
	    }


	def convert_tfms_to_openai_whisper(hf_model_path: str, whisper_dump_path: str):
	    """Converts a Whisper model from the Hugging Face to the OpenAI format.

	    Takes in the path to a Hugging Face Whisper model, extracts its state_dict, renames keys as needed, and then
	    saves the model in OpenAI's format: a dictionary with a `dims` entry and a `model_state_dict` entry, written
	    with `torch.save`.

	    Args:
	        hf_model_path (`str`):
	            Path to the pretrained Whisper model in Hugging Face format.
	        whisper_dump_path (`str`):
	            Destination path where the converted model in Whisper/OpenAI format will be saved.

	    Returns:
	        `None`
	    """
	    print("HF model path:", hf_model_path)
	    print("OpenAI model path:", whisper_dump_path)

	    # Load the HF model; its config provides the `dims` metadata.
	    model = WhisperForConditionalGeneration.from_pretrained(hf_model_path)
	    dims = extract_dims_from_hf(model.config)

	    # Rename the HF parameter names to their OpenAI counterparts.
	    state_dict = reverse_rename_keys(model.state_dict())

	    # Drop the HF output projection; it is not part of the state dict written
	    # below. `pop` with a default keeps this a no-op if the entry is absent.
	    state_dict.pop("proj_out.weight", None)

	    # Strip the leading "model." wrapper prefix. Only a true prefix is removed,
	    # never an occurrence of "model." further inside a name.
	    prefix = "model."
	    state_dict = {
	        (key[len(prefix):] if key.startswith(prefix) else key): tensor
	        for key, tensor in state_dict.items()
	    }

	    # Save in Whisper's format
	    whisper_checkpoint = {"dims": dims, "model_state_dict": state_dict}
	    torch.save(whisper_checkpoint, whisper_dump_path)


	if __name__ == "__main__":
	    # Command-line entry point: parse the two paths and run the conversion.
	    parser = argparse.ArgumentParser()
	    # Required parameters. They are marked `required=True` so that a missing
	    # argument is reported by argparse instead of passing `None` downstream.
	    parser.add_argument(
	        "--checkpoint",
	        type=str,
	        required=True,
	        help="Path or name of the Hugging Face checkpoint.",
	    )
	    parser.add_argument(
	        "--whisper_dump_path",
	        type=str,
	        required=True,
	        help="Path to the output Whisper model.",
	    )
	    args = parser.parse_args()

	    convert_tfms_to_openai_whisper(args.checkpoint, args.whisper_dump_path)