# fish-speech-1/tools/llama/quantize.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import time
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
from fish_speech.models.text2semantic.llama import ModelArgs, Transformer, find_multiple
##### Quantization Primitives ######
def dynamically_quantize_per_channel(x, quant_min, quant_max, target_dtype):
# assumes symmetric quantization
# assumes axis == 0
# assumes dense memory format
# TODO(future): relax ^ as needed
# default setup for affine quantization of activations
eps = torch.finfo(torch.float32).eps
# get min and max
min_val, max_val = torch.aminmax(x, dim=1)
# calculate scales and zero_points based on min and max
# reference: https://fburl.com/code/srbiybme
min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
device = min_val_neg.device
# reference: https://fburl.com/code/4wll53rk
max_val_pos = torch.max(-min_val_neg, max_val_pos)
scales = max_val_pos / (float(quant_max - quant_min) / 2)
# ensure scales is the same dtype as the original tensor
scales = torch.clamp(scales, min=eps).to(x.dtype)
zero_points = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
# quantize based on qmin/qmax/scales/zp
# reference: https://www.internalfb.com/code/fbsource/[8edc275012b1]/fbcode/caffe2/torch/ao/quantization/fx/_decomposed.py?lines=63
x_div = x / scales.unsqueeze(-1)
x_round = torch.round(x_div)
x_zp = x_round + zero_points.unsqueeze(-1)
quant = torch.clamp(x_zp, quant_min, quant_max).to(target_dtype)
return quant, scales, zero_points
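
# Illustrative usage (a sketch, not executed at import time) of symmetric int8
# per-channel quantization of a toy weight matrix:
#
#   w = torch.randn(4, 16)
#   q, scales, zero_points = dynamically_quantize_per_channel(w, -128, 127, torch.int8)
#   # zero_points are all zero (symmetric); dequantize with a per-row scale
#   w_hat = q.to(torch.float32) * scales.unsqueeze(-1)
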
def get_group_qparams(w, n_bit=4, groupsize=128):
# needed for GPTQ with padding
if groupsize > w.shape[-1]:
groupsize = w.shape[-1]
assert groupsize > 1
assert w.shape[-1] % groupsize == 0
assert w.dim() == 2
to_quant = w.reshape(-1, groupsize)
assert torch.isnan(to_quant).sum() == 0
max_val = to_quant.amax(dim=1, keepdim=True)
min_val = to_quant.amin(dim=1, keepdim=True)
max_int = 2**n_bit - 1
scales = (max_val - min_val).clamp(min=1e-6) / max_int
zeros = min_val + scales * (2 ** (n_bit - 1))
return scales.to(torch.bfloat16).reshape(w.shape[0], -1), zeros.to(
torch.bfloat16
).reshape(w.shape[0], -1)
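
# Shape sketch (example sizes assumed): for a weight of shape (8, 256) with
# groupsize=128 there are 256 // 128 = 2 groups per row, so both returned
# tensors (scales and zeros) have shape (8, 2) and dtype torch.bfloat16.
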
def pack_scales_and_zeros(scales, zeros):
assert scales.shape == zeros.shape
assert scales.dtype == torch.bfloat16
assert zeros.dtype == torch.bfloat16
return (
torch.cat(
[
scales.reshape(scales.size(0), scales.size(1), 1),
zeros.reshape(zeros.size(0), zeros.size(1), 1),
],
2,
)
.transpose(0, 1)
.contiguous()
)
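
# Packing sketch: per-group scales and zeros of shape (out_features, n_groups)
# are stacked and transposed into a single tensor of shape (n_groups,
# out_features, 2), the layout this file later feeds to
# torch.ops.aten._weight_int4pack_mm.
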
def unpack_scales_and_zeros(scales_and_zeros):
assert len(scales_and_zeros.shape) == 3 and scales_and_zeros.shape[2] == 2
assert scales_and_zeros.dtype == torch.float
return torch.split(scales_and_zeros.transpose(0, 1), 1, 2)
def group_quantize_tensor_from_qparams(w, scales, zeros, n_bit=4, groupsize=128):
assert groupsize > 1
# needed for GPTQ single column quantize
if groupsize > w.shape[-1] and scales.shape[-1] == 1:
groupsize = w.shape[-1]
assert w.shape[-1] % groupsize == 0
assert w.dim() == 2
to_quant = w.reshape(-1, groupsize)
assert torch.isnan(to_quant).sum() == 0
scales = scales.reshape(-1, 1)
zeros = zeros.reshape(-1, 1)
min_val = zeros - scales * (2 ** (n_bit - 1))
max_int = 2**n_bit - 1
min_int = 0
w_int32 = (
to_quant.sub(min_val)
.div(scales)
.round()
.clamp_(min_int, max_int)
.to(torch.int32)
.reshape_as(w)
)
return w_int32
def group_quantize_tensor(w, n_bit=4, groupsize=128):
scales, zeros = get_group_qparams(w, n_bit, groupsize)
w_int32 = group_quantize_tensor_from_qparams(w, scales, zeros, n_bit, groupsize)
scales_and_zeros = pack_scales_and_zeros(scales, zeros)
return w_int32, scales_and_zeros
def group_dequantize_tensor_from_qparams(
w_int32, scales, zeros, n_bit=4, groupsize=128
):
assert groupsize > 1
# needed for GPTQ single column dequantize
if groupsize > w_int32.shape[-1] and scales.shape[-1] == 1:
groupsize = w_int32.shape[-1]
assert w_int32.shape[-1] % groupsize == 0
assert w_int32.dim() == 2
w_int32_grouped = w_int32.reshape(-1, groupsize)
scales = scales.reshape(-1, 1)
zeros = zeros.reshape(-1, 1)
w_dq = (
w_int32_grouped.sub(2 ** (n_bit - 1)).mul(scales).add(zeros).reshape_as(w_int32)
)
return w_dq
def group_dequantize_tensor(w_int32, scales_and_zeros, n_bit=4, groupsize=128):
scales, zeros = unpack_scales_and_zeros(scales_and_zeros)
return group_dequantize_tensor_from_qparams(
w_int32, scales, zeros, n_bit, groupsize
)
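
# Round-trip sketch (illustrative, not executed): quantize a weight into 4-bit
# groups and reconstruct an approximation. Note that unpack_scales_and_zeros
# asserts float32 while pack_scales_and_zeros emits bfloat16, so the packed
# tensor is cast before dequantizing:
#
#   w = torch.randn(8, 256)
#   w_int32, scales_and_zeros = group_quantize_tensor(w, n_bit=4, groupsize=128)
#   w_hat = group_dequantize_tensor(
#       w_int32, scales_and_zeros.float(), n_bit=4, groupsize=128
#   )
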
class QuantHandler:
def __init__(self, mod):
self.mod = mod
def create_quantized_state_dict(self) -> "StateDict":
pass
def convert_for_runtime(self) -> "nn.Module":
pass
##### Weight-only int8 per-channel quantized code ######
def replace_linear_weight_only_int8_per_channel(module):
for name, child in module.named_children():
if isinstance(child, nn.Linear):
setattr(
module,
name,
WeightOnlyInt8Linear(child.in_features, child.out_features),
)
else:
replace_linear_weight_only_int8_per_channel(child)
class WeightOnlyInt8QuantHandler:
def __init__(self, mod):
self.mod = mod
@torch.no_grad()
def create_quantized_state_dict(self):
cur_state_dict = self.mod.state_dict()
for fqn, mod in self.mod.named_modules():
if isinstance(mod, torch.nn.Linear):
int8_weight, scales, _ = dynamically_quantize_per_channel(
mod.weight.float(), -128, 127, torch.int8
)
cur_state_dict[f"{fqn}.weight"] = int8_weight
cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
return cur_state_dict
def convert_for_runtime(self):
replace_linear_weight_only_int8_per_channel(self.mod)
return self.mod
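
# Usage sketch (how such a quantized checkpoint is typically consumed at load
# time; the quantize() entry point below only writes the state dict to disk):
#
#   handler = WeightOnlyInt8QuantHandler(model)
#   quantized_sd = handler.create_quantized_state_dict()  # int8 weights + scales
#   model = handler.convert_for_runtime()  # swaps nn.Linear for WeightOnlyInt8Linear
#   model.load_state_dict(quantized_sd, assign=True)
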
class WeightOnlyInt8Linear(torch.nn.Module):
__constants__ = ["in_features", "out_features"]
in_features: int
out_features: int
weight: torch.Tensor
def __init__(
self,
in_features: int,
out_features: int,
bias: bool = True,
device=None,
dtype=None,
) -> None:
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.register_buffer(
"weight", torch.empty((out_features, in_features), dtype=torch.int8)
)
self.register_buffer("scales", torch.ones(out_features, dtype=torch.bfloat16))
def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.linear(input, self.weight.to(dtype=input.dtype)) * self.scales
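
# Note: dequantization is folded into the linear op above; the int8 weight is
# upcast to the activation dtype, the matmul runs as usual, and the
# per-output-channel scales are applied to the result.
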
##### weight only int4 per channel groupwise quantized code ######
def prepare_int4_weight_and_scales_and_zeros(weight_bf16, groupsize, inner_k_tiles):
weight_int32, scales_and_zeros = group_quantize_tensor(
weight_bf16, n_bit=4, groupsize=groupsize
)
weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
weight_int32, inner_k_tiles
)
return weight_int4pack, scales_and_zeros
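
# The packed tensor produced here is the tiled int4 layout consumed by
# torch.ops.aten._weight_int4pack_mm inside linear_forward_int4 below.
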
def linear_forward_int4(x, weight_int4pack, scales_and_zeros, out_features, groupsize):
origin_x_size = x.size()
x = x.reshape(-1, origin_x_size[-1])
c = torch.ops.aten._weight_int4pack_mm(
x, weight_int4pack, groupsize, scales_and_zeros
)
new_shape = origin_x_size[:-1] + (out_features,)
c = c.reshape(new_shape)
return c
def _check_linear_int4_k(k, groupsize=1, inner_k_tiles=1):
return k % groupsize == 0 and k % (inner_k_tiles * 16) == 0
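
# Worked example (assumed sizes): k=4096 with groupsize=128 and inner_k_tiles=8
# passes both checks (4096 % 128 == 0 and 4096 % (8 * 16) == 0), so the int4
# kernel can be used directly; k=100 fails and must be padded or skipped.
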
def replace_linear_int4(module, groupsize, inner_k_tiles, padding):
for name, child in module.named_children():
if isinstance(child, nn.Linear):
if _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles):
setattr(
module,
name,
WeightOnlyInt4Linear(
child.in_features,
child.out_features,
bias=False,
groupsize=groupsize,
inner_k_tiles=inner_k_tiles,
padding=False,
),
)
elif padding:
setattr(
module,
name,
WeightOnlyInt4Linear(
child.in_features,
child.out_features,
bias=False,
groupsize=groupsize,
inner_k_tiles=inner_k_tiles,
padding=True,
),
)
else:
replace_linear_int4(child, groupsize, inner_k_tiles, padding)
class WeightOnlyInt4QuantHandler:
def __init__(self, mod, groupsize=128, inner_k_tiles=8, padding=True):
self.mod = mod
self.groupsize = groupsize
self.inner_k_tiles = inner_k_tiles
self.padding = padding
assert groupsize in [32, 64, 128, 256]
assert inner_k_tiles in [2, 4, 8]
@torch.no_grad()
def create_quantized_state_dict(self):
cur_state_dict = self.mod.state_dict()
for fqn, mod in self.mod.named_modules():
if isinstance(mod, torch.nn.Linear):
assert not mod.bias
out_features = mod.out_features
in_features = mod.in_features
assert out_features % 8 == 0, "require out_features % 8 == 0"
print(f"linear: {fqn}, in={in_features}, out={out_features}")
weight = mod.weight.data
if not _check_linear_int4_k(
in_features, self.groupsize, self.inner_k_tiles
):
if self.padding:
import torch.nn.functional as F
print(
f"warning: {fqn} is padded to satisfy in_features % 1024 == 0"
)
padded_in_features = find_multiple(in_features, 1024)
weight = F.pad(
weight, pad=(0, padded_in_features - in_features)
)
else:
print(
f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, "
+ "and that groupsize and inner_k_tiles*16 evenly divide into it"
)
continue
(
weight_int4pack,
scales_and_zeros,
) = prepare_int4_weight_and_scales_and_zeros(
weight.to(torch.bfloat16).to("cuda"),
self.groupsize,
self.inner_k_tiles,
)
cur_state_dict[f"{fqn}.weight"] = weight_int4pack.to("cpu")
cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to("cpu")
return cur_state_dict
def convert_for_runtime(self):
replace_linear_int4(self.mod, self.groupsize, self.inner_k_tiles, self.padding)
return self.mod
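
# Usage sketch (mirrors the int4 path of quantize() below). Note that
# create_quantized_state_dict() moves weights to "cuda" for the int4 packing
# op, so a GPU is required for this step:
#
#   handler = WeightOnlyInt4QuantHandler(model, groupsize=128)
#   quantized_sd = handler.create_quantized_state_dict()
#   model = handler.convert_for_runtime()  # swaps nn.Linear for WeightOnlyInt4Linear
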
class WeightOnlyInt4Linear(torch.nn.Module):
__constants__ = ["in_features", "out_features"]
in_features: int
out_features: int
weight: torch.Tensor
def __init__(
self,
in_features: int,
out_features: int,
bias=True,
device=None,
dtype=None,
groupsize: int = 128,
inner_k_tiles: int = 8,
padding: bool = True,
) -> None:
super().__init__()
self.padding = padding
if padding:
self.origin_in_features = in_features
in_features = find_multiple(in_features, 1024)
self.in_features = in_features
self.out_features = out_features
assert not bias, "require bias=False"
self.groupsize = groupsize
self.inner_k_tiles = inner_k_tiles
assert out_features % 8 == 0, "require out_features % 8 == 0"
assert (
in_features % (inner_k_tiles * 16) == 0
), "require in_features % (innerKTiles * 16) == 0"
self.register_buffer(
"weight",
torch.empty(
(
out_features // 8,
in_features // (inner_k_tiles * 16),
32,
inner_k_tiles // 2,
),
dtype=torch.int32,
),
)
self.register_buffer(
"scales_and_zeros",
torch.empty(
(in_features // groupsize, out_features, 2), dtype=torch.bfloat16
),
)
def forward(self, input: torch.Tensor) -> torch.Tensor:
input = input.to(torch.bfloat16)
if self.padding:
import torch.nn.functional as F
input = F.pad(input, pad=(0, self.in_features - self.origin_in_features))
return linear_forward_int4(
input, self.weight, self.scales_and_zeros, self.out_features, self.groupsize
)
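
# Buffer shape sketch (assumed sizes): with in_features=out_features=1024,
# groupsize=128 and inner_k_tiles=8, "weight" is an int32 tensor of shape
# (1024 // 8, 1024 // 128, 32, 8 // 2) = (128, 8, 32, 4) and "scales_and_zeros"
# has shape (1024 // 128, 1024, 2) = (8, 1024, 2) in bfloat16.
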
def quantize(
checkpoint_path: Path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"),
mode: str = "int8",
    # The following arguments are only used for int4 quantization.
groupsize: int = 128,
) -> None:
assert checkpoint_path.is_file(), checkpoint_path
device = "cpu"
precision = torch.bfloat16
print("Loading model ...")
t0 = time.time()
with torch.device("meta"):
model = Transformer(
ModelArgs(
max_seq_len=4096,
vocab_size=36408,
n_layer=24,
n_head=16,
dim=1024,
rope_base=10000,
norm_eps=1e-5,
                num_codebooks=4,
                codebook_size=168,  # 160 codebook entries plus special tokens
)
)
checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
if "state_dict" in checkpoint:
checkpoint = checkpoint["state_dict"]
checkpoint = {
k.replace("model.", ""): v
for k, v in checkpoint.items()
if k.startswith("model.")
}
model.load_state_dict(checkpoint, assign=True)
model = model.to(dtype=precision, device=device)
if mode == "int8":
print(
"Quantizing model weights for int8 weight-only symmetric per-channel quantization"
)
quant_handler = WeightOnlyInt8QuantHandler(model)
quantized_state_dict = quant_handler.create_quantized_state_dict()
dir_name = checkpoint_path.parent
base_name = checkpoint_path.stem
suffix = checkpoint_path.suffix
quantize_path = dir_name / f"{base_name}.int8{suffix}"
elif mode == "int4":
print(
"Quantizing model weights for int4 weight-only affine per-channel groupwise quantization"
)
quant_handler = WeightOnlyInt4QuantHandler(model, groupsize)
quantized_state_dict = quant_handler.create_quantized_state_dict()
dir_name = checkpoint_path.parent
        base_name = checkpoint_path.stem
suffix = checkpoint_path.suffix
quantize_path = dir_name / f"{base_name}.int4.g{groupsize}{suffix}"
else:
raise ValueError(
f"Invalid quantization mode {mode} needs to be one of [int8, int4, int4-gpptq]"
)
print(f"Writing quantized weights to {quantize_path}")
    quantize_path.unlink(missing_ok=True)  # remove existing file if one is already there
torch.save(quantized_state_dict, quantize_path)
print(f"Quantization complete took {time.time() - t0:.02f} seconds")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Quantize a model.")
parser.add_argument(
"--checkpoint_path",
type=Path,
default=Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"),
help="Path to the model checkpoint to be quantized.",
)
parser.add_argument(
"--mode",
"-q",
type=str,
default="int8",
choices=["int8", "int4"],
help="type of quantization to perform",
)
parser.add_argument(
"--groupsize", type=int, default=32, help="Group size for int4 quantization."
)
args = parser.parse_args()
quantize(args.checkpoint_path, args.mode, args.groupsize)
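
# Example invocations (paths are placeholders, adjust to the actual checkpoint):
#
#   python tools/llama/quantize.py --checkpoint_path checkpoints/model.pth --mode int8
#   python tools/llama/quantize.py --checkpoint_path checkpoints/model.pth --mode int4 --groupsize 128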