---
tags:
- fp8
- vllm
---
# Mixtral-8x7B-Instruct-v0.1-FP8
## Model Overview
Mixtral-8x7B-Instruct-v0.1 with weights and activations quantized to FP8, ready for inference with vLLM >= 0.5.0.
## Usage and Creation
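The model can be deployed for accelerated inference with vLLM. Below is a minimal offline-inference sketch, not part of the original card; the repository id, `tensor_parallel_size`, and sampling settings are assumptions and should be adjusted to your deployment:

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Assumed repository id; adjust if this model lives under a different namespace.
model_id = "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"

# Build a chat-formatted prompt with the model's own chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is your name?"}],
    tokenize=False,
    add_generation_prompt=True,
)

# tensor_parallel_size=2 is an assumption for fitting the checkpoint; tune per GPU setup.
llm = LLM(model=model_id, tensor_parallel_size=2)
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=128)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```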
This checkpoint was produced with [AutoFP8 using calibration samples from ultrachat](https://github.com/neuralmagic/AutoFP8/blob/147fa4d9e1a90ef8a93f96fc7d9c33056ddc017a/example_dataset.py) and quantized with the script below.
Command:
```bash
python quantize.py --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --save-dir Mixtral-8x7B-Instruct-v0.1-FP8 --num-samples 512
```
Script:
```python
import argparse
import gc
import re
from typing import Tuple
import torch
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# HACK: override the dtype_byte_size function in transformers to support float8 types
# Fix is posted upstream https://github.com/huggingface/transformers/pull/30488
def new_dtype_byte_size(dtype):
if dtype == torch.bool:
return 1 / 8
bit_search = re.search(r"[^\d](\d+)_?", str(dtype))
if bit_search is None:
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
bit_size = int(bit_search.groups()[0])
return bit_size // 8
transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size
def cleanup_memory():
gc.collect()
torch.cuda.empty_cache()
def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
"""Quantize a tensor using per-tensor static scaling factor.
Args:
tensor: The input tensor.
"""
finfo = torch.finfo(torch.float8_e4m3fn)
# Calculate the scale as dtype max divided by absmax.
# Since .abs() creates a new tensor, we use aminmax to get
# the min and max first and then calculate the absmax.
if tensor.numel() == 0:
# Deal with empty tensors (triggered by empty MoE experts)
min_val, max_val = (
torch.tensor(0.0, dtype=tensor.dtype),
torch.tensor(1.0, dtype=tensor.dtype),
)
else:
min_val, max_val = tensor.aminmax()
amax = min_val.abs().max(max_val.abs())
scale = finfo.max / amax.clamp(min=1e-12)
# scale and clamp the tensor to bring it to
# the representative range of float8 data type
# (as default cast is unsaturated)
qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
# Return both float8 data and the inverse scale (as float),
    # as both are required as inputs to torch._scaled_mm
qweight = qweight.to(torch.float8_e4m3fn)
scale = scale.float().reciprocal()
return qweight, scale
def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype):
cuda_compute_capability = torch.cuda.get_device_capability()
if cuda_compute_capability >= (9, 0):
output, _ = torch._scaled_mm(
A,
B.t(),
out_dtype=out_dtype,
scale_a=A_scale,
scale_b=B_scale,
bias=bias,
)
else:
output = torch.nn.functional.linear(
A.to(out_dtype) * A_scale,
B.to(out_dtype) * B_scale.to(out_dtype),
bias=bias,
)
return output
class FP8StaticLinearQuantizer(torch.nn.Module):
def __init__(self, qweight, weight_scale):
super().__init__()
self.weight = torch.nn.Parameter(qweight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.act_scale = None
def forward(self, x):
# Dynamically quantize
qinput, x_act_scale = per_tensor_quantize(x)
# Update scale if needed.
if self.act_scale is None:
self.act_scale = torch.nn.Parameter(x_act_scale)
elif x_act_scale > self.act_scale:
self.act_scale = torch.nn.Parameter(x_act_scale)
# Pass quantized to next layer so it has realistic data.
output = fp8_gemm(
A=qinput,
A_scale=self.act_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=None,
out_dtype=x.dtype,
)
return output
class FP8StaticLinear(torch.nn.Module):
def __init__(self, qweight, weight_scale, act_scale=0.0):
super().__init__()
self.weight = torch.nn.Parameter(qweight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
self.act_scale = torch.nn.Parameter(act_scale, requires_grad=False)
def per_tensor_quantize(
self, tensor: torch.Tensor, inv_scale: float
) -> torch.Tensor:
# Scale and clamp the tensor to bring it to
# the representative range of float8 data type
# (as default cast is unsaturated)
finfo = torch.finfo(torch.float8_e4m3fn)
qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
return qweight.to(torch.float8_e4m3fn)
def forward(self, x):
qinput = self.per_tensor_quantize(x, inv_scale=self.act_scale)
output = fp8_gemm(
A=qinput,
A_scale=self.act_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=None,
out_dtype=x.dtype,
)
return output
class FP8DynamicLinear(torch.nn.Module):
def __init__(self, qweight, scale):
super().__init__()
self.weight = torch.nn.Parameter(qweight, requires_grad=False)
self.weight_scale = torch.nn.Parameter(scale, requires_grad=False)
def forward(self, x):
qinput, x_scale = per_tensor_quantize(x)
output = fp8_gemm(
A=qinput,
A_scale=x_scale,
B=self.weight,
B_scale=self.weight_scale,
bias=None,
out_dtype=x.dtype,
)
return output
def replace_module(model, name, new_module):
if "." in name:
parent_name = name.rsplit(".", 1)[0]
child_name = name[len(parent_name) + 1 :]
parent = model.model.get_submodule(parent_name)
else:
parent_name = ""
parent = model.model
child_name = name
setattr(parent, child_name, new_module)
def quantize_weights(model):
for name, linear in model.model.named_modules():
if "gate" in name or not isinstance(linear, torch.nn.Linear):
continue
quant_weight, quant_scale = per_tensor_quantize(linear.weight)
quant_linear = FP8DynamicLinear(quant_weight, quant_scale)
replace_module(model, name, quant_linear)
del linear
cleanup_memory()
def quantize_activations(model, calibration_tokens):
# Replace layers with quantizer.
for name, dynamic_quant_linear in model.model.named_modules():
if "gate" in name or not isinstance(dynamic_quant_linear, FP8DynamicLinear):
continue
quantizer = FP8StaticLinearQuantizer(
dynamic_quant_linear.weight, dynamic_quant_linear.weight_scale
)
replace_module(model, name, quantizer)
del dynamic_quant_linear
cleanup_memory()
# Calibration.
for row_idx in range(calibration_tokens.shape[0]):
_ = model(calibration_tokens[row_idx].reshape(1, -1))
# Replace quantizer with StaticLayer.
for name, quantizer in model.model.named_modules():
if "gate" in name or not isinstance(quantizer, FP8StaticLinearQuantizer):
continue
static_proj = FP8StaticLinear(
quantizer.weight, quantizer.weight_scale, quantizer.act_scale
)
replace_module(model, name, static_proj)
del quantizer
cleanup_memory()
def save_quantized_model(model, activation_scheme, save_dir):
print(f"Saving the model to {save_dir}")
static_q_dict = {
"quantization_config": {
"quant_method": "fp8",
"activation_scheme": activation_scheme,
}
}
model.config.update(static_q_dict)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-id", type=str)
parser.add_argument("--save-dir", type=str)
parser.add_argument(
"--activation-scheme", type=str, default="static", choices=["static", "dynamic"]
)
parser.add_argument("--num-samples", type=int, default=512)
parser.add_argument("--max-seq-len", type=int, default=512)
args = parser.parse_args()
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
sample_input_tokens = tokenizer.apply_chat_template(
[{"role": "user", "content": "What is your name?"}],
add_generation_prompt=True,
return_tensors="pt",
).to("cuda")
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(args.num_samples))
ds = ds.map(
lambda batch: {
"text": tokenizer.apply_chat_template(batch["messages"], tokenize=False)
}
)
tokenizer.pad_token_id = tokenizer.eos_token_id
calibration_tokens = tokenizer(
ds["text"],
return_tensors="pt",
truncation=True,
padding="max_length",
max_length=args.max_seq_len,
add_special_tokens=False,
).input_ids.to("cuda")
print("Calibration tokens:", calibration_tokens.shape)
# Load and test the model
model = AutoModelForCausalLM.from_pretrained(
args.model_id, torch_dtype="auto", device_map="auto"
)
print(model)
output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
print("ORIGINAL:\n", tokenizer.decode(output[0]), "\n\n")
# Quantize weights.
quantize_weights(model)
print(model)
output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
print("WEIGHT QUANT:\n", tokenizer.decode(output[0]), "\n\n")
    if args.activation_scheme == "dynamic":
print("Exporting model with static weights and dynamic activations")
save_quantized_model(model, args.activation_scheme, args.save_dir)
else:
        assert args.activation_scheme == "static"
# Quantize activations.
quantize_activations(model, calibration_tokens=calibration_tokens)
output = model.generate(input_ids=sample_input_tokens, max_new_tokens=20)
print("ACT QUANT:\n", tokenizer.decode(output[0]), "\n\n")
print("Exporting model with static weights and static activations")
save_quantized_model(model, args.activation_scheme, args.save_dir)
```
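For reference, the `per_tensor_quantize` and `fp8_gemm` helpers defined in the script can be exercised on their own. A small sanity-check sketch (shapes are arbitrary; it assumes the script's helpers are in scope and a CUDA GPU is available, with the `torch._scaled_mm` path only taken on compute capability 9.0 hardware):

```python
import torch

# Fake activation and weight tensors ([out_features, in_features] for the weight).
x = torch.randn(16, 32, dtype=torch.float16, device="cuda")
w = torch.randn(16, 32, dtype=torch.float16, device="cuda")

# Quantize both to FP8 with per-tensor scales.
qx, x_scale = per_tensor_quantize(x)
qw, w_scale = per_tensor_quantize(w)

# FP8 GEMM on Hopper-class GPUs, dequantized fallback otherwise.
y = fp8_gemm(A=qx, A_scale=x_scale, B=qw, B_scale=w_scale, bias=None, out_dtype=torch.float16)
print(y.shape)  # torch.Size([16, 16])
```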
## Evaluation
### Open LLM Leaderboard evaluation scores
| | Mixtral-8x7B-Instruct-v0.1 | Mixtral-8x7B-Instruct-v0.1-FP8<br>(this model) |
| :------------------: | :----------------------: | :------------------------------------------------: |
| arc-c<br>25-shot | 71.50 | 70.05 |
| hellaswag<br>10-shot | 87.53 | 86.30 |
| mmlu<br>5-shot | 70.33 | 68.81 |
| truthfulqa<br>0-shot | 64.79 | 63.69 |
| winogrande<br>5-shot | 82.40 | 81.69 |
| gsm8k<br>5-shot | 64.36 | 59.82 |
| **Average<br>Accuracy** | **73.48** | **71.72** |
| **Recovery** | **100%** | **97.60%** |
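
The scores above can in principle be reproduced with [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) using its vLLM backend. A hedged sketch for the gsm8k row (the repository id, task name, and parallelism setting are assumptions, not the exact commands used for the table):

```bash
# Assumed repository id and tensor_parallel_size; other rows use their respective tasks and few-shot counts.
lm_eval \
  --model vllm \
  --model_args pretrained=neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,tensor_parallel_size=2 \
  --tasks gsm8k \
  --num_fewshot 5 \
  --batch_size auto
```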