mikudev committed on
Commit
86442c0
1 Parent(s): 146c143

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +42 -0
README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - fp8
4
+ language:
5
+ - en
6
+ base_model: Sao10K/Llama-3.1-8B-Stheno-v3.4
7
+ ---
8
+
9
+ Original Model: https://huggingface.co/Sao10K/Llama-3.1-8B-Stheno-v3.4
10
+
11
+ Quantized to FP8 using AutoFP8: https://github.com/neuralmagic/AutoFP8
12
+
13
+ Quantization script:
14
+ ```python
15
+ from datasets import load_dataset
16
+ from transformers import AutoTokenizer
17
+
18
+ from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
19
+
20
+ pretrained_model_dir = "Sao10K/Llama-3.1-8B-Stheno-v3.4"
21
+ quantized_model_dir = "Llama-3.1-8B-Stheno-v3.4-FP8"
22
+
23
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True, model_max_length=4096)
24
+ tokenizer.pad_token = tokenizer.eos_token
25
+
26
+ ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
27
+ examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
28
+ examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
29
+
30
+ quantize_config = BaseQuantizeConfig(
31
+ quant_method="fp8",
32
+ activation_scheme="static",
33
+ ignore_patterns=["re:.*lm_head"],
34
+ )
35
+
36
+ model = AutoFP8ForCausalLM.from_pretrained(
37
+ pretrained_model_dir, quantize_config=quantize_config
38
+ )
39
+
40
+ model.quantize(examples)
41
+ model.save_quantized(quantized_model_dir)
42
+ ```