Update README.md

#7 opened by reach-vb (HF staff)
Files changed (1)
  1. README.md +8 -1
README.md CHANGED
````diff
@@ -45,15 +45,22 @@ To run the inference on top of Llama 3.1 70B Instruct AWQ in INT4 precision, the
 
 ```python
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig
 
 model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,  # Note: Update this as per your use-case
+    do_fuse=True,
+)
+
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True,
     device_map="auto",
+    quantization_config=quantization_config
 )
 
 prompt = [
````
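
For context, passing an `AwqConfig` with `do_fuse=True` tells transformers to replace the model's attention and MLP layers with fused AWQ modules, which are faster at inference time; `fuse_max_seq_len` is the maximum total sequence length (prompt plus generated tokens) the fused kernels will support. Below is a minimal end-to-end sketch of the snippet after this change. The hunk above cuts off at `prompt = [`, so the prompt contents and generation settings here are illustrative assumptions, not the README's actual values.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"

# Fuse the AWQ attention/MLP modules into optimized kernels.
# fuse_max_seq_len must cover prompt length + max_new_tokens.
quantization_config = AwqConfig(
    bits=4,
    fuse_max_seq_len=512,  # Note: Update this as per your use-case
    do_fuse=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    quantization_config=quantization_config,
)

# Illustrative chat-style prompt (the README's actual prompt is
# truncated in the hunk above).
prompt = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's Deep Learning?"},
]
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
# Decode only the newly generated tokens.
print(tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0])
```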