Text Generation
Transformers
ONNX
llama
sparse
instruct
deepsparse
File size: 1,747 Bytes
427e493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Recipe with a single stage (`test_stage`) containing one modifier group
# (`obcq_modifiers`). The group declares three modifiers; their order is kept
# exactly as written in case the consumer applies them in document order.
test_stage:
  obcq_modifiers:
    # Each `mappings` entry is a two-element pair: a list of layer-name
    # patterns followed by one layer-name pattern.
    # NOTE(review): the `re:` prefix presumably marks a regular expression
    # understood by the recipe loader -- confirm against the modifier docs.
    LogarithmicEqualizationModifier:
      mappings:
        - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
          - 're:.*input_layernorm'
        - - ['re:.*gate_proj', 're:.*up_proj']
          - 're:.*post_attention_layernorm'
        - - ['re:.*down_proj']
          - 're:.*up_proj'

    QuantizationModifier:
      # Names excluded by this modifier: a mix of what look like module class
      # names, four specific `mlp.down_proj` layers (30, 1, 0, 4), and
      # MatMul-related entries. List order is preserved from the original.
      ignore:
        - LlamaRotaryEmbedding
        - LlamaRMSNorm
        - SiLUActivation
        - model.layers.30.mlp.down_proj
        - model.layers.1.mlp.down_proj
        - model.layers.0.mlp.down_proj
        - model.layers.4.mlp.down_proj
        - MatMulOutput_QK
        - MatMulOutput_PV
        - MatMulLeftInput_QK
        - MatMulLeftInput_PV
        - MatMulRightInput_QK
        - MatMulRightInput_PV
        - QuantizableMatMul
      post_oneshot_calibration: true
      # Per-type overrides: both types use 8-bit weights; `Linear` is
      # symmetric with the `channel` strategy, `Embedding` is asymmetric and
      # sets `input_activations` to an explicit null.
      scheme_overrides:
        Linear:
          weights:
            num_bits: 8
            symmetric: true
            strategy: channel
        Embedding:
          input_activations: null
          weights:
            num_bits: 8
            symmetric: false

    SparseGPTModifier:
      # `sparsity` is 0.0 and `prunen`/`prunem` are 0 while `quantize` is
      # true. NOTE(review): presumably this modifier is used here without
      # pruning -- verify against the modifier docs.
      sparsity: 0.0
      block_size: 128
      sequential_update: false
      quantize: true
      percdamp: 0.01
      prunen: 0
      prunem: 0
      # Every layer from model.layers.0 through model.layers.31, listed
      # explicitly and in ascending order.
      targets:
        - model.layers.0
        - model.layers.1
        - model.layers.2
        - model.layers.3
        - model.layers.4
        - model.layers.5
        - model.layers.6
        - model.layers.7
        - model.layers.8
        - model.layers.9
        - model.layers.10
        - model.layers.11
        - model.layers.12
        - model.layers.13
        - model.layers.14
        - model.layers.15
        - model.layers.16
        - model.layers.17
        - model.layers.18
        - model.layers.19
        - model.layers.20
        - model.layers.21
        - model.layers.22
        - model.layers.23
        - model.layers.24
        - model.layers.25
        - model.layers.26
        - model.layers.27
        - model.layers.28
        - model.layers.29
        - model.layers.30
        - model.layers.31
      target_ids:
        - attention_mask
        - position_ids