salamandra-2b / IQ3_XXS_log.txt

update for quantization

5dadba4 18 days ago

39.5 kB

	main: build = 3906 (7eee341b)
	main: built with Apple clang version 15.0.0 (clang-1500.3.9.4) for arm64-apple-darwin23.6.0
	main: quantizing 'salamandra-2b_bf16.gguf' to './salamandra-2b_IQ3_XXS.gguf' as IQ3_XXS
	llama_model_loader: loaded meta data with 29 key-value pairs and 219 tensors from salamandra-2b_bf16.gguf (version GGUF V3 (latest))
	llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
	llama_model_loader: - kv 0: general.architecture str = llama
	llama_model_loader: - kv 1: general.type str = model
	llama_model_loader: - kv 2: general.size_label str = 2.3B
	llama_model_loader: - kv 3: general.license str = apache-2.0
	llama_model_loader: - kv 4: general.tags arr[str,1] = ["text-generation"]
	llama_model_loader: - kv 5: general.languages arr[str,36] = ["bg", "ca", "code", "cs", "cy", "da"...
	llama_model_loader: - kv 6: llama.block_count u32 = 24
	llama_model_loader: - kv 7: llama.context_length u32 = 8192
	llama_model_loader: - kv 8: llama.embedding_length u32 = 2048
	llama_model_loader: - kv 9: llama.feed_forward_length u32 = 5440
	llama_model_loader: - kv 10: llama.attention.head_count u32 = 16
	llama_model_loader: - kv 11: llama.attention.head_count_kv u32 = 16
	llama_model_loader: - kv 12: llama.rope.freq_base f32 = 10000.000000
	llama_model_loader: - kv 13: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
	llama_model_loader: - kv 14: general.file_type u32 = 32
	llama_model_loader: - kv 15: llama.vocab_size u32 = 256000
	llama_model_loader: - kv 16: llama.rope.dimension_count u32 = 128
	llama_model_loader: - kv 17: tokenizer.ggml.add_space_prefix bool = true
	llama_model_loader: - kv 18: tokenizer.ggml.model str = llama
	llama_model_loader: - kv 19: tokenizer.ggml.pre str = default
	llama_model_loader: - kv 20: tokenizer.ggml.tokens arr[str,256000] = ["<unk>", "<s>", "</s>", "<pad>", "<\|...
	llama_model_loader: - kv 21: tokenizer.ggml.scores arr[f32,256000] = [-1000.000000, -1000.000000, -1000.00...
	llama_model_loader: - kv 22: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
	llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 1
	llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
	llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 0
	llama_model_loader: - kv 26: tokenizer.ggml.add_bos_token bool = true
	llama_model_loader: - kv 27: tokenizer.ggml.add_eos_token bool = false
	llama_model_loader: - kv 28: general.quantization_version u32 = 2
	llama_model_loader: - type f32: 49 tensors
	llama_model_loader: - type bf16: 170 tensors
	================================ Have weights data with 168 entries
	[ 1/ 219] output.weight - [ 2048, 256000, 1, 1], type = bf16, size = 1000.000 MB
	[ 2/ 219] token_embd.weight - [ 2048, 256000, 1, 1], type = bf16,
	====== llama_model_quantize_internal: did not find weights for token_embd.weight
	converting to iq3_s .. load_imatrix: imatrix dataset='./imatrix/oscar/imatrix-dataset.txt'
	load_imatrix: loaded 168 importance matrix entries from imatrix/oscar/imatrix.dat computed on 44176 chunks
	prepare_imatrix: have 168 importance matrix entries
	size = 1000.00 MiB -> 214.84 MiB
	[ 3/ 219] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 4/ 219] blk.0.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 5/ 219] blk.0.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 6/ 219] blk.0.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 7/ 219] blk.0.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 8/ 219] blk.0.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 9/ 219] blk.0.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 10/ 219] blk.0.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 11/ 219] blk.0.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 12/ 219] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 13/ 219] blk.1.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 14/ 219] blk.1.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 15/ 219] blk.1.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 16/ 219] blk.1.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 17/ 219] blk.1.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 18/ 219] blk.1.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 19/ 219] blk.1.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 20/ 219] blk.1.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 21/ 219] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 22/ 219] blk.10.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 23/ 219] blk.10.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 24/ 219] blk.10.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 25/ 219] blk.10.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 26/ 219] blk.10.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 27/ 219] blk.10.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 28/ 219] blk.10.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 29/ 219] blk.10.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 30/ 219] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 31/ 219] blk.11.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 32/ 219] blk.11.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 33/ 219] blk.11.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 34/ 219] blk.11.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 35/ 219] blk.11.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 36/ 219] blk.11.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 37/ 219] blk.11.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 38/ 219] blk.11.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 39/ 219] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 40/ 219] blk.12.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 41/ 219] blk.12.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 42/ 219] blk.12.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 43/ 219] blk.12.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 44/ 219] blk.12.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 45/ 219] blk.12.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 46/ 219] blk.12.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 47/ 219] blk.12.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 48/ 219] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 49/ 219] blk.13.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 50/ 219] blk.13.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 51/ 219] blk.13.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 52/ 219] blk.13.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 53/ 219] blk.13.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 54/ 219] blk.13.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 55/ 219] blk.13.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 56/ 219] blk.13.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 57/ 219] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 58/ 219] blk.14.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 59/ 219] blk.14.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 60/ 219] blk.14.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 61/ 219] blk.14.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 62/ 219] blk.14.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 63/ 219] blk.14.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 64/ 219] blk.14.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 65/ 219] blk.14.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 66/ 219] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 67/ 219] blk.15.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 68/ 219] blk.15.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 69/ 219] blk.15.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 70/ 219] blk.15.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 71/ 219] blk.15.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 72/ 219] blk.15.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 73/ 219] blk.15.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 74/ 219] blk.15.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 75/ 219] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 76/ 219] blk.16.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 77/ 219] blk.16.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 78/ 219] blk.16.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 79/ 219] blk.16.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 80/ 219] blk.16.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 81/ 219] blk.16.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 82/ 219] blk.16.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 83/ 219] blk.16.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 84/ 219] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 85/ 219] blk.17.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 86/ 219] blk.17.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 87/ 219] blk.17.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 88/ 219] blk.17.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 89/ 219] blk.17.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 90/ 219] blk.17.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 91/ 219] blk.17.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 92/ 219] blk.17.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 93/ 219] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 94/ 219] blk.18.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 95/ 219] blk.18.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 96/ 219] blk.18.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 97/ 219] blk.18.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 98/ 219] blk.18.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 99/ 219] blk.18.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 100/ 219] blk.18.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 101/ 219] blk.18.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 102/ 219] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 103/ 219] blk.19.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 104/ 219] blk.19.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 105/ 219] blk.19.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 106/ 219] blk.19.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 107/ 219] blk.19.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 108/ 219] blk.19.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 109/ 219] blk.19.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 110/ 219] blk.19.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 111/ 219] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 112/ 219] blk.2.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 113/ 219] blk.2.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 114/ 219] blk.2.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 115/ 219] blk.2.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 116/ 219] blk.2.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 117/ 219] blk.2.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 118/ 219] blk.2.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 119/ 219] blk.2.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 120/ 219] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 121/ 219] blk.20.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 122/ 219] blk.20.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 123/ 219] blk.20.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 124/ 219] blk.20.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 125/ 219] blk.20.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 126/ 219] blk.20.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 127/ 219] blk.20.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 128/ 219] blk.20.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 129/ 219] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 130/ 219] blk.21.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 131/ 219] blk.21.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 132/ 219] blk.21.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 133/ 219] blk.21.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 134/ 219] blk.21.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 135/ 219] blk.21.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 136/ 219] blk.21.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 137/ 219] blk.21.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 138/ 219] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 139/ 219] blk.22.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 140/ 219] blk.22.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 141/ 219] blk.22.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 142/ 219] blk.22.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 143/ 219] blk.22.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 144/ 219] blk.22.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 145/ 219] blk.22.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 146/ 219] blk.22.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 147/ 219] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 148/ 219] blk.23.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 149/ 219] blk.23.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 150/ 219] blk.23.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 151/ 219] blk.23.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 152/ 219] blk.23.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 153/ 219] blk.23.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 154/ 219] blk.23.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 155/ 219] blk.23.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 156/ 219] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 157/ 219] blk.3.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 158/ 219] blk.3.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 159/ 219] blk.3.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 160/ 219] blk.3.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 161/ 219] blk.3.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 162/ 219] blk.3.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 163/ 219] blk.3.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 164/ 219] blk.3.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 165/ 219] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 166/ 219] blk.4.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 167/ 219] blk.4.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 168/ 219] blk.4.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 169/ 219] blk.4.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 170/ 219] blk.4.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 171/ 219] blk.4.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 172/ 219] blk.4.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 173/ 219] blk.4.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 174/ 219] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 175/ 219] blk.5.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 176/ 219] blk.5.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 177/ 219] blk.5.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 178/ 219] blk.5.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 179/ 219] blk.5.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 180/ 219] blk.5.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 181/ 219] blk.5.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 182/ 219] blk.5.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 183/ 219] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 184/ 219] blk.6.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 185/ 219] blk.6.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 186/ 219] blk.6.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 187/ 219] blk.6.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 188/ 219] blk.6.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 189/ 219] blk.6.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 190/ 219] blk.6.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 191/ 219] blk.6.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 192/ 219] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 193/ 219] blk.7.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 194/ 219] blk.7.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 195/ 219] blk.7.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 196/ 219] blk.7.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 197/ 219] blk.7.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 198/ 219] blk.7.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 199/ 219] blk.7.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 200/ 219] blk.7.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 201/ 219] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 202/ 219] blk.8.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 203/ 219] blk.8.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 204/ 219] blk.8.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 205/ 219] blk.8.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 206/ 219] blk.8.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 207/ 219] blk.8.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 208/ 219] blk.8.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 209/ 219] blk.8.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 210/ 219] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 211/ 219] blk.9.ffn_down.weight - [ 5440, 2048, 1, 1], type = bf16,

	llama_tensor_get_type : tensor cols 5440 x 2048 are not divisible by 256, required for iq3_xxs - using fallback quantization iq4_nl
	converting to iq4_nl .. size = 21.25 MiB -> 5.98 MiB
	[ 212/ 219] blk.9.ffn_gate.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 213/ 219] blk.9.ffn_up.weight - [ 2048, 5440, 1, 1], type = bf16, converting to iq3_xxs .. size = 21.25 MiB -> 4.07 MiB
	[ 214/ 219] blk.9.ffn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	[ 215/ 219] blk.9.attn_k.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 216/ 219] blk.9.attn_output.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_s .. size = 8.00 MiB -> 1.72 MiB
	[ 217/ 219] blk.9.attn_q.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq2_s .. size = 8.00 MiB -> 1.28 MiB
	[ 218/ 219] blk.9.attn_v.weight - [ 2048, 2048, 1, 1], type = bf16, converting to iq3_xxs .. size = 8.00 MiB -> 1.53 MiB
	[ 219/ 219] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB
	llama_model_quantize_internal: model size = 4298.38 MB
	llama_model_quantize_internal: quant size = 1693.40 MB
	llama_model_quantize_internal: WARNING: 24 of 169 tensor(s) required fallback quantization

	main: quantize time = 28893.81 ms
	main: total time = 28893.81 ms