abetlen committed on
Commit 81ceef8
1 Parent(s): 9cdca6f

Upload paligemma_to_gguf.py

Files changed (1)
  1. paligemma_to_gguf.py +446 -0
paligemma_to_gguf.py ADDED
@@ -0,0 +1,446 @@
import os
import json
import typing
import pathlib
import argparse

import numpy as np
import numpy.typing as npt

import gguf

from safetensors import safe_open

class SafetensorsIndexFile(typing.TypedDict):
    weight_map: typing.Dict[str, str]


class SafetensorsIndex:
    def __init__(self, index_file_path: str):
        directory = os.path.dirname(index_file_path)
        self.index = typing.cast(SafetensorsIndexFile, json.load(open(index_file_path)))
        self.weight_map = self.index["weight_map"]
        files = set(self.weight_map.values())
        self.tensors = {file: safe_open(os.path.join(directory, file), framework="np") for file in files}

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        return typing.cast(npt.NDArray[np.float32], self.tensors[self.weight_map[key]].get_tensor(key))  # type: ignore

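# A SafetensorsIndex resolves a tensor name to the shard that holds it via the index's
# weight_map and reads it with safetensors' numpy framework. Illustrative usage (the
# path below is hypothetical):
#
#   idx = SafetensorsIndex("paligemma-3b-pt-224/model.safetensors.index.json")
#   emb = idx.get_tensor("language_model.model.embed_tokens.weight")
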
def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def does_token_look_special(token: typing.Union[str, bytes]) -> bool:
    if isinstance(token, (bytes, bytearray)):
        token_text = token.decode(encoding="utf-8")
    elif isinstance(token, memoryview):
        token_text = token.tobytes().decode(encoding="utf-8")
    else:
        token_text = token

    # Some models mark some added tokens which ought to be control tokens as not special.
    # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
    seems_special = token_text in (
        "<pad>",  # deepseek-coder
        "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
    )

    seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
    seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder

    # TODO: should these be marked as UNUSED instead? (maybe not)
    seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}

    return seems_special

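# For example, under these rules "<|im_start|>", "<pad>", and "<unused0>" are treated as
# special/control-looking tokens, while an ordinary piece such as "hello" is not.
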
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--dir-model",
        required=True,
        help="path to directory containing the tokenizer",
    )
    args = parser.parse_args()

    dir_model = pathlib.Path(args.dir_model)

    # set model name to folder name
    name = dir_model.name

    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    config = json.load(open(dir_model / "config.json"))
    text_config = {
        "max_position_embeddings": 8192,
        "rms_norm_eps": 1e-6,
        "head_dim": 256
    }
    text_config.update(config["text_config"])
    vision_config = config["vision_config"]

    preprocessor_config = json.load(open(dir_model / "preprocessor_config.json"))

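    # The model directory is expected to be a Hugging Face PaliGemma checkpoint containing
    # everything this script opens: config.json, preprocessor_config.json,
    # model.safetensors.index.json (plus its shards), tokenizer.model, and optionally
    # added_tokens.json / tokenizer_config.json.
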
    ### Vision model

    ftype = 1  # fp16

    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True

    n_layers_clip = vision_config["num_hidden_layers"]

    fname_out = f"{name}-mmproj-f16.gguf"
    fout = gguf.GGUFWriter(fname_out, arch="clip")

    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_llava_projector", True)
    fout.add_file_type(ftype)  # fp16

    model_name = f"google/{name}"
    fout.add_name(model_name)
    fout.add_description("image encoder for " + model_name)
    fout.add_string("clip.projector_type", "mlp")

    image_size = vision_config.get("image_size", preprocessor_config["size"]["height"])

    # vision model hparams
    VISION = "clip.vision"
    fout.add_uint32("clip.vision.image_size", image_size)
    fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
    fout.add_uint32(k(gguf.KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
    fout.add_uint32(k(gguf.KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", vision_config["projection_dim"])
    fout.add_uint32(k(gguf.KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
    fout.add_float32(k(gguf.KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(k(gguf.KEY_BLOCK_COUNT, VISION), n_layers_clip + 1)

    fout.add_array("clip.vision.image_mean", preprocessor_config["image_mean"])
    fout.add_array("clip.vision.image_std", preprocessor_config["image_std"])
    fout.add_bool("clip.use_gelu", vision_config["projector_hidden_act"] == "gelu")
    fout.add_float32("clip.embeddings_scale", 1.0 / (config["projection_dim"]**0.5))

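    # Note: k() only fills in the {arch} placeholder of the gguf key templates, so, assuming
    # the upstream gguf constants, k(gguf.KEY_EMBEDDING_LENGTH, "clip.vision") expands to
    # something like "clip.vision.embedding_length"; the remaining keys are written with
    # literal names.
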
    # vision projection
    fout.add_tensor(
        "mm.0.weight",
        tensors.get_tensor("multi_modal_projector.linear.weight").astype(np.float16),
    )
    fout.add_tensor(
        "mm.0.bias",
        tensors.get_tensor("multi_modal_projector.linear.bias").astype(np.float32),
    )

    # encoder (siglip)
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor("vision_tower.vision_model.embeddings.position_embedding.weight").astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.weight",
        tensors.get_tensor("vision_tower.vision_model.embeddings.patch_embedding.weight")
        .reshape(vision_config["hidden_size"], 3, vision_config["patch_size"], vision_config["patch_size"])
        .astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.bias",
        tensors.get_tensor("vision_tower.vision_model.embeddings.patch_embedding.bias").astype(np.float32),
    )

    fout.add_tensor(
        "v.post_ln.weight",
        tensors.get_tensor("vision_tower.vision_model.post_layernorm.weight").astype(np.float32),
    )
    fout.add_tensor(
        "v.post_ln.bias",
        tensors.get_tensor("vision_tower.vision_model.post_layernorm.bias").astype(np.float32),
    )

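    # The "mm.*" and "v.*" tensor names follow the naming that llama.cpp's LLaVA/CLIP loader
    # expects for the multimodal projector and the vision tower respectively (an assumption
    # based on the existing llava conversion scripts); weights are stored as f16, biases and
    # norms as f32.
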
    def blk_tensor(i: int, name: str):
        return tensors.get_tensor(
            rf"vision_tower.vision_model.encoder.layers.{i}.{name}"
        )

    def add_tensor(blk_id: int, gguf_id: typing.Optional[int] = None):
        if gguf_id is None:
            gguf_id = blk_id

        q_w = blk_tensor(blk_id, "self_attn.q_proj.weight")
        k_w = blk_tensor(blk_id, "self_attn.k_proj.weight")
        v_w = blk_tensor(blk_id, "self_attn.v_proj.weight")
        q_b = blk_tensor(blk_id, "self_attn.q_proj.bias")
        k_b = blk_tensor(blk_id, "self_attn.k_proj.bias")
        v_b = blk_tensor(blk_id, "self_attn.v_proj.bias")

        fout.add_tensor(f"v.blk.{gguf_id}.attn_q.weight", q_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_q.bias", q_b.astype(np.float32))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_k.weight", k_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_k.bias", k_b.astype(np.float32))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_v.weight", v_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_v.bias", v_b.astype(np.float32))
        fout.add_tensor(
            f"v.blk.{gguf_id}.attn_out.weight",
            blk_tensor(blk_id, "self_attn.out_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.attn_out.bias",
            blk_tensor(blk_id, "self_attn.out_proj.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ln1.weight",
            blk_tensor(blk_id, "layer_norm1.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ln1.bias",
            blk_tensor(blk_id, "layer_norm1.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_down.weight",
            blk_tensor(blk_id, "mlp.fc1.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_down.bias",
            blk_tensor(blk_id, "mlp.fc1.bias").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_up.weight",
            blk_tensor(blk_id, "mlp.fc2.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_up.bias",
            blk_tensor(blk_id, "mlp.fc2.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ln2.weight",
            blk_tensor(blk_id, "layer_norm2.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ln2.bias",
            blk_tensor(blk_id, "layer_norm2.bias").astype(np.float32),
        )

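    # add_tensor() copies one SigLIP encoder layer (attention, layer norms, and MLP) from the
    # safetensors checkpoint into GGUF block gguf_id; the optional gguf_id override is what
    # lets the last layer be written a second time below.
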
    for i in range(n_layers_clip):
        add_tensor(i)

    # Duplicate the last block (llava-cli skips over this)
    add_tensor(n_layers_clip - 1, n_layers_clip)

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()

    print(f"GGUF written to {fname_out}")

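    # Optional sanity check of the written projector (assumes the gguf package exposes
    # GGUFReader):
    #
    #   reader = gguf.GGUFReader(fname_out)
    #   print(f"{fname_out}: {len(reader.tensors)} tensors")
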
    ### Text model

    # general GGUF init
    fname_out = f"{name}-text-model-f16.gguf"
    fout = gguf.GGUFWriter(fname_out, arch="gemma")
    ftype = 1

    block_count = text_config["num_hidden_layers"]

    fout.add_name(name)
    fout.add_context_length(text_config["max_position_embeddings"])
    fout.add_embedding_length(text_config["hidden_size"])
    fout.add_block_count(block_count)
    fout.add_feed_forward_length(text_config["intermediate_size"])
    fout.add_head_count(text_config["num_attention_heads"])
    fout.add_head_count_kv(text_config.get("num_key_value_heads") or text_config["num_attention_heads"])
    fout.add_layer_norm_rms_eps(text_config["rms_norm_eps"])
    fout.add_key_length(text_config["head_dim"])
    fout.add_value_length(text_config["head_dim"])
    fout.add_file_type(ftype)
    # fout.add_add_bos_token(True)

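    # The defaults filled into text_config earlier (max_position_embeddings, rms_norm_eps,
    # head_dim) cover fields that may be missing from the checkpoint's text_config; likewise,
    # head_count_kv falls back to the full head count when num_key_value_heads is absent.
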
    ### Tokenizer

    # Taken from _set_vocab_sentencepiece
    from enum import IntEnum
    class SentencePieceTokenTypes(IntEnum):
        NORMAL = 1
        UNKNOWN = 2
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5
        BYTE = 6

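    # These values appear to mirror the token types used by llama.cpp's GGUF vocabularies
    # (gguf.TokenType in the gguf package); they are written verbatim via add_token_types below.
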
    from sentencepiece import SentencePieceProcessor
    tokenizer_path = dir_model / 'tokenizer.model'

    tokens: typing.List[bytes] = []
    scores: typing.List[float] = []
    toktypes: typing.List[int] = []

    if not tokenizer_path.is_file():
        raise FileNotFoundError(f"File not found: {tokenizer_path}")

    tokenizer = SentencePieceProcessor()
    tokenizer.LoadFromFile(str(tokenizer_path))

    vocab_size = config["vocab_size"]

    tokens: typing.List[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores: typing.List[float] = [-10000.0] * vocab_size
    toktypes: typing.List[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size

    for token_id in range(tokenizer.vocab_size()):
        piece = tokenizer.IdToPiece(token_id)
        text = piece.encode("utf-8")
        score = tokenizer.GetScore(token_id)

        toktype = SentencePieceTokenTypes.NORMAL
        if tokenizer.IsUnknown(token_id):
            toktype = SentencePieceTokenTypes.UNKNOWN
        elif tokenizer.IsControl(token_id):
            toktype = SentencePieceTokenTypes.CONTROL
        elif tokenizer.IsUnused(token_id):
            toktype = SentencePieceTokenTypes.UNUSED
        elif tokenizer.IsByte(token_id):
            toktype = SentencePieceTokenTypes.BYTE

        tokens[token_id] = text
        scores[token_id] = score
        toktypes[token_id] = toktype

    added_tokens_file = dir_model / 'added_tokens.json'
    if added_tokens_file.is_file():
        with open(added_tokens_file, "r", encoding="utf-8") as f:
            added_tokens_json = json.load(f)
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
                if (token_id >= vocab_size):
                    print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue

                tokens[token_id] = key.encode("utf-8")
                scores[token_id] = -1000.0
                toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

    tokenizer_config_file = dir_model / 'tokenizer_config.json'
    if tokenizer_config_file.is_file():
        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
            tokenizer_config_json = json.load(f)
            added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
            for token_id, token_data in added_tokens_decoder.items():
                token_id = int(token_id)
                token: str = token_data["content"]
                if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                    if tokens[token_id] != token.encode("utf-8"):
                        print(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                if token_data.get("special") or does_token_look_special(token):
                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                else:
                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

                scores[token_id] = -1000.0
                tokens[token_id] = token.encode("utf-8")

    if vocab_size > len(tokens):
        pad_count = vocab_size - len(tokens)
        print(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
        for i in range(1, pad_count + 1):
            tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
            scores.append(-1000.0)
            toktypes.append(SentencePieceTokenTypes.UNUSED)

    fout.add_tokenizer_model("llama")
    fout.add_tokenizer_pre("default")
    fout.add_token_list(tokens)
    fout.add_token_scores(scores)
    fout.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
    special_vocab.add_to_gguf(fout)
    fout.add_add_space_prefix(False)

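    # SpecialVocab reads the tokenizer side files in dir_model (tokenizer_config.json,
    # special_tokens_map.json, etc.) and adds the BOS/EOS/UNK/PAD token ids and, when present,
    # the chat template to the GGUF metadata.
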
    ### Text model tensors

    fout.add_tensor(
        "token_embd.weight",
        tensors.get_tensor("language_model.model.embed_tokens.weight").astype(np.float16),
    )

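    # Only the input embedding is exported; the PaliGemma/Gemma checkpoints tie the LM head to
    # the embedding matrix, so no separate output.weight tensor is written here.
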
373
+ for i in range(text_config["num_hidden_layers"]):
374
+ fout.add_tensor(
375
+ f"blk.{i}.attn_norm.weight",
376
+ tensors.get_tensor(f"language_model.model.layers.{i}.input_layernorm.weight").astype(
377
+ np.float32
378
+ # https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
379
+ ) + 1,
380
+ )
381
+
382
+ fout.add_tensor(
383
+ f"blk.{i}.ffn_down.weight",
384
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.down_proj.weight").astype(
385
+ np.float16
386
+ ),
387
+ )
388
+ fout.add_tensor(
389
+ f"blk.{i}.ffn_gate.weight",
390
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.gate_proj.weight").astype(
391
+ np.float16
392
+ ),
393
+ )
394
+
395
+ fout.add_tensor(
396
+ f"blk.{i}.ffn_up.weight",
397
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.up_proj.weight").astype(
398
+ np.float16
399
+ ),
400
+ )
401
+
402
+ fout.add_tensor(
403
+ f"blk.{i}.ffn_norm.weight",
404
+ tensors.get_tensor(f"language_model.model.layers.{i}.post_attention_layernorm.weight").astype(
405
+ np.float32
406
+ ) + 1,
407
+ )
408
+
409
+ fout.add_tensor(
410
+ f"blk.{i}.attn_k.weight",
411
+ tensors.get_tensor(
412
+ f"language_model.model.layers.{i}.self_attn.k_proj.weight"
413
+ ).astype(np.float16),
414
+ )
415
+ fout.add_tensor(
416
+ f"blk.{i}.attn_output.weight",
417
+ tensors.get_tensor(
418
+ f"language_model.model.layers.{i}.self_attn.o_proj.weight"
419
+ ).astype(np.float16),
420
+ )
421
+ fout.add_tensor(
422
+ f"blk.{i}.attn_q.weight",
423
+ tensors.get_tensor(
424
+ f"language_model.model.layers.{i}.self_attn.q_proj.weight"
425
+ ).astype(np.float16),
426
+ )
427
+ fout.add_tensor(
428
+ f"blk.{i}.attn_v.weight",
429
+ tensors.get_tensor(
430
+ f"language_model.model.layers.{i}.self_attn.v_proj.weight"
431
+ ).astype(np.float16),
432
+ )
433
+
434
+ fout.add_tensor(
435
+ "output_norm.weight",
436
+ tensors.get_tensor("language_model.model.norm.weight").astype(np.float32) + 1,
437
+ )
438
+
439
+
    # save gguf
    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()

    print(f"GGUF written to {fname_out}")
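    # Example invocation (directory name is illustrative):
    #
    #   python paligemma_to_gguf.py -d ./paligemma-3b-pt-224
    #
    # which writes paligemma-3b-pt-224-mmproj-f16.gguf and paligemma-3b-pt-224-text-model-f16.gguf
    # into the current working directory.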