abetlen committed on
Commit 81ceef8
1 Parent(s): 9cdca6f

Upload paligemma_to_gguf.py

Files changed (1)
  1. paligemma_to_gguf.py +446 -0
paligemma_to_gguf.py ADDED
@@ -0,0 +1,446 @@
import os
import json
import typing
import pathlib
import argparse

import numpy as np
import numpy.typing as npt

import gguf

from safetensors import safe_open

class SafetensorsIndexFile(typing.TypedDict):
    weight_map: typing.Dict[str, str]


class SafetensorsIndex:
    def __init__(self, index_file_path: str):
        directory = os.path.dirname(index_file_path)
        self.index = typing.cast(SafetensorsIndexFile, json.load(open(index_file_path)))
        self.weight_map = self.index["weight_map"]
        files = set(self.weight_map.values())
        self.tensors = {file: safe_open(os.path.join(directory, file), framework="np") for file in files}

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        return typing.cast(npt.NDArray[np.float32], self.tensors[self.weight_map[key]].get_tensor(key))  # type: ignore

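# A SafetensorsIndex resolves a tensor name to the shard that holds it via the index's
# weight_map and reads it with safetensors' numpy framework. Illustrative usage (the
# path below is hypothetical):
#
#   idx = SafetensorsIndex("paligemma-3b-pt-224/model.safetensors.index.json")
#   emb = idx.get_tensor("language_model.model.embed_tokens.weight")
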
def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def does_token_look_special(token: typing.Union[str, bytes]) -> bool:
    if isinstance(token, (bytes, bytearray)):
        token_text = token.decode(encoding="utf-8")
    elif isinstance(token, memoryview):
        token_text = token.tobytes().decode(encoding="utf-8")
    else:
        token_text = token

    # Some models mark some added tokens which ought to be control tokens as not special.
    # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2})
    seems_special = token_text in (
        "<pad>",  # deepseek-coder
        "<mask>", "<2mass>", "[@BOS@]",  # gemma{,-2}
    )

    seems_special = seems_special or (token_text.startswith("<|") and token_text.endswith("|>"))
    seems_special = seems_special or (token_text.startswith("<｜") and token_text.endswith("｜>"))  # deepseek-coder

    # TODO: should these be marked as UNUSED instead? (maybe not)
    seems_special = seems_special or (token_text.startswith("<unused") and token_text.endswith(">"))  # gemma{,-2}

    return seems_special

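# For example, under these rules "<|im_start|>", "<pad>", and "<unused0>" are treated as
# special/control-looking tokens, while an ordinary piece such as "hello" is not.
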
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d",
        "--dir-model",
        required=True,
        help="path to directory containing the tokenizer",
    )
    args = parser.parse_args()

    dir_model = pathlib.Path(args.dir_model)

    # set model name to folder name
    name = dir_model.name

    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    config = json.load(open(dir_model / "config.json"))
    text_config = {
        "max_position_embeddings": 8192,
        "rms_norm_eps": 1e-6,
        "head_dim": 256
    }
    text_config.update(config["text_config"])
    vision_config = config["vision_config"]

    preprocessor_config = json.load(open(dir_model / "preprocessor_config.json"))

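    # The model directory is expected to be a Hugging Face PaliGemma checkpoint containing
    # everything this script opens: config.json, preprocessor_config.json,
    # model.safetensors.index.json (plus its shards), tokenizer.model, and optionally
    # added_tokens.json / tokenizer_config.json.
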
    ### Vision model

    ftype = 1  # fp16

    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True

    n_layers_clip = vision_config["num_hidden_layers"]

    fname_out = f"{name}-mmproj-f16.gguf"
    fout = gguf.GGUFWriter(fname_out, arch="clip")

    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_llava_projector", True)
    fout.add_file_type(ftype)  # fp16

    model_name = f"google/{name}"
    fout.add_name(model_name)
    fout.add_description("image encoder for " + model_name)
    fout.add_string("clip.projector_type", "mlp")

    image_size = vision_config.get("image_size", preprocessor_config["size"]["height"])

    # vision model hparams
    VISION = "clip.vision"
    fout.add_uint32("clip.vision.image_size", image_size)
    fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
    fout.add_uint32(k(gguf.KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
    fout.add_uint32(k(gguf.KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", vision_config["projection_dim"])
    fout.add_uint32(k(gguf.KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
    fout.add_float32(k(gguf.KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(k(gguf.KEY_BLOCK_COUNT, VISION), n_layers_clip + 1)

    fout.add_array("clip.vision.image_mean", preprocessor_config["image_mean"])
    fout.add_array("clip.vision.image_std", preprocessor_config["image_std"])
    fout.add_bool("clip.use_gelu", vision_config["projector_hidden_act"] == "gelu")
    fout.add_float32("clip.embeddings_scale", 1.0 / (config["projection_dim"]**0.5))

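    # Note: k() only fills in the {arch} placeholder of the gguf key templates, so, assuming
    # the upstream gguf constants, k(gguf.KEY_EMBEDDING_LENGTH, "clip.vision") expands to
    # something like "clip.vision.embedding_length"; the remaining keys are written with
    # literal names.
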
    # vision projection
    fout.add_tensor(
        "mm.0.weight",
        tensors.get_tensor("multi_modal_projector.linear.weight").astype(np.float16),
    )
    fout.add_tensor(
        "mm.0.bias",
        tensors.get_tensor("multi_modal_projector.linear.bias").astype(np.float32),
    )

    # encoder (siglip)
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor("vision_tower.vision_model.embeddings.position_embedding.weight").astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.weight",
        tensors.get_tensor("vision_tower.vision_model.embeddings.patch_embedding.weight")
        .reshape(vision_config["hidden_size"], 3, vision_config["patch_size"], vision_config["patch_size"])
        .astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.bias",
        tensors.get_tensor("vision_tower.vision_model.embeddings.patch_embedding.bias").astype(np.float32),
    )

    fout.add_tensor(
        "v.post_ln.weight",
        tensors.get_tensor("vision_tower.vision_model.post_layernorm.weight").astype(np.float32),
    )
    fout.add_tensor(
        "v.post_ln.bias",
        tensors.get_tensor("vision_tower.vision_model.post_layernorm.bias").astype(np.float32),
    )

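    # The "mm.*" and "v.*" tensor names follow the naming that llama.cpp's LLaVA/CLIP loader
    # expects for the multimodal projector and the vision tower respectively (an assumption
    # based on the existing llava conversion scripts); weights are stored as f16, biases and
    # norms as f32.
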
    def blk_tensor(i: int, name: str):
        return tensors.get_tensor(
            rf"vision_tower.vision_model.encoder.layers.{i}.{name}"
        )

    def add_tensor(blk_id: int, gguf_id: typing.Optional[int] = None):
        if gguf_id is None:
            gguf_id = blk_id

        q_w = blk_tensor(blk_id, "self_attn.q_proj.weight")
        k_w = blk_tensor(blk_id, "self_attn.k_proj.weight")
        v_w = blk_tensor(blk_id, "self_attn.v_proj.weight")
        q_b = blk_tensor(blk_id, "self_attn.q_proj.bias")
        k_b = blk_tensor(blk_id, "self_attn.k_proj.bias")
        v_b = blk_tensor(blk_id, "self_attn.v_proj.bias")

        fout.add_tensor(f"v.blk.{gguf_id}.attn_q.weight", q_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_q.bias", q_b.astype(np.float32))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_k.weight", k_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_k.bias", k_b.astype(np.float32))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_v.weight", v_w.astype(np.float16))
        fout.add_tensor(f"v.blk.{gguf_id}.attn_v.bias", v_b.astype(np.float32))
        fout.add_tensor(
            f"v.blk.{gguf_id}.attn_out.weight",
            blk_tensor(blk_id, "self_attn.out_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.attn_out.bias",
            blk_tensor(blk_id, "self_attn.out_proj.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ln1.weight",
            blk_tensor(blk_id, "layer_norm1.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ln1.bias",
            blk_tensor(blk_id, "layer_norm1.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_down.weight",
            blk_tensor(blk_id, "mlp.fc1.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_down.bias",
            blk_tensor(blk_id, "mlp.fc1.bias").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_up.weight",
            blk_tensor(blk_id, "mlp.fc2.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ffn_up.bias",
            blk_tensor(blk_id, "mlp.fc2.bias").astype(np.float32),
        )

        fout.add_tensor(
            f"v.blk.{gguf_id}.ln2.weight",
            blk_tensor(blk_id, "layer_norm2.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"v.blk.{gguf_id}.ln2.bias",
            blk_tensor(blk_id, "layer_norm2.bias").astype(np.float32),
        )

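    # add_tensor() copies one SigLIP encoder layer (attention, layer norms, and MLP) from the
    # safetensors checkpoint into GGUF block gguf_id; the optional gguf_id override is what
    # lets the last layer be written a second time below.
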
    for i in range(n_layers_clip):
        add_tensor(i)

    # Duplicate the last block (llava-cli skips over this)
    add_tensor(n_layers_clip - 1, n_layers_clip)

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()

    print(f"GGUF written to {fname_out}")

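    # Optional sanity check of the written projector (assumes the gguf package exposes
    # GGUFReader):
    #
    #   reader = gguf.GGUFReader(fname_out)
    #   print(f"{fname_out}: {len(reader.tensors)} tensors")
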
    ### Text model

    # general GGUF init
    fname_out = f"{name}-text-model-f16.gguf"
    fout = gguf.GGUFWriter(fname_out, arch="gemma")
    ftype = 1

    block_count = text_config["num_hidden_layers"]

    fout.add_name(name)
    fout.add_context_length(text_config["max_position_embeddings"])
    fout.add_embedding_length(text_config["hidden_size"])
    fout.add_block_count(block_count)
    fout.add_feed_forward_length(text_config["intermediate_size"])
    fout.add_head_count(text_config["num_attention_heads"])
    fout.add_head_count_kv(text_config.get("num_key_value_heads") or text_config["num_attention_heads"])
    fout.add_layer_norm_rms_eps(text_config["rms_norm_eps"])
    fout.add_key_length(text_config["head_dim"])
    fout.add_value_length(text_config["head_dim"])
    fout.add_file_type(ftype)
    # fout.add_add_bos_token(True)

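    # The defaults filled into text_config earlier (max_position_embeddings, rms_norm_eps,
    # head_dim) cover fields that may be missing from the checkpoint's text_config; likewise,
    # head_count_kv falls back to the full head count when num_key_value_heads is absent.
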
    ### Tokenizer

    # Taken from _set_vocab_sentencepiece
    from enum import IntEnum
    class SentencePieceTokenTypes(IntEnum):
        NORMAL = 1
        UNKNOWN = 2
        CONTROL = 3
        USER_DEFINED = 4
        UNUSED = 5
        BYTE = 6

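    # These values appear to mirror the token types used by llama.cpp's GGUF vocabularies
    # (gguf.TokenType in the gguf package); they are written verbatim via add_token_types below.
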
    from sentencepiece import SentencePieceProcessor
    tokenizer_path = dir_model / 'tokenizer.model'

    tokens: typing.List[bytes] = []
    scores: typing.List[float] = []
    toktypes: typing.List[int] = []

    if not tokenizer_path.is_file():
        raise FileNotFoundError(f"File not found: {tokenizer_path}")

    tokenizer = SentencePieceProcessor()
    tokenizer.LoadFromFile(str(tokenizer_path))

    vocab_size = config["vocab_size"]

    tokens: typing.List[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores: typing.List[float] = [-10000.0] * vocab_size
    toktypes: typing.List[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size

    for token_id in range(tokenizer.vocab_size()):
        piece = tokenizer.IdToPiece(token_id)
        text = piece.encode("utf-8")
        score = tokenizer.GetScore(token_id)

        toktype = SentencePieceTokenTypes.NORMAL
        if tokenizer.IsUnknown(token_id):
            toktype = SentencePieceTokenTypes.UNKNOWN
        elif tokenizer.IsControl(token_id):
            toktype = SentencePieceTokenTypes.CONTROL
        elif tokenizer.IsUnused(token_id):
            toktype = SentencePieceTokenTypes.UNUSED
        elif tokenizer.IsByte(token_id):
            toktype = SentencePieceTokenTypes.BYTE

        tokens[token_id] = text
        scores[token_id] = score
        toktypes[token_id] = toktype

    added_tokens_file = dir_model / 'added_tokens.json'
    if added_tokens_file.is_file():
        with open(added_tokens_file, "r", encoding="utf-8") as f:
            added_tokens_json = json.load(f)
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
                if (token_id >= vocab_size):
                    print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue

                tokens[token_id] = key.encode("utf-8")
                scores[token_id] = -1000.0
                toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

    tokenizer_config_file = dir_model / 'tokenizer_config.json'
    if tokenizer_config_file.is_file():
        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
            tokenizer_config_json = json.load(f)
            added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
            for token_id, token_data in added_tokens_decoder.items():
                token_id = int(token_id)
                token: str = token_data["content"]
                if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
                    if tokens[token_id] != token.encode("utf-8"):
                        print(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                if token_data.get("special") or does_token_look_special(token):
                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                else:
                    token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ")  # pre-normalize user-defined spaces
                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED

                scores[token_id] = -1000.0
                tokens[token_id] = token.encode("utf-8")

    if vocab_size > len(tokens):
        pad_count = vocab_size - len(tokens)
        print(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
        for i in range(1, pad_count + 1):
            tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
            scores.append(-1000.0)
            toktypes.append(SentencePieceTokenTypes.UNUSED)

    fout.add_tokenizer_model("llama")
    fout.add_tokenizer_pre("default")
    fout.add_token_list(tokens)
    fout.add_token_scores(scores)
    fout.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
    special_vocab.add_to_gguf(fout)
    fout.add_add_space_prefix(False)

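    # SpecialVocab reads the tokenizer side files in dir_model (tokenizer_config.json,
    # special_tokens_map.json, etc.) and adds the BOS/EOS/UNK/PAD token ids and, when present,
    # the chat template to the GGUF metadata.
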
    ### Text model tensors

    fout.add_tensor(
        "token_embd.weight",
        tensors.get_tensor("language_model.model.embed_tokens.weight").astype(np.float16),
    )

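    # Only the input embedding is exported; the PaliGemma/Gemma checkpoints tie the LM head to
    # the embedding matrix, so no separate output.weight tensor is written here.
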
373
+ for i in range(text_config["num_hidden_layers"]):
374
+ fout.add_tensor(
375
+ f"blk.{i}.attn_norm.weight",
376
+ tensors.get_tensor(f"language_model.model.layers.{i}.input_layernorm.weight").astype(
377
+ np.float32
378
+ # https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
379
+ ) + 1,
380
+ )
381
+
382
+ fout.add_tensor(
383
+ f"blk.{i}.ffn_down.weight",
384
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.down_proj.weight").astype(
385
+ np.float16
386
+ ),
387
+ )
388
+ fout.add_tensor(
389
+ f"blk.{i}.ffn_gate.weight",
390
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.gate_proj.weight").astype(
391
+ np.float16
392
+ ),
393
+ )
394
+
395
+ fout.add_tensor(
396
+ f"blk.{i}.ffn_up.weight",
397
+ tensors.get_tensor(f"language_model.model.layers.{i}.mlp.up_proj.weight").astype(
398
+ np.float16
399
+ ),
400
+ )
401
+
402
+ fout.add_tensor(
403
+ f"blk.{i}.ffn_norm.weight",
404
+ tensors.get_tensor(f"language_model.model.layers.{i}.post_attention_layernorm.weight").astype(
405
+ np.float32
406
+ ) + 1,
407
+ )
408
+
409
+ fout.add_tensor(
410
+ f"blk.{i}.attn_k.weight",
411
+ tensors.get_tensor(
412
+ f"language_model.model.layers.{i}.self_attn.k_proj.weight"
413
+ ).astype(np.float16),
414
+ )
415
+ fout.add_tensor(
416
+ f"blk.{i}.attn_output.weight",
417
+ tensors.get_tensor(
418
+ f"language_model.model.layers.{i}.self_attn.o_proj.weight"
419
+ ).astype(np.float16),
420
+ )
421
+ fout.add_tensor(
422
+ f"blk.{i}.attn_q.weight",
423
+ tensors.get_tensor(
424
+ f"language_model.model.layers.{i}.self_attn.q_proj.weight"
425
+ ).astype(np.float16),
426
+ )
427
+ fout.add_tensor(
428
+ f"blk.{i}.attn_v.weight",
429
+ tensors.get_tensor(
430
+ f"language_model.model.layers.{i}.self_attn.v_proj.weight"
431
+ ).astype(np.float16),
432
+ )
433
+
434
+ fout.add_tensor(
435
+ "output_norm.weight",
436
+ tensors.get_tensor("language_model.model.norm.weight").astype(np.float32) + 1,
437
+ )
438
+
439
+
    # save gguf
    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()

    print(f"GGUF written to {fname_out}")
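    # Example invocation (directory name is illustrative):
    #
    #   python paligemma_to_gguf.py -d ./paligemma-3b-pt-224
    #
    # which writes paligemma-3b-pt-224-mmproj-f16.gguf and paligemma-3b-pt-224-text-model-f16.gguf
    # into the current working directory.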