Upload 13 files

Browse files

Files changed (14) hide show

.gitattributes +2 -0
demo.py +13 -0
gemm_config.in +111 -0
lyraBelle/__init__.py +1 -0
lyraBelle/config.py +33 -0
lyraBelle/libth_transformer.so +3 -0
lyraBelle/lyraBelle.py +163 -0
lyraBelle/model.py +764 -0
model/1-gpu-fp16.h5 +3 -0
model/config.ini +20 -0
model/special_tokens_map.json +1 -0
model/tokenizer.json +3 -0
model/tokenizer_config.json +1 -0
requirements.txt +6 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+lyraBelle/libth_transformer.so filter=lfs diff=lfs merge=lfs -text
+model/tokenizer.json filter=lfs diff=lfs merge=lfs -text

demo.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Minimal end-to-end example: load the BELLE model and print one generated reply.
+from lyraBelle import LyraBelle
+
+# Where the tokenizer files and the converted checkpoint live.
+model_dir = "./model"
+model_name = "1-gpu-fp16.h5"
+
+# Inference precision and the cap on generated tokens.
+data_type = "fp16"
+max_output_length = 512
+
+prompts = "今天天气大概 25度，有点小雨，吹着风，我想去户外散步，应该穿什么样的衣服裤子鞋子搭配。"
+
+# The trailing 0 is the int8_mode argument.
+model = LyraBelle(model_dir, model_name, data_type, 0)
+
+# Sampling settings used for this demo run.
+sampling_options = dict(
+    top_k=30,
+    top_p=0.85,
+    temperature=0.35,
+    repetition_penalty=1.2,
+    do_sample=True,
+)
+output_texts = model.generate(prompts, output_length=max_output_length, **sampling_options)
+print(output_texts)

gemm_config.in ADDED Viewed

	@@ -0,0 +1,111 @@

+batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time
+64 64 32 128 1 ### 1 12288 4096 4096 6 0 20 0 1 0 0 11 1.444813
+64 64 32 128 1 ### 2048 64 64 128 112 -1 -1 -1 -1 -1 -1 -1 0.083370
+64 64 32 128 1 ### 2048 128 64 64 100 -1 -1 -1 -1 -1 -1 -1 0.070630
+64 64 32 128 1 ### 1 4096 4096 4096 6 0 24 1 0 0 0 9 0.502825
+64 64 32 128 1 ### 1 16384 4096 4096 6 0 20 0 1 0 0 11 1.898404
+64 64 32 128 1 ### 1 4096 4096 16384 21 0 24 1 0 0 0 12 1.909555
+64 1 32 128 1 ### 1 12288 64 4096 6 0 18 0 1 0 0 16 0.080251
+64 1 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026583
+64 1 32 128 1 ### 1 16384 64 4096 6 0 18 0 1 0 0 15 0.110223
+64 1 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109978
+64 1 32 128 1 ### 1 250880 64 4096 112 -1 -1 -1 -1 -1 -1 -1 1.602350
+32 64 32 128 1 ### 1 12288 2048 4096 6 0 20 0 1 0 0 11 0.750490
+32 64 32 128 1 ### 1024 64 64 128 109 -1 -1 -1 -1 -1 -1 -1 0.047020
+32 64 32 128 1 ### 1024 128 64 64 108 -1 -1 -1 -1 -1 -1 -1 0.037950
+32 64 32 128 1 ### 1 4096 2048 4096 6 0 20 0 0 0 0 11 0.256123
+32 64 32 128 1 ### 1 16384 2048 4096 6 0 20 0 1 0 0 11 0.959887
+32 64 32 128 1 ### 1 4096 2048 16384 6 0 20 0 1 0 0 11 0.979282
+32 1 32 128 1 ### 1 12288 32 4096 6 0 18 0 0 0 0 16 0.078582
+32 1 32 128 1 ### 1 4096 32 4096 31 0 15 1 0 0 0 18 0.024535
+32 1 32 128 1 ### 1 16384 32 4096 6 0 18 0 0 0 0 12 0.105523
+32 1 32 128 1 ### 1 4096 32 16384 109 -1 -1 -1 -1 -1 -1 -1 0.105160
+32 1 32 128 1 ### 1 250880 32 4096 114 -1 -1 -1 -1 -1 -1 -1 1.479260
+16 64 32 128 1 ### 1 12288 1024 4096 6 0 20 2 1 1 3072 11 0.398694
+16 64 32 128 1 ### 512 64 64 128 105 -1 -1 -1 -1 -1 -1 -1 0.015370
+16 64 32 128 1 ### 512 128 64 64 114 -1 -1 -1 -1 -1 -1 -1 0.014250
+16 64 32 128 1 ### 1 4096 1024 4096 21 0 20 2 0 1 1024 11 0.144855
+16 64 32 128 1 ### 1 16384 1024 4096 6 0 20 0 1 0 0 11 0.505098
+16 64 32 128 1 ### 1 4096 1024 16384 111 -1 -1 -1 -1 -1 -1 -1 0.545680
+16 1 32 128 1 ### 1 12288 16 4096 6 0 18 1 1 0 0 16 0.077865
+16 1 32 128 1 ### 1 4096 16 4096 31 0 15 1 1 0 0 18 0.024023
+16 1 32 128 1 ### 1 16384 16 4096 6 0 21 1 0 0 0 15 0.104765
+16 1 32 128 1 ### 1 4096 16 16384 6 0 15 1 1 0 0 17 0.105298
+16 1 32 128 1 ### 1 250880 16 4096 109 -1 -1 -1 -1 -1 -1 -1 1.450620
+8 64 32 128 1 ### 1 12288 512 4096 115 -1 -1 -1 -1 -1 -1 -1 0.204910
+8 64 32 128 1 ### 256 64 64 128 105 -1 -1 -1 -1 -1 -1 -1 0.010500
+8 64 32 128 1 ### 256 128 64 64 109 -1 -1 -1 -1 -1 -1 -1 0.010250
+8 64 32 128 1 ### 1 4096 512 4096 6 0 20 4 1 1 512 11 0.081009
+8 64 32 128 1 ### 1 16384 512 4096 107 -1 -1 -1 -1 -1 -1 -1 0.257450
+8 64 32 128 1 ### 1 4096 512 16384 6 0 20 5 1 1 512 11 0.256573
+8 1 32 128 1 ### 1 12288 8 4096 6 0 18 1 1 0 0 16 0.077445
+8 1 32 128 1 ### 1 4096 8 4096 31 0 15 1 1 0 0 18 0.023245
+8 1 32 128 1 ### 1 16384 8 4096 110 -1 -1 -1 -1 -1 -1 -1 0.104450
+8 1 32 128 1 ### 1 4096 8 16384 6 0 15 1 1 0 0 17 0.104192
+8 1 32 128 1 ### 1 250880 8 4096 108 -1 -1 -1 -1 -1 -1 -1 1.429910
+1 64 32 128 1 ### 1 12288 64 4096 109 -1 -1 -1 -1 -1 -1 -1 0.080110
+1 64 32 128 1 ### 32 64 64 128 103 -1 -1 -1 -1 -1 -1 -1 0.005320
+1 64 32 128 1 ### 32 128 64 64 109 -1 -1 -1 -1 -1 -1 -1 0.005470
+1 64 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026429
+1 64 32 128 1 ### 1 16384 64 4096 107 -1 -1 -1 -1 -1 -1 -1 0.110100
+1 64 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109885
+1 1 32 128 1 ### 1 12288 1 4096 6 0 18 1 1 0 0 16 0.076769
+1 1 32 128 1 ### 1 4096 1 4096 6 0 15 1 1 0 0 18 0.023040
+1 1 32 128 1 ### 1 16384 1 4096 105 -1 -1 -1 -1 -1 -1 -1 0.103720
+1 1 32 128 1 ### 1 4096 1 16384 6 0 18 3 0 4 24576 16 0.102124
+1 1 32 128 1 ### 1 250880 1 4096 102 -1 -1 -1 -1 -1 -1 -1 1.402680
+64 128 32 128 1 ### 1 12288 8192 4096 6 0 20 0 1 0 0 11 2.837852
+64 128 32 128 1 ### 2048 128 128 128 111 -1 -1 -1 -1 -1 -1 -1 0.202480
+64 128 32 128 1 ### 2048 128 128 128 103 -1 -1 -1 -1 -1 -1 -1 0.156770
+64 128 32 128 1 ### 1 4096 8192 4096 6 0 20 0 1 0 0 11 0.955003
+64 128 32 128 1 ### 1 16384 8192 4096 6 0 20 0 1 0 0 11 3.772959
+64 128 32 128 1 ### 1 4096 8192 16384 6 0 20 0 1 0 0 11 3.703818
+64 1 32 128 1 ### 1 12288 64 4096 6 0 18 0 0 0 0 16 0.080015
+64 1 32 128 1 ### 1 4096 64 4096 6 0 15 1 0 0 0 18 0.026460
+64 1 32 128 1 ### 1 16384 64 4096 105 -1 -1 -1 -1 -1 -1 -1 0.110300
+64 1 32 128 1 ### 1 4096 64 16384 31 0 15 1 1 0 0 18 0.109691
+64 1 32 128 1 ### 1 250880 64 4096 100 -1 -1 -1 -1 -1 -1 -1 1.603500
+32 128 32 128 1 ### 1 12288 4096 4096 6 0 20 0 1 0 0 11 1.444751
+32 128 32 128 1 ### 1024 128 128 128 112 -1 -1 -1 -1 -1 -1 -1 0.105780
+32 128 32 128 1 ### 1024 128 128 128 113 -1 -1 -1 -1 -1 -1 -1 0.084340
+32 128 32 128 1 ### 1 4096 4096 4096 6 0 24 1 0 0 0 9 0.502835
+32 128 32 128 1 ### 1 16384 4096 4096 6 0 20 0 1 0 0 11 1.898291
+32 128 32 128 1 ### 1 4096 4096 16384 21 0 24 1 0 0 0 12 1.910139
+32 1 32 128 1 ### 1 12288 32 4096 107 -1 -1 -1 -1 -1 -1 -1 0.078600
+32 1 32 128 1 ### 1 4096 32 4096 31 0 15 1 0 0 0 18 0.024586
+32 1 32 128 1 ### 1 16384 32 4096 6 0 18 0 1 0 0 12 0.105708
+32 1 32 128 1 ### 1 4096 32 16384 105 -1 -1 -1 -1 -1 -1 -1 0.105120
+32 1 32 128 1 ### 1 250880 32 4096 106 -1 -1 -1 -1 -1 -1 -1 1.480140
+16 128 32 128 1 ### 1 12288 2048 4096 6 0 20 0 1 0 0 11 0.750612
+16 128 32 128 1 ### 512 128 128 128 108 -1 -1 -1 -1 -1 -1 -1 0.057030
+16 128 32 128 1 ### 512 128 128 128 114 -1 -1 -1 -1 -1 -1 -1 0.048080
+16 128 32 128 1 ### 1 4096 2048 4096 6 0 20 0 0 0 0 11 0.256000
+16 128 32 128 1 ### 1 16384 2048 4096 6 0 20 0 1 0 0 11 0.957215
+16 128 32 128 1 ### 1 4096 2048 16384 6 0 20 0 1 0 0 11 0.978862
+16 1 32 128 1 ### 1 12288 16 4096 6 0 18 1 1 0 0 16 0.077793
+16 1 32 128 1 ### 1 4096 16 4096 31 0 15 1 1 0 0 18 0.023849
+16 1 32 128 1 ### 1 16384 16 4096 6 0 21 1 0 0 0 15 0.104858
+16 1 32 128 1 ### 1 4096 16 16384 6 0 15 1 1 0 0 17 0.105001
+16 1 32 128 1 ### 1 250880 16 4096 108 -1 -1 -1 -1 -1 -1 -1 1.450760
+8 128 32 128 1 ### 1 12288 1024 4096 6 0 20 2 1 1 3072 11 0.398592
+8 128 32 128 1 ### 256 128 128 128 107 -1 -1 -1 -1 -1 -1 -1 0.018050
+8 128 32 128 1 ### 256 128 128 128 104 -1 -1 -1 -1 -1 -1 -1 0.015680
+8 128 32 128 1 ### 1 4096 1024 4096 21 0 20 2 0 1 1024 11 0.144763
+8 128 32 128 1 ### 1 16384 1024 4096 6 0 20 0 1 0 0 11 0.505160
+8 128 32 128 1 ### 1 4096 1024 16384 115 -1 -1 -1 -1 -1 -1 -1 0.545580
+8 1 32 128 1 ### 1 12288 8 4096 6 0 18 1 1 0 0 16 0.077445
+8 1 32 128 1 ### 1 4096 8 4096 31 0 15 1 1 0 0 18 0.023245
+8 1 32 128 1 ### 1 16384 8 4096 110 -1 -1 -1 -1 -1 -1 -1 0.104360
+8 1 32 128 1 ### 1 4096 8 16384 6 0 15 1 1 0 0 17 0.104305
+8 1 32 128 1 ### 1 250880 8 4096 100 -1 -1 -1 -1 -1 -1 -1 1.430000
+1 128 32 128 1 ### 1 12288 128 4096 6 0 18 0 1 0 0 15 0.085402
+1 128 32 128 1 ### 32 128 128 128 108 -1 -1 -1 -1 -1 -1 -1 0.007070
+1 128 32 128 1 ### 32 128 128 128 114 -1 -1 -1 -1 -1 -1 -1 0.007350
+1 128 32 128 1 ### 1 4096 128 4096 104 -1 -1 -1 -1 -1 -1 -1 0.033170
+1 128 32 128 1 ### 1 16384 128 4096 6 0 24 0 0 0 0 15 0.115405
+1 128 32 128 1 ### 1 4096 128 16384 104 -1 -1 -1 -1 -1 -1 -1 0.118900
+1 1 32 128 1 ### 1 12288 1 4096 6 0 18 1 1 0 0 16 0.076872
+1 1 32 128 1 ### 1 4096 1 4096 6 0 15 1 1 0 0 18 0.023235
+1 1 32 128 1 ### 1 16384 1 4096 107 -1 -1 -1 -1 -1 -1 -1 0.103860
+1 1 32 128 1 ### 1 4096 1 16384 6 0 18 3 0 4 24576 16 0.102523
+1 1 32 128 1 ### 1 250880 1 4096 103 -1 -1 -1 -1 -1 -1 -1 1.402790

lyraBelle/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .lyraBelle import LyraBelle

lyraBelle/config.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import dataclasses
+from typing import Optional
+@dataclasses.dataclass
+class BelleParam:
+    """Default hyper-parameters for the BELLE model.
+
+    Raises:
+        ValueError: on construction, if ``shared_contexts_ratio`` lies outside ``[0.0, 1.0]``.
+    """
+
+    # Attention / feed-forward geometry.
+    num_heads: int = 32
+    size_per_head: int = 128
+    inter_size: int = 16384
+    num_layers: int = 30
+    vocab_size: int = 250880
+    # Token ids used as sequence start / end markers.
+    start_id: Optional[int] = 1
+    end_id: Optional[int] = 2
+    # Parallelism degrees.
+    tensor_para_size: int = 1
+    pipeline_para_size: int = 1
+    remove_padding: bool = True
+    # Must stay within [0.0, 1.0]; validated in __post_init__.
+    shared_contexts_ratio: float = 1.0
+    weights_data_type: str = "fp16"
+
+    def __post_init__(self):
+        """Validate field values right after the dataclass-generated ``__init__`` runs."""
+        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
+            # The message names the real field (``shared_contexts_ratio``) so it can be searched for.
+            raise ValueError(
+                f'Got an invalid value of shared_contexts_ratio '
+                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')
+
+    def asdict(self):
+        """Return the parameters as a plain ``dict`` keyed by field name."""
+        return dataclasses.asdict(self)
+# Module-wide default parameter set, built once at import time.
+BELLE_PARAM = BelleParam()
+import os
+# Directory that contains this module file.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+# Path of libth_transformer.so, expected to sit next to this module.
+LIB_SO_PATH = os.path.join(current_dir, 'libth_transformer.so')

lyraBelle/libth_transformer.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17485c356e0f201d2f3193e6c31ec26d3b4e0b3f605968e1915a7adcd2b05b43
+size 200050816

lyraBelle/lyraBelle.py ADDED Viewed

	@@ -0,0 +1,163 @@

+from __future__ import annotations
+import configparser
+import pathlib
+import typing
+import torch
+import transformers
+from torch.nn.utils.rnn import pad_sequence
+from .config import BELLE_PARAM, LIB_SO_PATH
+from .model import BelleModel
+import os
+class LyraBelle:
+    """Pair a ``BelleModel`` with its tokenizer behind a prompt-in / text-out API."""
+
+    def __init__(self, model_path, model_name, dtype='fp16', int8_mode=0) -> None:
+        """Load the tokenizer and model stored under ``model_path``.
+
+        Args:
+            model_path: Directory handed to the tokenizer loader and searched for ``config.ini``.
+            model_name: Checkpoint file name, joined onto ``model_path``.
+            dtype: Inference data type string.
+            int8_mode: Forwarded to the model, but reset to 0 unless ``dtype == 'int8'``.
+        """
+        self.model_path = model_path
+        self.model_name = model_name
+        self.dtype = dtype
+        if dtype != 'int8':
+            int8_mode = 0
+        self.int8_mode = int8_mode
+        print(f'Loading model and tokenizer from {self.model_path}')
+        self.model, self.tokenizer = self.load_model_and_tokenizer()
+        print("Got model and tokenizer")
+
+    def load_model_and_tokenizer(self):
+        """Build the tokenizer and the ``BelleModel``, then load the checkpoint.
+
+        Model hyper-parameters are read from ``<model_path>/config.ini`` (section ``belle``) when
+        that file exists, otherwise they come from ``BELLE_PARAM``.
+
+        Returns:
+            A ``(model, tokenizer)`` tuple.
+        """
+        tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
+        checkpoint_path = pathlib.Path(self.model_path)
+        config_path = checkpoint_path / 'config.ini'
+        inference_data_type = self.dtype
+        if config_path.exists():
+            # Read model params from config.
+            cfg = configparser.ConfigParser()
+            cfg.read(config_path)
+            model_name = 'belle'
+            if inference_data_type is None:
+                inference_data_type = cfg.get(model_name, "weight_data_type")
+            model_args = dict(
+                head_num=cfg.getint(model_name, 'head_num'),
+                size_per_head=cfg.getint(model_name, "size_per_head"),
+                layer_num=cfg.getint(model_name, "num_layer"),
+                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
+                vocab_size=cfg.getint(model_name, "vocab_size"),
+                start_id=cfg.getint(model_name, "start_id"),
+                end_id=cfg.getint(model_name, "end_id"),
+                weights_data_type=cfg.get(model_name, "weight_data_type"),
+                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
+                inference_data_type=inference_data_type)
+        else:
+            if inference_data_type is None:
+                inference_data_type = BELLE_PARAM.weights_data_type
+            model_args = dict(head_num=BELLE_PARAM.num_heads,
+                              size_per_head=BELLE_PARAM.size_per_head,
+                              vocab_size=BELLE_PARAM.vocab_size,
+                              start_id=BELLE_PARAM.start_id or tokenizer.bos_token_id,
+                              end_id=BELLE_PARAM.end_id or tokenizer.eos_token_id,
+                              layer_num=BELLE_PARAM.num_layers,
+                              tensor_para_size=BELLE_PARAM.tensor_para_size,
+                              weights_data_type=BELLE_PARAM.weights_data_type,
+                              inference_data_type=inference_data_type)
+        # update common parameters
+        model_args.update(dict(
+            lib_path=LIB_SO_PATH,
+            pipeline_para_size=BELLE_PARAM.pipeline_para_size,
+            shared_contexts_ratio=BELLE_PARAM.shared_contexts_ratio,
+            int8_mode=self.int8_mode
+        ))
+        print('[FT][INFO] Load Our FT Highly Optimized BELLE model')
+        for k, v in model_args.items():
+            print(f' - {k.ljust(25, ".")}: {v}')
+        # Check sanity and consistency between the model and tokenizer.
+        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
+                     'tensor_para_size', 'weights_data_type']
+        none_params = [p for p in checklist if model_args[p] is None]
+        if none_params:
+            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
+                  f'be provided either by config file or CLI arguments.')
+        if model_args['start_id'] != tokenizer.bos_token_id:
+            print('[FT][WARNING] Given start_id is not matched with the bos token '
+                  'id of the pretrained tokenizer.')
+        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
+            print('[FT][WARNING] Given end_id is not matched with neither pad '
+                  'token id nor eos token id of the pretrained tokenizer.')
+        model = BelleModel(**model_args)
+        if not model.load(ckpt_path=os.path.join(self.model_path, self.model_name)):
+            print('[FT][WARNING] Skip model loading since no checkpoints are found')
+        return model, tokenizer
+
+    def generate(self, prompts: typing.List[str] | str,
+                 output_length: int = 512,
+                 beam_width: int = 1,
+                 top_k: typing.Optional[torch.IntTensor] = 1,
+                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
+                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
+                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
+                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
+                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
+                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
+                 min_length: typing.Optional[torch.IntTensor] = None,
+                 bad_words_list: typing.Optional[torch.IntTensor] = None,
+                 do_sample: bool = False,
+                 return_output_length: bool = False,
+                 return_cum_log_probs: int = 0):
+        """Generate a completion for each prompt and return the decoded texts.
+
+        Args:
+            prompts: A single prompt or a list of prompts. Each one is stripped and wrapped in
+                the ``Human: ... Assistant:`` template before tokenization.
+            output_length: Passed to the model as ``output_len``.
+            beam_width: Passed through to the model.
+            top_k, top_p, beam_search_diversity_rate, temperature, len_penalty,
+            repetition_penalty: Scalars that are broadcast to one value per prompt.
+            presence_penalty, min_length, bad_words_list: Passed through unchanged.
+            do_sample: When true, a random seed per prompt is drawn and sent to the model.
+            return_output_length, return_cum_log_probs: Passed through to the model.
+
+        Returns:
+            A list of decoded strings, one per prompt, holding only the tokens that follow
+            the prompt in the first beam.
+        """
+        if isinstance(prompts, str):
+            prompts = [prompts, ]
+        inputs = ['Human: ' + prompt.strip() +
+                  '\n\nAssistant:' for prompt in prompts]
+        batch_size = len(inputs)
+        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
+        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
+        # we must encode the raw prompt text one by one in order to compute the length of the original text.
+        # squeeze(0) drops only the batch axis, so a one-token encoding stays 1-D and len() keeps working.
+        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze(0) for text in inputs]
+        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
+        # after got the length of each input text tokens. we can batchfy the input list to a tensor. padding the right.
+        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)
+        random_seed = None
+        if do_sample:
+            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)
+        outputs = self.model(start_ids=input_token_ids,
+                             start_lengths=input_lengths,
+                             output_len=output_length,
+                             beam_width=beam_width,
+                             top_k=top_k*ones_int,
+                             top_p=top_p*ones_float,
+                             beam_search_diversity_rate=beam_search_diversity_rate*ones_float,
+                             temperature=temperature*ones_float,
+                             len_penalty=len_penalty*ones_float,
+                             repetition_penalty=repetition_penalty*ones_float,
+                             presence_penalty=presence_penalty,
+                             min_length=min_length,
+                             random_seed=random_seed,
+                             bad_words_list=bad_words_list,
+                             return_output_length=return_output_length,
+                             return_cum_log_probs=return_cum_log_probs)
+        # NOTE(review): only the return_cum_log_probs case is unpacked here; if the model also
+        # returns a tuple when return_output_length is set, that needs the same treatment - confirm.
+        if return_cum_log_probs > 0:
+            outputs = outputs[0]  # output_token_ids.
+        # Slice the generated token ids of the 1st beam result.
+        # output = input tokens + generated tokens.
+        output_token_ids = [out[0, length:].cpu()
+                            for out, length in zip(outputs, input_lengths)]
+        output_texts = self.tokenizer.batch_decode(
+            output_token_ids, skip_special_tokens=True)
+        return output_texts

lyraBelle/model.py ADDED Viewed

	@@ -0,0 +1,764 @@

+from __future__ import annotations
+import os
+import pathlib
+import typing
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+# Maps the data-type strings used in configs to the matching torch dtype.
+str_type_map = {
+    "fp32": torch.float32,
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+}
+class BaseBelleWeights:
+    def __init__(self, head_num, size_per_head, layer_num, vocab_size, max_seq_len, tensor_para_size, pipeline_para_size,
+                 weights_data_type: typing.Union[str, np.dtype],
+                 inference_data_type: str,
+                 has_adapters: bool = False,
+                 adapter_inter_size: int = 0,
+                 gpt_with_moe: bool = False,
+                 has_positional_encoding: bool = True,
+                 has_pre_decoder_layernorm: bool = False,
+                 has_post_decoder_layernorm: bool = True,
+                 int8_mode: int = 0,
+                 inter_size: int = 0):
+        """Allocate placeholder tensors for every weight slot of the model.
+
+        ``self.w`` is a flat list laid out as: twelve per-layer groups (each ``layer_num``
+        entries long), the optional pre- and post-decoder layernorm pairs, the optional
+        position table, the two embedding tables, the per-layer gating weights and finally
+        the optional adapter groups. ``self.int8_w`` and ``self.scale`` are only filled when
+        ``int8_mode`` is non-zero.
+
+        Args:
+            weights_data_type: ``"fp16"``/``"fp32"``/``"float16"``/``"float32"`` or the
+                corresponding numpy type.
+            inference_data_type: Key of ``str_type_map`` selecting the dtype of ``self.w``.
+            inter_size: When non-zero, overrides the default feed-forward width of four
+                times the local hidden size.
+
+        Raises:
+            AssertionError: if ``head_num`` (or a non-zero ``inter_size``) is not divisible by
+                ``tensor_para_size``, if ``int8_mode`` is neither 0 nor 1, or if ``int8_mode``
+                is 1 with an inference type other than fp16/bf16.
+            ValueError: if ``weights_data_type`` is a string that is not recognised.
+        """
+        assert head_num % tensor_para_size == 0
+        if int8_mode == 1:
+            torch_infer_dtype = str_type_map[inference_data_type]
+            assert torch_infer_dtype == torch.float16 or torch_infer_dtype == torch.bfloat16, "Weight only quant only supported for infer type fp16 or bf16."
+            quant = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix
+            self.weight_transpose_calibrate_quantize = lambda x: quant(
+                x, torch.int8)
+        else:
+            assert int8_mode == 0, "Invalid int8 mode for BELLE. Must be 0 or 1"
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.layer_num = layer_num
+        self.vocab_size = vocab_size
+        self.max_seq_len = max_seq_len
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.layers_per_device = layer_num // pipeline_para_size
+        self.has_adapters = has_adapters
+        self.adapter_inter_size = adapter_inter_size
+        self.gpt_with_moe = gpt_with_moe
+        self.has_positional_encoding = has_positional_encoding
+        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
+        self.has_post_decoder_layernorm = has_post_decoder_layernorm
+        local_head_num = head_num // tensor_para_size
+        global_head_num = head_num
+        local_hidden_units = local_head_num * size_per_head
+        global_hidden_units = global_head_num * size_per_head
+        local_inter_size = local_hidden_units * 4
+        if inter_size != 0:
+            # Plain '%' here: a backslash-escaped percent sign is an invalid escape sequence.
+            assert inter_size % tensor_para_size == 0, f"inter_size({inter_size}) % tensor_para_size({tensor_para_size}) must be 0"
+            local_inter_size = inter_size // tensor_para_size
+        local_adapter_inter_size = self.adapter_inter_size // tensor_para_size
+        self.local_head_num = local_head_num
+        self.global_head_num = global_head_num
+        self.local_hidden_units = local_hidden_units
+        self.global_hidden_units = global_hidden_units
+        self.local_inter_size = local_inter_size
+        self.int8_mode = int8_mode
+        self.share_embed = False
+        if isinstance(weights_data_type, str):
+            try:
+                weights_data_type = {
+                    "fp16": np.float16,
+                    "fp32": np.float32,
+                    "float16": np.float16,
+                    "float32": np.float32,
+                }[weights_data_type]
+            except KeyError:
+                raise ValueError(
+                    f"Don't know how to interpret weights_data_type: {weights_data_type}")
+        assert weights_data_type in [np.float32, np.float16]
+        self.weights_data_type = weights_data_type
+        self.inference_data_type = inference_data_type
+        self.w = []
+        self.int8_w = []
+        self.scale = []
+
+        dtype = str_type_map[self.inference_data_type]
+
+        def single(*shape):
+            # One zero-filled placeholder of the inference dtype.
+            return torch.zeros(*shape, dtype=dtype)
+
+        def per_layer(*shape):
+            # One placeholder repeated layer_num times; the entries alias the same tensor.
+            return [single(*shape)] * layer_num
+
+        # Transformer blocks
+        self.w.extend(per_layer(global_hidden_units))   # self_layernorm_gamma
+        self.w.extend(per_layer(global_hidden_units))   # self_layernorm_beta
+        self.w.extend(per_layer(global_hidden_units, local_hidden_units * 3))   # self_kernel
+        self.w.extend(per_layer(local_hidden_units * 3))   # self_bias
+        self.w.extend(per_layer(local_hidden_units, global_hidden_units))   # self_output_kernel
+        self.w.extend(per_layer(global_hidden_units))   # self_output_bias
+        self.w.extend(per_layer(global_hidden_units))   # ffn_layernorm_gamma
+        self.w.extend(per_layer(global_hidden_units))   # ffn_layernorm_beta
+        self.w.extend(per_layer(global_hidden_units, local_inter_size))   # ffn_kernel1
+        self.w.extend(per_layer(local_inter_size))   # ffn_bias1
+        self.w.extend(per_layer(local_inter_size, global_hidden_units))   # ffn_kernel2
+        self.w.extend(per_layer(global_hidden_units))   # ffn_bias2
+
+        optional_adapter_offset = 0
+        # After Transformer blocks
+        if self.has_pre_decoder_layernorm:
+            self.w.append(single(global_hidden_units))   # embedding layernorm gamma
+            self.w.append(single(global_hidden_units))   # embedding layernorm beta
+            optional_adapter_offset += 2
+        if self.has_post_decoder_layernorm:
+            self.w.append(single(global_hidden_units))   # final layernorm gamma
+            self.w.append(single(global_hidden_units))   # final layernorm beta
+            optional_adapter_offset += 2
+        if self.has_positional_encoding:
+            self.w.append(single(max_seq_len, global_hidden_units))   # position_encoding_table
+            optional_adapter_offset += 1
+        self.pre_embed_idx = len(self.w)
+        self.w.append(single(vocab_size, global_hidden_units))   # embedding_table
+        self.post_embed_idx = len(self.w)
+        self.w.append(single(vocab_size, global_hidden_units))   # post embedding_kernel
+        self.adapter_offset = 2 + optional_adapter_offset
+        self.w.extend([torch.empty(0, dtype=dtype)] * layer_num)   # gating_weight
+        self.adapter_offset += layer_num
+
+        # adapters: the same four groups are laid out twice (adaptor1_*, then adaptor2_*).
+        if self.has_adapters:
+            for _ in range(2):
+                self.w.extend(per_layer(global_hidden_units, local_adapter_inter_size))   # kernel1
+                self.w.extend(per_layer(local_adapter_inter_size))   # bias1
+                self.w.extend(per_layer(local_adapter_inter_size, global_hidden_units))   # kernel2
+                self.w.extend(per_layer(global_hidden_units))   # bias2
+
+        # Initialization
+        self._map(lambda w: torch.nn.init.normal_(w, mean=0., std=1.))
+
+        if self.int8_mode != 0:
+            # (int8 kernel shape, length of its scale vector), in slot order.
+            quantized_shapes = [
+                ((global_hidden_units, local_hidden_units * 3), local_hidden_units * 3),   # self kernel
+                ((local_hidden_units, global_hidden_units), global_hidden_units),   # self output kernel
+                ((global_hidden_units, local_inter_size), local_inter_size),   # ffn kernel1
+                ((local_inter_size, global_hidden_units), global_hidden_units),   # ffn kernel2
+            ]
+            if self.has_adapters:
+                adapter_shapes = [
+                    ((global_hidden_units, local_adapter_inter_size), local_adapter_inter_size),   # kernel1
+                    ((local_adapter_inter_size, global_hidden_units), global_hidden_units),   # kernel2
+                ]
+                quantized_shapes += adapter_shapes * 2   # adaptor1, then adaptor2
+            for kernel_shape, scale_len in quantized_shapes:
+                self.int8_w.extend(
+                    [torch.zeros(*kernel_shape, dtype=torch.int8)] * layer_num)
+                self.scale.extend(
+                    [torch.zeros(scale_len, dtype=torch.float)] * layer_num)
+    def __getitem__(self, idx):
+        """Return the entry of the flat weight list ``self.w`` at ``idx``."""
+        weights = self.w
+        return weights[idx]
+    def __setitem__(self, idx, val):
+        """Store ``val`` at position ``idx`` of the flat weight list ``self.w``."""
+        weights = self.w
+        weights[idx] = val
+    def __len__(self):
+        """Return how many entries the flat weight list ``self.w`` holds."""
+        weights = self.w
+        return len(weights)
+    def _map(self, func):
+        """Replace every entry of ``self.w`` with ``func(entry)``.
+
+        Entries that are lists are mapped element by element in place. When
+        ``self.share_embed`` is set, the post-decoder embedding slot is not passed
+        through ``func``; it is pointed at the already-mapped pre-decoder embedding.
+
+        Raises:
+            AssertionError: if ``self.pre_embed_idx`` is not below ``self.post_embed_idx``.
+        """
+        # Two-operand assert: wrapping condition and message in parentheses builds a
+        # non-empty tuple, which is always truthy and so would never fire.
+        assert self.pre_embed_idx < self.post_embed_idx, \
+            "Pre decoder embedding index should be lower than post decoder embedding index."
+        for i, weight in enumerate(self.w):
+            if isinstance(weight, list):
+                for j, item in enumerate(weight):
+                    weight[j] = func(item)
+            elif self.share_embed and i == self.post_embed_idx:
+                # If sharing the pre and post embedding, any mapping to
+                # the pre decoder weight will give the same output to the
+                # post decoder weight, so we just copy here.
+                self.w[self.post_embed_idx] = self.w[self.pre_embed_idx]
+            else:
+                self.w[i] = func(weight)
+    def _map_int8(self, func):
+        """Apply ``func`` in place to every entry of ``self.int8_w`` and then of ``self.scale``.
+
+        Entries that are themselves lists are updated element by element.
+        """
+        for attr in ("int8_w", "scale"):
+            store = getattr(self, attr)
+            for pos, entry in enumerate(store):
+                if not isinstance(entry, list):
+                    store[pos] = func(entry)
+                    continue
+                for k, item in enumerate(entry):
+                    entry[k] = func(item)
+    def _map_int8_scales(self, func):
+        """Apply ``func`` in place to every entry of ``self.scale`` only.
+
+        Entries that are themselves lists are updated element by element.
+        """
+        scales = self.scale
+        for pos, entry in enumerate(scales):
+            if not isinstance(entry, list):
+                scales[pos] = func(entry)
+                continue
+            for k, item in enumerate(entry):
+                entry[k] = func(item)
+    def load(self, ckpt_path, tp_rank, pipeline_para_rank):
+        if not os.path.exists(ckpt_path):
+            raise FileNotFoundError(f"Failed to find {ckpt_path}")
+        w = []
+        type_map = {np.float32: torch.float32, np.float16: torch.float16}
+        # Load
+        def is_load(i): return i >= self.layers_per_device * \
+            pipeline_para_rank and i < self.layers_per_device * \
+            (pipeline_para_rank + 1)
+        def load_to_torch(npdata: str, is_load: bool):
+            if is_load:
+                return torch.from_numpy(npdata).to(str_type_map[self.inference_data_type])
+                #return torch.from_numpy(np.fromfile(file_path, dtype=self.weights_data_type)).to(str_type_map[self.inference_data_type])
+            else:
+                return torch.empty(0).to(str_type_map[self.inference_data_type])
+        def get_np_data(h5f, layername, layer_num, weight_type, tp_rank=None):
+            if tp_rank is None:
+                return [load_to_torch(h5f[f'model.layers.{i}.{layername}.{weight_type}']["weights"][:], is_load(i))  for i in range(layer_num)]
+            else:
+                return [load_to_torch(h5f[f'model.layers.{i}.{layername}.{weight_type}.{tp_rank}']["weights"][:], is_load(i))  for i in range(layer_num)]
+        def get_np_data_single(h5f, layername, weight_type, is_loaded, tp_rank=None):
+            if weight_type is None:
+                return load_to_torch(h5f[f'model.{layername}']["weights"][:], is_loaded)
+            if tp_rank is None:
+                 return load_to_torch(h5f[f'model.{layername}.{weight_type}']["weights"][:], is_loaded)
+            else:
+                return load_to_torch(h5f[f'model.{layername}.{weight_type}.{tp_rank}']["weights"][:], is_loaded)
+        import h5py
+        ckpt_f = h5py.File(ckpt_path, "r")
+        w.extend(get_np_data(ckpt_f, "input_layernorm", self.layer_num, "weight"))
+        w.extend(get_np_data(ckpt_f, "input_layernorm", self.layer_num, "bias"))
+        w.extend(get_np_data(ckpt_f, "attention.query_key_value", self.layer_num, "weight", tp_rank))
+        w.extend(get_np_data(ckpt_f, "attention.query_key_value", self.layer_num, "bias", tp_rank))
+        w.extend(get_np_data(ckpt_f, "attention.dense", self.layer_num, "weight", tp_rank))
+        w.extend(get_np_data(ckpt_f, "attention.dense", self.layer_num, "bias"))
+        w.extend(get_np_data(ckpt_f, "post_attention_layernorm", self.layer_num, "weight"))
+        w.extend(get_np_data(ckpt_f, "post_attention_layernorm", self.layer_num, "bias"))
+        # if moe, load "mlp.moe.experts.dense_h_to_4h"
+        w.extend(get_np_data(ckpt_f, "mlp.dense_h_to_4h", self.layer_num, "weight", tp_rank))
+        w.extend(get_np_data(ckpt_f, "mlp.dense_h_to_4h", self.layer_num, "bias", tp_rank))
+        # if moe, load "mlp.moe.experts.dense_4h_to_h"
+        w.extend(get_np_data(ckpt_f, "mlp.dense_4h_to_h", self.layer_num, "weight", tp_rank))
+        w.extend(get_np_data(ckpt_f, "mlp.dense_4h_to_h", self.layer_num, "bias"))
+        if self.has_pre_decoder_layernorm:
+            w.append(get_np_data_single(ckpt_f, "pre_decoder_layernorm", "weight", True))
+            w.append(get_np_data_single(ckpt_f, "pre_decoder_layernorm", "bias", True))
+        if self.has_post_decoder_layernorm:
+            w.append(get_np_data_single(ckpt_f, "final_layernorm", "weight", True))
+            w.append(get_np_data_single(ckpt_f, "final_layernorm", "bias", True))
+        if self.has_positional_encoding:
+            wpe = load_to_torch(get_np_data_single(ckpt_f, "wpe", weight_type=None, is_loaded=True)).reshape(-1, self.global_hidden_units)
+            assert self.max_seq_len <= wpe.size(0), (
+                f"max_seq_len ({self.max_seq_len} must not exceed "
+                f"the value of maximum sequence length during training ({wpe.size(0)})."
+            )
+            w.append(wpe)
+        w.append(get_np_data_single(ckpt_f, "wte", weight_type=None, is_loaded=True))
+        if "model.lm_head.weight" in ckpt_f.keys():
+            self.share_embed = False
+            w.append(get_np_data_single(ckpt_f, "lm_head", "weight", True))
+        else:
+            self.share_embed = True
+            w.append(torch.empty(0).to(str_type_map[self.inference_data_type]))
+        gate_list = []
+        for i in range(self.layer_num):
+            print(">>>???>>")
+            if f"model.layers.{i}.mlp.moe.gate.wg.weight" in ckpt_f.keys():
+                gate_list.append(load_to_torch(
+                    f"{ckpt_path}/model.layers.{i}.mlp.moe.gate.wg.weight.bin", True))
+            else:
+                gate_list.append(load_to_torch(
+                    f"{ckpt_path}/model.layers.{i}.mlp.moe.gate.wg.weight.bin", False))
+        w.extend(gate_list)
+        """
+        if self.has_adapters:
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_h_to_4h.weight.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_h_to_4h.weight.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_attention_adapter.moe.experts.dense_h_to_4h.weight.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_h_to_4h.bias.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_h_to_4h.bias.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_attention_adapter.moe.experts.dense_h_to_4h.bias.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_4h_to_h.weight.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_4h_to_h.weight.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_attention_adapter.moe.experts.dense_4h_to_h.weight.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_4h_to_h.bias.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_attention_adapter.dense_4h_to_h.bias.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_attention_adapter.moe.experts.dense_4h_to_h.bias.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_h_to_4h.weight.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_h_to_4h.weight.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.moe.experts.dense_h_to_4h.weight.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_h_to_4h.bias.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_h_to_4h.bias.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.moe.experts.dense_h_to_4h.bias.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_4h_to_h.weight.{tp_rank}.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_4h_to_h.weight.{tp_rank}.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.moe.experts.dense_4h_to_h.weight.{tp_rank}.bin",
+                is_load(i)) for i in range(self.layer_num)])
+            w.extend([load_to_torch(
+                f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_4h_to_h.bias.bin"
+                if os.path.isfile(f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.dense_4h_to_h.bias.bin")
+                else f"{ckpt_path}/model.layers.{i}.after_ffn_adapter.moe.experts.dense_4h_to_h.bias.bin",
+                is_load(i)) for i in range(self.layer_num)])
+        """
+        assert len(self.w) == len(w)
+        # Reshape
+        try:
+            for i in range(len(w)):
+                if w[i].nelement() == self.w[i].nelement():
+                    self.w[i] = w[i].reshape(self.w[i].shape)
+                else:
+                    self.w[i] = w[i]
+        except RuntimeError:
+            raise RuntimeError(
+                f"head_num, size_per_head, vocab_size, and max_seq_len must be the same as the ones during training "
+                f"(idx: {i} expected shape: {self.w[i].shape} got shape: {w[i].shape})."
+            )
+        # transpose calibrate quantize the kernel
+        layer_num = self.layer_num
+        if self.int8_mode != 0:
+            for i in range(layer_num):
+                self.int8_w[i + 0 * layer_num], self.scale[i + 0 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[2 * layer_num + i])
+                self.int8_w[i + 1 * layer_num], self.scale[i + 1 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[4 * layer_num + i])
+                self.int8_w[i + 2 * layer_num], self.scale[i + 2 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[8 * layer_num + i])
+                self.int8_w[i + 3 * layer_num], self.scale[i + 3 *
+                                                           layer_num] = self.weight_transpose_calibrate_quantize(self.w[10 * layer_num + i])
+                # We clear the original weights since they are no longer needed
+                if self.int8_mode == 1:
+                    self.w[2 * layer_num +
+                           i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[4 * layer_num +
+                           i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[8 * layer_num +
+                           i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                    self.w[10 * layer_num +
+                           i] = torch.empty(0).to(str_type_map[self.inference_data_type])
+                if self.has_adapters:
+                    self.int8_w[i + 4 * layer_num], self.scale[i + 4 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[12 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 5 * layer_num], self.scale[i + 5 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[14 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 6 * layer_num], self.scale[i + 6 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[16 * layer_num + i + self.adapter_offset])
+                    self.int8_w[i + 7 * layer_num], self.scale[i + 7 * layer_num] = self.weight_transpose_calibrate_quantize(
+                        self.w[18 * layer_num + i + self.adapter_offset])
+                    # Similar to above:
+                    if self.int8_mode == 1:
+                        self.w[12 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[14 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[16 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+                        self.w[18 * layer_num + i + self.adapter_offset] = torch.empty(
+                            0).to(str_type_map[self.inference_data_type])
+        return True
+class BaseBelleModel(nn.Module):
+    def __init__(self,
+                 head_num, size_per_head,
+                 vocab_size, start_id, end_id, layer_num,
+                 max_seq_len: int,
+                 tensor_para_size: int,
+                 pipeline_para_size: int,
+                 lib_path: typing.Union[str, pathlib.Path],
+                 inference_data_type: str,
+                 inter_size: int = 0,
+                 # gpt_variant_params
+                 layernorm_eps: float = 1e-6,
+                 layernorm_type: typing.Literal['pre_layernorm',
+                                                'post_layernorm'] = "pre_layernorm",
+                 activation_type: str = "Gelu",
+                 gpt_with_moe: bool = False,
+                 expert_num: int = 0,
+                 moe_k: int = 0,
+                 moe_layer_index: typing.List = [],
+                 has_positional_encoding: bool = True,
+                 has_pre_decoder_layernorm: bool = False,
+                 has_post_decoder_layernorm: bool = True,
+                 has_adapters: bool = False,
+                 adapter_inter_size: int = 0,
+                 use_attention_linear_bias: bool = False,
+                 int8_mode: int = 0,
+                 weights_data_type: typing.Union[str, np.dtype] = np.float32,
+                 shared_contexts_ratio: float = 1.0):
+        super().__init__()
+        self.head_num = head_num
+        self.size_per_head = size_per_head
+        self.vocab_size = vocab_size
+        self.start_id = start_id
+        self.end_id = end_id
+        self.layer_num = layer_num
+        self.inter_size = inter_size if inter_size != 0 else 4 * \
+            self.head_num * self.size_per_head
+        # gpt_variant_params
+        self.layernorm_eps = layernorm_eps
+        self.layernorm_type = layernorm_type
+        self.activation_type = activation_type
+        self.gpt_with_moe = gpt_with_moe
+        self.expert_num = expert_num
+        self.moe_k = moe_k
+        self.moe_layer_index = moe_layer_index
+        self.has_positional_encoding = has_positional_encoding
+        self.has_pre_decoder_layernorm = has_pre_decoder_layernorm
+        self.has_post_decoder_layernorm = has_post_decoder_layernorm
+        self.has_adapters = has_adapters
+        self.adapter_inter_size = adapter_inter_size
+        self.use_attention_linear_bias = use_attention_linear_bias
+        # multi-gpu params
+        self.tensor_para_size = tensor_para_size
+        self.pipeline_para_size = pipeline_para_size
+        self.use_sparse_gemm = False
+        self.build_model = False
+        self.int8_mode = int8_mode
+        self.weights_data_type = weights_data_type
+        self.shared_contexts_ratio = shared_contexts_ratio
+        assert torch.cuda.is_available(), "CUDA is required for this model."
+        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
+        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
+        # Load the C++ model into Pytorch model.
+        torch.classes.load_library(os.path.abspath(lib_path))
+        # Prepare weights
+        self.weights = BaseBelleWeights(head_num, size_per_head, layer_num, vocab_size,
+                                        max_seq_len, tensor_para_size, pipeline_para_size,
+                                        weights_data_type=weights_data_type,
+                                        inference_data_type=inference_data_type,
+                                        gpt_with_moe=self.gpt_with_moe,
+                                        has_positional_encoding=self.has_positional_encoding,
+                                        has_pre_decoder_layernorm=self.has_pre_decoder_layernorm,
+                                        has_post_decoder_layernorm=self.has_post_decoder_layernorm,
+                                        has_adapters=self.has_adapters,
+                                        adapter_inter_size=self.adapter_inter_size,
+                                        int8_mode=int8_mode,
+                                        inter_size=inter_size)
+        # Prepare for tensor/pipeline parallel
+        try:
+            dist.init_process_group(backend='mpi')
+        except:
+            print("[INFO] WARNING: Have initialized the process group")
+        self.rank = dist.get_rank()
+        self.device_count = torch.cuda.device_count()
+        self.device = self.rank % self.device_count
+        torch.cuda.set_device(self.device)
+        world_size = dist.get_world_size()
+        assert world_size == tensor_para_size * \
+            pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."
+        self.tensor_para_rank = self.rank % self.tensor_para_size
+        self.pipeline_para_rank = self.rank // self.tensor_para_size
+    def load(self, ckpt_path):
+        is_load = self.weights.load(ckpt_path, tp_rank=self.tensor_para_rank,
+                                    pipeline_para_rank=self.pipeline_para_rank)
+        self.cuda()
+        torch.cuda.empty_cache()  # clean cache for model weight preprocessing
+        return is_load
+    def sparse(self):
+        if not self.use_sparse_gemm:
+            self.use_sparse_gemm = True
+    def cuda(self):
+        self.weights._map(lambda w: w.cuda(self.device))
+        if self.int8_mode != 0:
+            self.weights._map_int8(lambda w: w.cuda(self.device))
+        if self.build_model:
+            del self.model
+            self.build_model = False
+        self.model = torch.classes.FasterTransformer.GptOp(
+            self.head_num, self.size_per_head, self.inter_size,
+            self.layer_num,
+            self.expert_num,
+            self.moe_k,
+            self.moe_layer_index,
+            self.vocab_size, self.start_id, self.end_id,
+            self.use_sparse_gemm,
+            # gpt_variant_params
+            self.layernorm_eps,
+            self.layernorm_type,
+            self.activation_type,
+            self.has_positional_encoding,
+            self.has_pre_decoder_layernorm,
+            self.has_post_decoder_layernorm,
+            self.has_adapters,
+            self.adapter_inter_size,
+            self.use_attention_linear_bias,
+            self.weights.w)
+        self.build_model = True
+    def forward(self,
+                start_ids: torch.IntTensor,
+                start_lengths: torch.IntTensor,
+                output_len: int,
+                beam_width: int = 1,
+                top_k: typing.Optional[torch.IntTensor] = None,
+                top_p: typing.Optional[torch.FloatTensor] = None,
+                beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = None,
+                temperature: typing.Optional[torch.FloatTensor] = None,
+                len_penalty: typing.Optional[torch.FloatTensor] = None,
+                repetition_penalty: typing.Optional[torch.FloatTensor] = None,
+                presence_penalty: typing.Optional[torch.FloatTensor] = None,
+                min_length: typing.Optional[torch.IntTensor] = None,
+                random_seed: typing.Optional[torch.LongTensor] = None,
+                bad_words_list: typing.Optional[torch.IntTensor] = None,
+                return_output_length: bool = False,
+                return_cum_log_probs: int = 0):
+        if not self.build_model:
+            # for the cases we don't load model
+            self.cuda()
+            torch.cuda.empty_cache()  # clean cache for model weight preprocessing
+        input_len = start_ids.size(1)
+        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
+        # Inputs to device
+        start_ids = start_ids.cuda(self.device)
+        start_lengths = start_lengths.cuda(self.device)
+        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
+        outputs = self.model.forward(start_ids,
+                                     start_lengths,
+                                     output_len,
+                                     beam_width,  # optional, can be None
+                                     top_k,  # optional, can be None
+                                     top_p,  # optional, can be None
+                                     beam_search_diversity_rate,  # optional, can be None
+                                     temperature,  # optional, can be None
+                                     len_penalty,  # optional, can be None
+                                     repetition_penalty,  # optional, can be None
+                                     presence_penalty,  # optional, can be None
+                                     min_length,  # optional, can be None
+                                     random_seed,  # optional, can be None
+                                     bad_words_list,  # optional, can be None
+                                     return_cum_log_probs)  # optional, can be None
+        if return_cum_log_probs == 0:
+            output_ids, output_lengths = outputs
+        else:
+            output_ids, output_lengths, output_cum_log_probs = outputs
+        if return_output_length:
+            if return_cum_log_probs > 0:
+                return output_ids, output_lengths, output_cum_log_probs
+            else:
+                return output_ids, output_lengths
+        else:
+            return output_ids
+    def set_input_tensor(self, input_tensor):
+        """Set input tensor to be used instead of forward()'s input.
+        When doing pipeline parallelism the input from the previous
+        stage comes from communication, not from the input, so the
+        model's forward_step_func won't have it. This function is thus
+        used by internal code to bypass the input provided by the
+        forward_step_func"""
+        self.input_tensor = input_tensor
+class BaseParallelBelleModel(BaseBelleModel):
+    def cuda(self):
+        self.weights._map(lambda w: w.cuda(self.device))
+        if self.int8_mode != 0:
+            self.weights._map_int8(lambda w: w.cuda(self.device))
+        if self.build_model:
+            del self.model
+            self.build_model = False
+        self.model = torch.classes.FasterTransformer.ParallelGptOp(
+            self.head_num, self.size_per_head, self.inter_size,
+            self.layer_num,
+            self.expert_num,
+            self.moe_k,
+            self.moe_layer_index,
+            self.vocab_size, self.start_id, self.end_id,
+            self.tensor_para_size, self.pipeline_para_size, self.int8_mode,
+            # GPT variant parameters
+            self.layernorm_eps,
+            self.layernorm_type,
+            self.activation_type,
+            self.has_positional_encoding,
+            self.has_pre_decoder_layernorm,
+            self.has_post_decoder_layernorm,
+            self.has_adapters,
+            self.adapter_inter_size,
+            self.use_attention_linear_bias,
+            self.weights.w,
+            self.weights.int8_w,
+            self.weights.scale,
+            self.shared_contexts_ratio)
+        self.build_model = True
+class BelleWeight(BaseBelleWeights):
+    def __init__(self, head_num, size_per_head, layer_num, vocab_size,
+                 tensor_para_size, pipeline_para_size, weights_data_type, inference_data_type,
+                 int8_mode=0):
+        super().__init__(
+            head_num, size_per_head, layer_num, vocab_size, 0,
+            tensor_para_size, pipeline_para_size, weights_data_type,
+            inference_data_type,
+            has_adapters=False,
+            adapter_inter_size=0,
+            has_positional_encoding=False,
+            has_pre_decoder_layernorm=True,
+            has_post_decoder_layernorm=True,
+            int8_mode=int8_mode)
+class BelleModel(BaseParallelBelleModel):
+    def __init__(self,
+                 head_num, size_per_head,
+                 vocab_size, start_id, end_id, layer_num,
+                 tensor_para_size: int,
+                 pipeline_para_size: int,
+                 lib_path: str | Path,
+                 inference_data_type: str,
+                 weights_data_type: str | np.dtype = np.float32,
+                 layernorm_eps: float = 1e-5,
+                 shared_contexts_ratio: float = 1.0,
+                 int8_mode: int = 0):
+        super().__init__(
+            head_num, size_per_head, vocab_size, start_id, end_id, layer_num,
+            0, tensor_para_size, pipeline_para_size,
+            lib_path=lib_path,
+            inference_data_type=inference_data_type,
+            layernorm_eps=layernorm_eps,
+            # gpt_variant_params
+            layernorm_type="pre_layernorm",
+            activation_type="Gelu",
+            has_positional_encoding=False,
+            has_pre_decoder_layernorm=True,
+            has_post_decoder_layernorm=True,
+            has_adapters=False,
+            adapter_inter_size=0,
+            use_attention_linear_bias=True,
+            int8_mode=int8_mode,
+            weights_data_type=weights_data_type,
+            shared_contexts_ratio=shared_contexts_ratio)
+    def set_input_tensor(self, input_tensor: Optional[torch.Tensor]):
+        """Set input tensor to be used instead of forward()'s input.
+        When doing pipeline parallelism the input from the previous
+        stage comes from communication, not from the input, so the
+        model's forward_step_func won't have it. This function is thus
+        used by internal code to bypass the input provided by the
+        forward_step_func
+        """
+        self.input_tensor = input_tensor

model/1-gpu-fp16.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:606ee9330476cbcc465466b7a3e5ecb5945879ee42197c779b76127c4e87a037
+size 14153067254

model/config.ini ADDED Viewed

	@@ -0,0 +1,20 @@

+[belle]
+model_name=
+num_layer=30
+head_num=32
+inter_size=16384
+size_per_head=128
+vocab_size=250880
+tensor_para_size=1
+weight_data_type=fp16
+model_variant=bloom-pre
+layernorm_eps=1e-05
+layernorm_type=pre_layernorm
+activation_type=Gelu
+has_positional_encoding=False
+has_pre_decoder_layernorm=True
+has_post_decoder_layernorm=True
+use_attention_linear_bias=True
+start_id=1
+end_id=2

model/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

model/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3fa39cd4b1500feb205bcce3b9703a4373414cafe4970e0657b413f7ddd2a9d3
+size 14500438

model/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "<unk>", "eos_token": "</s>", "bos_token": "<s>", "pad_token": "<pad>", "name_or_path": "bigscience/tokenizer", "special_tokens_map_file": null, "tokenizer_class": "BloomTokenizerFast"}

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+huggingface_hub
+numpy
+safetensors
+setuptools
+torch
+transformers