# Provenance: modeling.py uploaded with huggingface_hub by malenia1 (commit ad5b231, verified).
import os
os.environ["PATH"] = "/usr/local/cuda/bin:" + os.environ["PATH"]
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
import bitblas
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, PreTrainedModel, PretrainedConfig,AutoModel,AutoConfig,BertPreTrainedModel
class bitlinear(bitblas.Linear):
    """``bitblas.Linear`` with an extra output scale and an additive bias.

    The matrix product itself is delegated to ``bitblas.Linear.forward``.
    This subclass then multiplies the result by ``alpha``, adds ``b`` when one
    was supplied, and returns the result converted to ``float32``.

    Args:
        in_features, out_features, bias, A_dtype, W_dtype, accum_dtype,
        out_dtype, group_size, with_scaling, with_zeros, zeros_mode,
        fast_decoding: Forwarded unchanged to ``bitblas.Linear``.
        opt_M: Forwarded to ``bitblas.Linear``. ``None`` selects
            ``[1, 16, 32, 64, 128, 256, 512]``.
        alpha: Scale applied to the matmul output. Either a tensor, or a
            ``torch.dtype`` meaning "a scalar 1 of that dtype".
        b: Optional bias tensor added after scaling. ``None`` disables it.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        A_dtype: str = "float16",
        W_dtype: str = "int2",
        accum_dtype: str = "float16",
        out_dtype: str = "float16",
        group_size: int = -1,
        with_scaling: bool = False,
        with_zeros: bool = False,
        zeros_mode: str = None,
        opt_M: list = None,
        fast_decoding: bool = True,
        alpha: "torch.Tensor | torch.dtype" = torch.float16,
        b: torch.Tensor = None,
    ):
        # A list literal as a default would be shared by every instance, so the
        # default is resolved here; a caller-supplied sequence is copied.
        if opt_M is None:
            opt_M = [1, 16, 32, 64, 128, 256, 512]
        else:
            opt_M = list(opt_M)

        super().__init__(
            in_features=in_features,
            out_features=out_features,
            bias=bias,
            A_dtype=A_dtype,
            W_dtype=W_dtype,
            accum_dtype=accum_dtype,
            out_dtype=out_dtype,
            group_size=group_size,
            with_scaling=with_scaling,
            with_zeros=with_zeros,
            zeros_mode=zeros_mode,
            opt_M=opt_M,
            fast_decoding=fast_decoding,
        )

        # The historical default for ``alpha`` is a dtype, which nn.Parameter
        # cannot wrap. Treat a dtype as "identity scale stored in that dtype".
        if isinstance(alpha, torch.dtype):
            alpha = torch.ones((), dtype=alpha)
        self.alpha = nn.Parameter(alpha, requires_grad=False)

        # nn.Parameter(None) silently creates an *empty* tensor, which would
        # make the ``is not None`` check in forward() always true and then
        # fail in expand_as(). Register a real ``None`` instead.
        if b is None:
            self.register_parameter("b", None)
        else:
            self.b = nn.Parameter(b, requires_grad=False)

    def forward(self, A: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
        """Run the BitBLAS matmul, then scale, add the bias and cast.

        Args:
            A: Activation tensor passed to ``bitblas.Linear.forward``.
            out: Optional output tensor passed to ``bitblas.Linear.forward``.

        Returns:
            The scaled (and optionally biased) result as a ``float32`` tensor.
        """
        out = super().forward(A, out)
        # Scaling and bias are applied in place on the matmul result.
        out *= self.alpha
        if self.b is not None:
            out += self.b.view(1, -1).expand_as(out)
        return out.to(torch.float32)
class TernaryBertConfig(BertConfig):
    """``BertConfig`` subclass whose ``model_type`` is ``"ternarybert"``.

    The constructor takes keyword arguments only and hands them, unchanged,
    to ``BertConfig``.
    """

    model_type = "ternarybert"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
class TernaryBert(PreTrainedModel):
    """BERT encoder whose ``nn.Linear`` submodules are swapped for ``bitlinear``.

    A regular ``BertModel`` is built from ``config`` and every ``nn.Linear``
    found anywhere beneath it is then replaced by a ``bitlinear`` layer of the
    same shape. Only the bias of each original layer is carried over; the
    original weight matrix is not copied into the replacement.
    """

    # NOTE: the stock BertConfig is the registered config class;
    # TernaryBertConfig is not used as ``config_class``.
    config_class = BertConfig

    def __init__(self, config):
        """Build the underlying ``BertModel`` and replace its linear layers."""
        super().__init__(config)
        self.bert = BertModel(config)
        self.replace_linear2bitblas(self.bert)

    def forward(self, *args, **kwargs):
        """Forward every argument to the wrapped ``BertModel``.

        Positional arguments are accepted as well as keywords, so both
        ``model(input_ids)`` and ``model(input_ids=...)`` work.
        """
        return self.bert(*args, **kwargs)

    def convert_to_bitlinear(self, layer):
        """Create a ``bitlinear`` layer matching the shape of ``layer``.

        Args:
            layer: Source layer; its ``in_features``, ``out_features`` and
                ``bias`` are read.

        Returns:
            A new ``bitlinear`` with ``alpha`` set to a float16 scalar 1 and
            ``b`` set to the float16 copy of ``layer.bias`` (or zeros when the
            source layer has no bias).
        """
        if layer.bias is None:
            # nn.Linear(bias=False) has no bias tensor; use an explicit zero
            # bias rather than dereferencing ``None``.
            bias = torch.zeros(
                layer.out_features,
                dtype=torch.float16,
                device=layer.weight.device,
            )
        else:
            bias = layer.bias.data.to(torch.float16)

        return bitlinear(
            in_features=layer.in_features,
            out_features=layer.out_features,
            bias=False,
            A_dtype="float16",  # activation A dtype
            W_dtype="int2",  # weight W dtype
            accum_dtype="float16",  # accumulation dtype
            out_dtype="float16",  # output dtype
            # configs for weight-only quantization
            group_size=-1,  # setting for grouped quantization
            with_scaling=False,  # setting for scaling factor
            with_zeros=False,  # setting for zeros
            zeros_mode=None,  # setting for how zeros are calculated
            # Target optimization var for dynamic symbolic.
            # For detailed information see docs/PythonAPI.md in BitBLAS.
            opt_M=[1, 16, 32, 64, 128, 256, 512],
            fast_decoding=True,
            alpha=torch.tensor(1.0, dtype=torch.float16),
            b=bias,
        )

    def replace_linear2bitblas(self, model):
        """Recursively replace each ``nn.Linear`` under ``model`` in place.

        Args:
            model: Module whose direct and nested children are inspected.
        """
        # Iterate over a snapshot because children are reassigned in the loop.
        for name, module in list(model.named_children()):
            if isinstance(module, nn.Linear):
                setattr(model, name, self.convert_to_bitlinear(module))
            else:
                # Recursing into a module without children is a no-op.
                self.replace_linear2bitblas(module)