Upload folder using huggingface_hub

Files changed:
- config.json +0 -1
- configuration_baichuan.py +0 -2
- modeling_baichuan.py +15 -19
config.json
CHANGED
@@ -6,7 +6,6 @@
     "AutoConfig": "configuration_baichuan.BaichuanConfig",
     "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM"
   },
-  "tokenizer_class": "BaichuanTokenizer",
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
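Dropping `tokenizer_class` from config.json should be harmless for loading, since `AutoTokenizer` resolves the class from tokenizer_config.json, which still has to name the tokenizer. A quick sanity check, with a placeholder repo id (not taken from this commit):

```python
from transformers import AutoTokenizer

# "org/baichuan-model" is hypothetical; trust_remote_code allows the repo's
# own tokenizer implementation to be imported from the Hub.
tokenizer = AutoTokenizer.from_pretrained("org/baichuan-model", trust_remote_code=True)
print(type(tokenizer).__name__)  # expected: BaichuanTokenizer
```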
configuration_baichuan.py
CHANGED
@@ -46,7 +46,6 @@ class BaichuanConfig(PretrainedConfig):
         bos_token_id=1,
         eos_token_id=2,
         tie_word_embeddings=False,
-        z_loss_weight=0,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -59,7 +58,6 @@ class BaichuanConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.z_loss_weight = z_loss_weight
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
modeling_baichuan.py
CHANGED
@@ -502,7 +502,6 @@ class NormHead(nn.Module):
     def forward(self, hidden_states):
         if self.training:
             norm_weight = nn.functional.normalize(self.weight)
-            self.first_flag = True
         elif self.first_flag:
             self.first_flag = False
             self.weight = nn.Parameter(nn.functional.normalize(self.weight))
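With `self.first_flag = True` gone from the training branch, the head no longer re-arms the one-time eval-side normalization after each stint of training: the weight is baked to its normalized value only on the very first eval forward. A runnable sketch of the module after this change; everything outside the diffed lines (the `__init__` and the final projection) is an assumption, not taken from this repo:

```python
import math
import torch
import torch.nn as nn

class NormHead(nn.Module):
    """Sketch of a NormHead consistent with the diffed forward()."""

    def __init__(self, hidden_size, vocab_size, bias=False):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(vocab_size, hidden_size))
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        self.first_flag = True

    def forward(self, hidden_states):
        if self.training:
            # While training, always project against a freshly normalized weight.
            norm_weight = nn.functional.normalize(self.weight)
        elif self.first_flag:
            # First eval call: bake the normalization into the parameter once.
            self.first_flag = False
            self.weight = nn.Parameter(nn.functional.normalize(self.weight))
            norm_weight = self.weight
        else:
            norm_weight = self.weight
        return nn.functional.linear(hidden_states, norm_weight)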
@@ -529,7 +528,7 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
         self.model = BaichuanModel(config)

         self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
-        if hasattr(config, "quantization_config") and
+        if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
             try:
                 from .quantizer import quantize_offline, init_model_weight_int4
             except ImportError:
@@ -609,23 +608,22 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
            model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
            state_dict = torch.load(model_file, map_location="cpu")
            model.is_quantized = True
-
+
            device_map = kwargs.pop("device_map", None)
            torch_dtype = kwargs.pop("torch_dtype", None)

-           [13 removed lines; their contents are not preserved in this view]
+           kwargs = {"no_split_module_classes": model._no_split_modules}
+           target_dtype = CustomDtype.INT4
+           max_memory = get_balanced_memory(
+               model,
+               dtype=target_dtype,
+               low_zero=(device_map == "balanced_low_0"),
+               max_memory=None,
+               **kwargs,
+           )
+           kwargs["max_memory"] = max_memory
+
+           device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
            model = init_model_weight_int4(config, model, state_dict)

            # Set model in evaluation mode to deactivate DropOut modules by default
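The added lines lean on accelerate's memory planner, telling it to budget as if the weights were 4-bit before placing modules. A minimal standalone sketch of the same pattern; the helper name is mine, while the accelerate calls mirror the diff:

```python
from accelerate.utils import CustomDtype, get_balanced_memory, infer_auto_device_map

def build_int4_device_map(model):
    # Budget memory as if every weight were stored in 4 bits.
    target_dtype = CustomDtype.INT4
    kwargs = {"no_split_module_classes": model._no_split_modules}
    # Balance the estimated footprint across all visible devices.
    kwargs["max_memory"] = get_balanced_memory(model, dtype=target_dtype, **kwargs)
    # Derive a per-module placement from that budget.
    return infer_auto_device_map(model, dtype=target_dtype, **kwargs)
```

The resulting map could then be handed to `accelerate.dispatch_model(model, device_map=...)` to actually move the modules.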
@@ -706,11 +704,9 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
-            softmax_normalizer = shift_logits.max(-1).values ** 2
-            z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
             # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels) + z_loss
+            loss = loss_fct(shift_logits, shift_labels)

         if not return_dict:
             output = (logits,) + outputs[1:]
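Together with the `z_loss_weight` removal in configuration_baichuan.py, this drops the auxiliary z-loss entirely, leaving plain cross-entropy. For reference, a sketch of what the deleted lines computed; the penalty used the squared per-token max logit as a stand-in for the log-partition term:

```python
import torch
from torch.nn import CrossEntropyLoss

def loss_with_z_loss(shift_logits, shift_labels, z_loss_weight):
    # Standard next-token cross-entropy over flattened tokens.
    ce = CrossEntropyLoss()(shift_logits, shift_labels)
    # The removed auxiliary term: discourage large logits by penalizing
    # the squared per-token max logit (mirrors the deleted lines).
    softmax_normalizer = shift_logits.max(-1).values ** 2
    return ce + z_loss_weight * softmax_normalizer.mean()
```

Since the old config defaulted to `z_loss_weight=0`, the two objectives coincided by default, which is presumably why the term could be deleted without changing behavior.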