GuoPD committed on
Commit 0cc6a61
Parent: 0c7f845

Upload folder using huggingface_hub

Files changed (3)
  1. config.json +0 -1
  2. configuration_baichuan.py +0 -2
  3. modeling_baichuan.py +15 -19
config.json CHANGED

```diff
@@ -6,7 +6,6 @@
     "AutoConfig": "configuration_baichuan.BaichuanConfig",
     "AutoModelForCausalLM": "modeling_baichuan.BaichuanForCausalLM"
   },
-  "tokenizer_class": "BaichuanTokenizer",
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
```
 
configuration_baichuan.py CHANGED

```diff
@@ -46,7 +46,6 @@ class BaichuanConfig(PretrainedConfig):
         bos_token_id=1,
         eos_token_id=2,
         tie_word_embeddings=False,
-        z_loss_weight=0,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -59,7 +58,6 @@ class BaichuanConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.z_loss_weight = z_loss_weight
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
```
 
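Dropping z_loss_weight from the __init__ signature should stay backward compatible: an older config.json that still contains the key falls through **kwargs into PretrainedConfig.__init__, which stores unknown keys as plain attributes. A small sketch of that behaviour (illustrative only):

```python
from configuration_baichuan import BaichuanConfig

# A stale "z_loss_weight" entry no longer becomes an explicit parameter;
# PretrainedConfig keeps it as an inert attribute, and nothing in the
# updated modeling code reads it anymore.
cfg = BaichuanConfig(z_loss_weight=0.0)
print(getattr(cfg, "z_loss_weight", None))  # 0.0, ignored by the model
```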
modeling_baichuan.py CHANGED

```diff
@@ -502,7 +502,6 @@ class NormHead(nn.Module):
     def forward(self, hidden_states):
         if self.training:
             norm_weight = nn.functional.normalize(self.weight)
-            self.first_flag = True
         elif self.first_flag:
             self.first_flag = False
             self.weight = nn.Parameter(nn.functional.normalize(self.weight))
@@ -529,7 +528,7 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
         self.model = BaichuanModel(config)
 
         self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False)
-        if hasattr(config, "quantization_config") and isinstance(config.quantization_config, dict) and config.quantization_config.get('load_in_4bit', False):
+        if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']:
             try:
                 from .quantizer import quantize_offline, init_model_weight_int4
             except ImportError:
```
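Two behavioural notes on the hunks above. In NormHead.forward, dropping `self.first_flag = True` means the eval-time bake-in of the normalized weight is no longer re-armed when the module returns to training, so a train, eval, train, eval cycle appears to skip the second re-normalization. And the new quantization check is stricter than the removed one: direct indexing raises where the old isinstance/.get() chain returned False. A self-contained sketch of that difference (the stand-in config object is hypothetical):

```python
from types import SimpleNamespace

config = SimpleNamespace(quantization_config={"load_in_4bit": True})  # stand-in

# Removed check: tolerant of a missing key or a non-dict value.
ok_old = (
    hasattr(config, "quantization_config")
    and isinstance(config.quantization_config, dict)
    and config.quantization_config.get("load_in_4bit", False)
)

# New check: raises KeyError if "load_in_4bit" is absent, and TypeError if
# quantization_config is an object (e.g. BitsAndBytesConfig) rather than a dict.
ok_new = (
    hasattr(config, "quantization_config")
    and config.quantization_config["load_in_4bit"]
)

assert ok_old == ok_new  # equivalent only for a well-formed dict like this one
```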
```diff
@@ -609,23 +608,22 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin')
             state_dict = torch.load(model_file, map_location="cpu")
             model.is_quantized = True
-
+
             device_map = kwargs.pop("device_map", None)
             torch_dtype = kwargs.pop("torch_dtype", None)
 
-            if device_map is not None:
-                kwargs = {"no_split_module_classes": model._no_split_modules}
-                target_dtype = CustomDtype.INT4
-                max_memory = get_balanced_memory(
-                    model,
-                    dtype=target_dtype,
-                    low_zero=(device_map == "balanced_low_0"),
-                    max_memory=None,
-                    **kwargs,
-                )
-                kwargs["max_memory"] = max_memory
-                device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
-
+            kwargs = {"no_split_module_classes": model._no_split_modules}
+            target_dtype = CustomDtype.INT4
+            max_memory = get_balanced_memory(
+                model,
+                dtype=target_dtype,
+                low_zero=(device_map == "balanced_low_0"),
+                max_memory=None,
+                **kwargs,
+            )
+            kwargs["max_memory"] = max_memory
+
+            device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs)
 
             model = init_model_weight_int4(config, model, state_dict)
 
             # Set model in evaluation mode to deactivate DropOut modules by default
@@ -706,11 +704,9 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
             loss_fct = CrossEntropyLoss()
             shift_logits = shift_logits.view(-1, self.config.vocab_size)
             shift_labels = shift_labels.view(-1)
-            softmax_normalizer = shift_logits.max(-1).values ** 2
-            z_loss = self.config.z_loss_weight * softmax_normalizer.mean()
             # Enable model parallelism
             shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels) + z_loss
+            loss = loss_fct(shift_logits, shift_labels)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
```
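The first hunk here removes the `if device_map is not None:` guard, so on the 4-bit path accelerate's placement planning (get_balanced_memory followed by infer_auto_device_map) now runs even when the caller passed no device_map; the popped value then only feeds the low_zero flag, which is False unless the caller asked for "balanced_low_0". A usage sketch under that reading (the repo id is again a placeholder assumption):

```python
import torch
from transformers import AutoModelForCausalLM

# Placeholder repo id (assumption). With the guard gone, the int4 loader
# computes a device map via accelerate whether or not one was requested.
model = AutoModelForCausalLM.from_pretrained(
    "baichuan-inc/Baichuan2-13B-Chat",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()
```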
 
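The last hunk deletes the auxiliary z-loss from the training objective, leaving plain cross-entropy. For reference, the removed term can be reconstructed verbatim from the deleted lines; it penalizes the squared per-token maximum logit, a cheap proxy for the softmax log-normalizer:

```python
import torch

def z_loss(shift_logits: torch.Tensor, z_loss_weight: float) -> torch.Tensor:
    # Reconstruction of the removed lines: the squared max logit stands in
    # for the softmax normalizer, and its mean is scaled by the config weight
    # (z_loss_weight, also removed from BaichuanConfig in this commit).
    softmax_normalizer = shift_logits.max(-1).values ** 2
    return z_loss_weight * softmax_normalizer.mean()

# Before this commit: loss = loss_fct(shift_logits, shift_labels) + z_loss
# After this commit:  loss = loss_fct(shift_logits, shift_labels)
```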