Update modeling_chatglm.py
modeling_chatglm.py  +90 −29
CHANGED
@@ -157,7 +157,7 @@ class RotaryEmbedding(nn.Module):
         )


-@torch.jit.script
+# @torch.jit.script
 def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
     # x: [sq, b, np, hn]
     sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
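Note: with @torch.jit.script commented out, apply_rotary_pos_emb now runs as ordinary eager Python. If scripting were still wanted in some environments, one option (a hedged sketch only; the CHATGLM_JIT toggle below is hypothetical and not part of this change) would be a conditional decorator:

    import os
    import torch

    def maybe_jit_script(fn):
        # Hypothetical opt-in: script only when CHATGLM_JIT=1 is set in the environment.
        return torch.jit.script(fn) if os.environ.get("CHATGLM_JIT") == "1" else fn

apply_rotary_pos_emb could then be wrapped with maybe_jit_script instead of carrying a hard-coded decorator.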
@@ -223,8 +223,7 @@ class CoreAttention(torch.nn.Module):
         if pytorch_major_version >= 2:
             query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
             if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
-                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
-                                                                                 is_causal=True)
+                context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, is_causal=True)
             else:
                 if attention_mask is not None:
                     attention_mask = ~attention_mask
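Note: torch.nn.functional.scaled_dot_product_attention with is_causal=True builds the causal mask internally, so no explicit attention mask is needed on this fast path. A minimal, self-contained sketch of the call (toy shapes, not the model's tensors):

    import torch
    import torch.nn.functional as F

    # [batch, heads, seq, head_dim], matching the permuted q/k/v layout above
    q = torch.randn(1, 2, 5, 8)
    k = torch.randn(1, 2, 5, 8)
    v = torch.randn(1, 2, 5, 8)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    print(out.shape)  # torch.Size([1, 2, 5, 8])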
@@ -237,7 +236,7 @@ class CoreAttention(torch.nn.Module):
         # Raw attention scores

         # [b, np, sq, sk]
-        output_size = (query_layer.size(
+        output_size = (query_layer.size(0), query_layer.size(2), query_layer.size(1), key_layer.size(0))

         # [sq, b, np, hn] -> [sq, b * np, hn]
         query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
@@ -312,7 +311,6 @@ class CoreAttention(torch.nn.Module):

 class SelfAttention(torch.nn.Module):
     """Parallel self-attention layer abstract class.
-
     Self-attention layer takes input with size [s, b, h]
     and returns output of the same size.
     """
@@ -448,7 +446,6 @@ class SelfAttention(torch.nn.Module):

         return output, kv_cache

-
 def _config_to_kwargs(args):
     common_kwargs = {
         "dtype": args.torch_dtype,
@@ -504,7 +501,6 @@ class MLP(torch.nn.Module):

 class GLMBlock(torch.nn.Module):
     """A single transformer layer.
-
     Transformer layer takes input with size [s, b, h] and returns an
     output of the same size.
     """
@@ -597,7 +593,7 @@ class GLMTransformer(torch.nn.Module):
         if self.post_layer_norm:
             LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
             # Final layer norm before output.
-            self.
+            self.norm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                       dtype=config.torch_dtype)

         self.gradient_checkpointing = False
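Note: the final norm of GLMTransformer is now stored as self.norm, selected as RMSNorm or LayerNorm depending on config.rmsnorm. A rough, self-contained sketch of that selection pattern (the RMSNorm below is a minimal stand-in, not this file's implementation):

    import torch
    import torch.nn as nn

    class RMSNorm(nn.Module):
        # Minimal stand-in for illustration only.
        def __init__(self, hidden_size, eps=1e-5):
            super().__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.eps = eps

        def forward(self, x):
            variance = x.float().pow(2).mean(-1, keepdim=True)
            return (x.float() * torch.rsqrt(variance + self.eps)).type_as(x) * self.weight

    use_rmsnorm = True
    LayerNormFunc = RMSNorm if use_rmsnorm else nn.LayerNorm
    norm = LayerNormFunc(16, eps=1e-5)
    print(norm(torch.randn(2, 4, 16)).shape)  # torch.Size([2, 4, 16])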
@@ -653,7 +649,7 @@ class GLMTransformer(torch.nn.Module):

         # Final layer norm.
         if self.post_layer_norm:
-            hidden_states = self.
+            hidden_states = self.norm(hidden_states)

         return hidden_states, presents, all_hidden_states, all_self_attentions

@@ -740,7 +736,14 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         init_kwargs = {}
         if device is not None:
             init_kwargs["device"] = device
-
+
+        self.embed_tokens = nn.Embedding(
+            config.padded_vocab_size,
+            config.hidden_size,
+            dtype=config.torch_dtype,
+            device=device
+        )
+
         self.num_layers = config.num_layers
         self.multi_query_group_num = config.multi_query_group_num
         self.kv_channels = config.kv_channels
@@ -753,9 +756,21 @@ class ChatGLMModel(ChatGLMPreTrainedModel):

         self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
                                               dtype=config.torch_dtype)
-
-
-
+
+        # Transformer layers.
+        def build_layer(layer_number):
+            return GLMBlock(config, layer_number, device=device)
+
+        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
+        self.num_layers = config.num_layers
+        self.post_layer_norm = config.post_layer_norm
+
+        if self.post_layer_norm:
+            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.norm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
+                                      dtype=config.torch_dtype)
+
         self.pre_seq_len = config.pre_seq_len
         self.prefix_projection = config.prefix_projection
         if self.pre_seq_len is not None:
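Note: ChatGLMModel now owns its embedding, transformer blocks, and final norm directly as embed_tokens, layers, and norm, rather than delegating to nested submodules. A hedged smoke-test sketch of the resulting structure (assumes an already-loaded config; illustrative only):

    import torch.nn as nn

    model = ChatGLMModel(config, device="cpu")
    assert isinstance(model.embed_tokens, nn.Embedding)   # padded_vocab_size x hidden_size
    assert len(model.layers) == config.num_layers         # one GLMBlock per layer
    if config.post_layer_norm:
        print(type(model.norm).__name__)                  # RMSNorm or LayerNorm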
@@ -765,6 +780,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             self.prefix_encoder = PrefixEncoder(config)
             self.dropout = torch.nn.Dropout(0.1)

+        self.gradient_checkpointing = False
+
     def get_input_embeddings(self):
         return self.embedding.word_embeddings

@@ -804,7 +821,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         batch_size, seq_length = input_ids.shape

         if inputs_embeds is None:
-            inputs_embeds = self.
+            inputs_embeds = self.embed_tokens(input_ids)

         if self.pre_seq_len is not None:
             if past_key_values is None:
@@ -827,10 +844,54 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()

         # Run encoder.
-
-
-
-
+        if not past_key_values:
+            past_key_values = [None for _ in range(self.num_layers)]
+        presents = () if use_cache else None
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        all_self_attentions = None
+        all_hidden_states = () if output_hidden_states else None
+
+        hidden_states = inputs_embeds
+        # To comply with former chat-glm format that expects (seqlen, bs, hd)
+        hidden_states = hidden_states.permute(1, 0, 2)
+
+        for index, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    layer,
+                    hidden_states,
+                    full_attention_mask,
+                    rotary_pos_emb,
+                    past_key_values[index],
+                    use_cache
+                )
+            else:
+                layer_ret = layer(
+                    hidden_states,
+                    full_attention_mask,
+                    rotary_pos_emb,
+                    kv_cache=past_key_values[index],
+                    use_cache=use_cache
+                )
+            hidden_states, kv_cache = layer_ret
+            if use_cache:
+                presents = presents + (kv_cache,)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.norm(hidden_states)

         if not return_dict:
             return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
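Note: the layer loop is now inlined in ChatGLMModel.forward, and when gradient checkpointing is active each block runs under torch.utils.checkpoint.checkpoint (with use_cache forced off, as the warning above states). A generic, self-contained sketch of that pattern (toy module, not a GLMBlock):

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    class ToyBlock(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(8, 8)

        def forward(self, x):
            return torch.relu(self.linear(x))

    block = ToyBlock()
    x = torch.randn(4, 8, requires_grad=True)
    # Activations inside the block are recomputed during backward instead of being stored.
    y = checkpoint(block, x, use_reentrant=False)
    y.sum().backward()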
@@ -844,7 +905,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):

     def quantize(self, weight_bit_width: int):
         from .quantization import quantize
-        quantize(self
+        quantize(self, weight_bit_width)
         return self


@@ -853,7 +914,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         super().__init__(config)

         self.max_sequence_length = config.max_length
-        self.
+        self.model = ChatGLMModel(config, empty_init=empty_init, device=device)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.config = config
         self.quantized = False
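Note: ChatGLMForConditionalGeneration now exposes a model backbone plus a separate lm_head projection, mirroring the common decoder-LM layout. A self-contained toy sketch of the head applied to hidden states (toy sizes; the backbone itself is omitted):

    import torch
    import torch.nn as nn

    hidden_size, vocab_size = 8, 32
    lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
    hidden_states = torch.randn(2, 5, hidden_size)   # [batch, seq, hidden]
    lm_logits = lm_head(hidden_states)               # [batch, seq, vocab]
    print(lm_logits.shape)                           # torch.Size([2, 5, 32])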
@@ -934,7 +996,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        transformer_outputs = self.
+        transformer_outputs = self.model(
             input_ids=input_ids,
             position_ids=position_ids,
             attention_mask=attention_mask,
@@ -948,8 +1010,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         hidden_states = transformer_outputs[0]
         if return_last_logit:
             hidden_states = hidden_states[-1:]
-        lm_logits = self.
-        lm_logits = lm_logits.transpose(0, 1).contiguous()
+        lm_logits = self.lm_head(hidden_states)

         loss = None
         if labels is not None:
@@ -1062,8 +1123,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         inputs = inputs.to(self.device)
         if past_key_values is not None:
             past_length = past_key_values[0][0].shape[0]
-            if self.
-                past_length -= self.
+            if self.model.pre_seq_len is not None:
+                past_length -= self.model.pre_seq_len
             inputs.position_ids += past_length
             attention_mask = inputs.attention_mask
             attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
@@ -1205,7 +1266,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):

         self.config.quantization_bit = bits

-        self.
+        self.model = quantize(self.model, bits, empty_init=empty_init, device=device,
                               **kwargs)
         return self

@@ -1215,7 +1276,7 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
         super().__init__(config)

         self.num_labels = config.num_labels
-        self.
+        self.model = ChatGLMModel(config, empty_init=empty_init, device=device)

         self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
         if config.classifier_dropout is not None:
@@ -1242,7 +1303,7 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
     ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        transformer_outputs = self.
+        transformer_outputs = self.model(
             input_ids=input_ids,
             position_ids=position_ids,
             attention_mask=attention_mask,
@@ -1293,4 +1354,4 @@ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )
|