Crystalcareai committed
Commit • f2459a7
1 Parent(s): 6f470a7
Update modeling_quiet.py
modeling_quiet.py  CHANGED  (+5 -3)
@@ -910,7 +910,7 @@ class QuietModel(QuietPreTrainedModel):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
             [QuietDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
@@ -1102,6 +1102,7 @@ class QuietModel(QuietPreTrainedModel):
             past_key_values=next_cache,
             hidden_states=all_hidden_states,
             attentions=all_self_attns,
+            logits=self.lm_head(hidden_states),
         )
 
 
@@ -1215,7 +1216,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
         )
 
         hidden_states = outputs.last_hidden_state
-        base_logits =
+        base_logits = outputs.logits  # Use the logits from the model output
 
         thought_ids, thought_embeddings = self.model._generate_thoughts(hidden_states, max_length=self.thought_length)
         thought_hidden_states = self.model(inputs_embeds=thought_embeddings).last_hidden_state
@@ -1224,7 +1225,6 @@ class QuietForCausalLM(QuietPreTrainedModel):
         mixing_input = torch.cat([hidden_states, thought_hidden_states], dim=-1)
         mixing_weights = self.mixing_head(mixing_input).squeeze(-1)  # (batch_size, seq_length)
         mixed_logits = base_logits * (1 - mixing_weights.unsqueeze(-1)) + thought_logits * mixing_weights.unsqueeze(-1)
-
         loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
@@ -1240,6 +1240,8 @@ class QuietForCausalLM(QuietPreTrainedModel):
             rewards = torch.clamp(rewards, min=0)
             policy_loss = self.calculate_policy_loss(thought_ids, rewards)
             loss = loss + policy_loss
+        else:
+            loss = None
 
         if not return_dict:
             output = (mixed_logits,) + outputs[1:]
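For reference, a minimal standalone sketch of the data flow this commit wires up, using hypothetical tensor sizes and a stand-in mixing_head (the real shapes, the mixing_head definition, and the thought-logit computation live elsewhere in modeling_quiet.py): hidden states are projected through the new lm_head to produce base logits, which are then blended with thought logits using the per-position mixing weights, as in the mixed_logits line above.

import torch
import torch.nn as nn

# Hypothetical sizes for illustration only; the real values come from the model config.
batch_size, seq_len, hidden_size, vocab_size = 2, 8, 64, 100

# Stand-ins for the tensors the diff operates on.
hidden_states = torch.randn(batch_size, seq_len, hidden_size)          # outputs.last_hidden_state
thought_hidden_states = torch.randn(batch_size, seq_len, hidden_size)  # from the thought pass

# The projection this commit adds to QuietModel.__init__; outputs.logits = lm_head(hidden_states).
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
base_logits = lm_head(hidden_states)
thought_logits = lm_head(thought_hidden_states)  # assumption: thought logits use the same head

# Stand-in mixing head (assumed sigmoid gate); the repository defines its own mixing_head.
mixing_head = nn.Sequential(nn.Linear(2 * hidden_size, 1), nn.Sigmoid())
mixing_input = torch.cat([hidden_states, thought_hidden_states], dim=-1)
mixing_weights = mixing_head(mixing_input).squeeze(-1)                 # (batch_size, seq_len)

# Mixing step from the diff: a per-position weight blends the two logit streams.
w = mixing_weights.unsqueeze(-1)                                       # broadcast over the vocab dim
mixed_logits = base_logits * (1 - w) + thought_logits * w              # (batch_size, seq_len, vocab_size)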