damerajee committed
Commit ff369a7 (1 parent: bd78fad)

Update modeling_Llamoe.py

Files changed (1)
  1. modeling_Llamoe.py +9 -31
modeling_Llamoe.py CHANGED
@@ -747,7 +747,7 @@ class LlamoeSdpaAttention(LlamoeAttention):
         return attn_output, None, past_key_value
 
 
-LLAMA_ATTENTION_CLASSES = {
+LLAMOE_ATTENTION_CLASSES = {
     "eager": LlamoeAttention,
     "flash_attention_2": LlamoeFlashAttention2,
     "sdpa": LlamoeSdpaAttention,
@@ -833,7 +833,7 @@ class LlamoeDecoderLayer(nn.Module):
 
 
 
-LLAMA_START_DOCSTRING = r"""
+LLAMOE_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
@@ -851,12 +851,8 @@ LLAMA_START_DOCSTRING = r"""
 
 
 @add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-@add_start_docstrings(
-    "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-    GEMMOE_START_DOCSTRING,
+    "The bare Llamoe Model outputting raw hidden-states without any specific head on top.",
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlammoePreTrainedModel(PreTrainedModel):
@@ -903,7 +899,7 @@ class LlammoePreTrainedModel(PreTrainedModel):
         layer.self_attn.past_key_value = None
 
 
-GEMMOE_INPUTS_DOCSTRING = r"""
+LLAMOE_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
@@ -967,14 +963,14 @@ GEMMOE_INPUTS_DOCSTRING = r"""
 
 @add_start_docstrings(
     "The bare Gemmoe Model outputting raw hidden-states without any specific head on top.",
-    GEMMOE_START_DOCSTRING,
+    LLAMOE_START_DOCSTRING,
 )
 
 class LlamoeModel(LlammoePreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GemmoeDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamoeDecoderLayer`]
     Args:
-        config: GemmoeConfig
+        config: LlamoeConfig
     """
 
     def __init__(self, config: LlamoeConfig):
@@ -1229,25 +1225,7 @@ class LlamoeForCausalLM(LlammoePreTrainedModel):
         output_router_logits: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        Returns:
-        Example:
-        ```python
-        >>> from transformers import AutoTokenizer, GemmoeForCausalLM
-        >>> model = GemmoeForCausalLM.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Gemmoe-8x7B-v0.1")
-        >>> prompt = "Hey, are you conscious? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-        ```"""
+
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
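For readers skimming the diff, the first hunk only renames the dispatch table from `LLAMA_ATTENTION_CLASSES` to `LLAMOE_ATTENTION_CLASSES`; the mapping itself is unchanged. A minimal, self-contained sketch of the pattern such a table implements is below. The class and config names in the sketch are stand-ins for illustration, not the ones defined in this file.

```python
# Sketch of the attention-backend dispatch pattern; all names here are stand-ins.
from dataclasses import dataclass


class EagerAttention: ...
class SdpaAttention: ...
class FlashAttention2: ...


# Maps the configured implementation name to the attention class to instantiate.
ATTENTION_CLASSES = {
    "eager": EagerAttention,
    "sdpa": SdpaAttention,
    "flash_attention_2": FlashAttention2,
}


@dataclass
class Config:
    _attn_implementation: str = "sdpa"


def build_attention(config: Config):
    # Look up the requested backend; an unknown name raises a KeyError.
    return ATTENTION_CLASSES[config._attn_implementation]()


print(type(build_attention(Config())).__name__)  # SdpaAttention
```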
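Since `modeling_Llamoe.py` ships as custom code inside the model repository rather than in the transformers package, a typical way to exercise the model after this change would look roughly like the sketch below, mirroring the docstring example removed in the last hunk. The repository id is a placeholder, not taken from this commit.

```python
# Hedged usage sketch: "user/llamoe-repo" is a placeholder repo id.
# trust_remote_code=True lets transformers import the repo's modeling_Llamoe.py.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "user/llamoe-repo"  # placeholder, replace with the actual Hub repo
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
generate_ids = model.generate(inputs.input_ids, max_length=30)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])
```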