oweller2 committed
Commit 9bfde91
1 Parent(s): 1400590
Files changed (3)
  1. attention.py +1 -1
  2. config.json +1 -1
  3. modeling_flexbert.py +0 -1
attention.py CHANGED
@@ -863,7 +863,7 @@ class FlexBertUnpadRopeAttention(FlexBertAttentionBase):
         qkv = self.Wqkv(hidden_states)
 
         # only needed for inference when we have KV cache
-        seqlen_offset = max_seqlen * (cu_seqlens[0].item() // max_seqlen)
+        seqlen_offset = 0
 
         # (total_seqlen, 3, nheads, headdim)
         qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
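The removed expression reconstructed an offset from the cumulative sequence lengths, while the replacement pins seqlen_offset to 0, so rotary positions always start from the beginning of the sequence rather than continuing past cached tokens. As a rough illustration of what such an offset does (a minimal standalone sketch of the usual RoPE cos/sin construction, not this repo's implementation; the function name and signature are assumptions):

    import torch

    def rotary_cos_sin(seqlen, rotary_dim, base=10000.0, seqlen_offset=0):
        # Inverse frequencies for each pair of rotary dimensions.
        inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2).float() / rotary_dim))
        # With seqlen_offset=0 positions start at 0; a nonzero offset shifts them,
        # which is what a KV cache needs when new tokens continue an old sequence.
        positions = torch.arange(seqlen_offset, seqlen_offset + seqlen).float()
        freqs = torch.outer(positions, inv_freq)
        return freqs.cos(), freqs.sin()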
config.json CHANGED
@@ -74,7 +74,7 @@
   "padding": "unpadded",
   "pooling_type": "cls",
   "position_embedding_type": "absolute",
-  "rotary_emb_base": 10000.0,
+  "rotary_emb_base": 1000.0,
   "rotary_emb_dim": 64,
   "rotary_emb_interleaved": false,
   "rotary_emb_scale_base": null,
modeling_flexbert.py CHANGED
@@ -1733,7 +1733,6 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
         input_ids, indices, cu_seqlens, max_seqlen, position_ids, _ = self.unpad_inputs(
             input_ids, attention_mask, position_ids, None
         )
-        breakpoint()
         return {
             "input_ids": input_ids,
             "attention_mask": attention_mask,