File size: 1,850 Bytes
8e08011 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from packaging import version
import transformers

# Fail fast with an actionable message when the installed transformers release
# is older than 4.31.0. The check deliberately runs before the remaining
# transformers imports below.
if version.parse(transformers.__version__) < version.parse("4.31.0"):
    raise ImportError(
        f"You are using transformers=={transformers.__version__}, but transformers>=4.31.0 is required to use DeciLM. Please upgrade transformers."
    )

from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils import logging

# Module-level logger named after this module.
logger = logging.get_logger(__name__)

# Empty archive map; no pretrained config locations are registered in this module.
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeciLMConfig(LlamaConfig):
    r"""
    Configuration class for DeciLM, extending `LlamaConfig` with a per-layer key-value head
    count and flags selecting the attention implementation used in each inference phase.

    Any keyword argument not listed below is forwarded unchanged to `LlamaConfig.__init__`.

    Args:
        num_key_value_heads_per_layer (`List[int]`, *optional*, defaults to `None`):
            The number of key-value heads per layer.
        naive_attention_prefill (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during prefill.
        naive_attention_decode_batched (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1.
        naive_attention_decode_single (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1.
        **kwargs:
            Remaining keyword arguments, passed through to `LlamaConfig`.
    """

    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        num_key_value_heads_per_layer: list = None,
        naive_attention_prefill: bool = False,
        naive_attention_decode_batched: bool = False,
        naive_attention_decode_single: bool = False,
        **kwargs,
    ):
        # DeciLM-specific fields are stored on the instance first, in the same
        # order as the original implementation, before the parent initializer runs.
        self.num_key_value_heads_per_layer = num_key_value_heads_per_layer
        self.naive_attention_prefill = naive_attention_prefill
        self.naive_attention_decode_batched = naive_attention_decode_batched
        self.naive_attention_decode_single = naive_attention_decode_single

        # Everything else is handled by LlamaConfig.
        super().__init__(**kwargs)
|