""" HGRN model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

HGRN_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class HgrnConfig(PretrainedConfig):
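    r"""
    Configuration class for an HGRN decoder-only language model. It stores the
    hyperparameters used to build the model: vocabulary size, embedding
    dimension (`decoder_embed_dim`), number of decoder layers
    (`decoder_layers`), the activation functions used inside the model
    (`act_fun`, `glu_act`), the GLU hidden size (`glu_dim`), the normalization
    type, and related flags.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Example (a minimal usage sketch; `HgrnModel` is an assumed companion model
    class, not defined in this file):

    ```python
    >>> # Build a configuration with non-default architecture settings
    >>> configuration = HgrnConfig(decoder_embed_dim=768, decoder_layers=12)
    >>> configuration.decoder_layers
    12

    >>> # The configuration is then passed to the model constructor, e.g.:
    >>> # model = HgrnModel(configuration)
    ```
    """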
    model_type = "hgrn"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        vocab_size=50272,
        use_cache=True,
        init_std=0.02,

        decoder_embed_dim=1024,
        decoder_layers=24,
        add_bos_token=False,
        act_fun="swish",
        causal=True,
        use_triton=False,
        glu_act="swish",
        glu_dim=2816,
        bias=False,
        norm_type="layernorm",
        no_scale_embedding=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.use_cache = use_cache
        self.init_std = init_std

        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_layers = decoder_layers
        self.add_bos_token = add_bos_token
        self.act_fun = act_fun
        self.causal = causal
        self.use_triton = use_triton
        self.glu_act = glu_act
        self.glu_dim = glu_dim
        self.bias = bias
        self.norm_type = norm_type
        self.no_scale_embedding = no_scale_embedding