{ "n_layers": 2, "d_model": 256, "n_ctx": 2048, "d_head": 32, "model_name": "custom", "n_heads": 8, "d_vocab": 50259, "eps": 1e-05, "use_attn_result": True, "use_attn_scale": True, "use_split_qkv_input": False, "use_local_attn": False, "from_checkpoint": False, "init_mode": "gpt2", "normalization_type": "LN", "device": "cuda", "attention_dir": "causal", "attn_only": True, "initializer_range": 0.05, "init_weights": True, "scale_attn_by_inverse_layer_idx": False, "positional_embedding_type": "shortformer", "final_rms": False, "d_vocab_out": 50259, "parallel_attn_mlp": False, "n_params": 524288, "use_hook_tokens": False, }