{
  "device": "cuda:0",
  "seed": 42,
  "dtype": "torch.bfloat16",
  "hook_point_in": "blocks.23.hook_attn_out",
  "hook_point_out": "blocks.23.hook_attn_out",
  "use_decoder_bias": true,
  "apply_decoder_bias_to_pre_encoder": false,
  "expansion_factor": 8,
  "d_model": 4096,
  "d_sae": 32768,
  "bias_init_method": "all_zero",
  "act_fn": "jumprelu",
  "jump_relu_threshold": 0.119140625,
  "norm_activation": "dataset-wise",
  "dataset_average_activation_norm": {
    "in": 2.890625,
    "out": 2.890625
  },
  "decoder_exactly_fixed_norm": false,
  "sparsity_include_decoder_norm": true,
  "use_glu_encoder": false,
  "init_decoder_norm": 0.5,
  "init_encoder_norm": null,
  "init_encoder_with_decoder_transpose": true,
  "lp": 1,
  "l1_coefficient": 8e-05,
  "l1_coefficient_warmup_steps": 39062,
  "top_k": 50,
  "k_warmup_steps": 39062,
  "use_batch_norm_mse": true,
  "use_ghost_grads": false,
  "tp_size": 1,
  "ddp_size": 1
}