{
  "device": "cuda:0",
  "seed": 42,
  "dtype": "torch.bfloat16",
  "hook_point_in": "blocks.7.hook_attn_out",
  "hook_point_out": "blocks.7.hook_attn_out",
  "use_decoder_bias": true,
  "apply_decoder_bias_to_pre_encoder": false,
  "expansion_factor": 8,
  "d_model": 4096,
  "d_sae": 32768,
  "bias_init_method": "all_zero",
  "act_fn": "jumprelu",
  "jump_relu_threshold": 0.1005859375,
  "norm_activation": "dataset-wise",
  "dataset_average_activation_norm": {
    "in": 2.34375,
    "out": 2.34375
  },
  "decoder_exactly_fixed_norm": false,
  "sparsity_include_decoder_norm": true,
  "use_glu_encoder": false,
  "init_decoder_norm": 0.5,
  "init_encoder_norm": null,
  "init_encoder_with_decoder_transpose": true,
  "lp": 1,
  "l1_coefficient": 8e-05,
  "l1_coefficient_warmup_steps": 39062,
  "top_k": 50,
  "k_warmup_steps": 39062,
  "use_batch_norm_mse": true,
  "use_ghost_grads": false,
  "tp_size": 1,
  "ddp_size": 1
}