ghidav commited on
Commit
cd6d31f
1 Parent(s): 3b8e8bf

Upload 30 files

Browse files
L5/100M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/100M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f5ed43338b47070d998fe37de7833c4ce9381e7488effbd6249f2e04f707e08
3
+ size 37801344
L5/100M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08093f73aaebd0dd1c446aedd3d151b99f792596847b746bbd925bdd45d5cc29
3
+ size 24656
L5/200M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/200M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75cf14094cd70261c95c56bea1e6225411bf8904e0b0097476ad3443f6108dfd
3
+ size 37801344
L5/200M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:991eb66e80d2531e3ef83447547823ded974e686ec509aeb1c2a7683ac2fb877
3
+ size 24656
L5/300M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/300M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ee9a5f14617871c28b595696c48938447c37a5b6dfe13d3c565d3573f70af44
3
+ size 37801344
L5/300M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:535ddeff7951b95321b9e95596c016248ecceba54d0da51163d0414027521643
3
+ size 24656
L5/400M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/400M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d0596b4a18723c7c9aeebec3b92b29080afaf3732168ba034ac43fd37c538f
3
+ size 37801344
L5/400M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04d775547bd4dc2cb8a5c7831846b69ae07fe9b9283061d766b791534f2edbb8
3
+ size 24656
L5/500M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/500M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6af7a346d361aed354dfab89eacc6c1047cc4c8002535f9194962c4219657ac7
3
+ size 37801344
L5/500M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8542c86364a43a216471dfa2436d710ab579ea1cf6a071b91ecb36d0012bcde8
3
+ size 24656
L5/600M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/600M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7caa1916864aa5ac7da60fd6099a46367be442a21d2b63b8e7168bea2066ee0c
3
+ size 37801344
L5/600M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c66322d4b9d989fe142c43565f4aa14a2e4d8c7edf29299fcceccdbad70cbbcf
3
+ size 24656
L5/700M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/700M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65822574774edd5137b807d18e36726ace965b1d0c0d0041166bc08cff4c6642
3
+ size 37801344
L5/700M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8011a2296062ace53fdb7efcf3da4f77b54a3d93fe12ec0c05e8171d7f00b4a9
3
+ size 24656
L5/800M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/800M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77653a70f069fff72b1269d4d92ee79afd9488a270dec87b3171e5a6d6c5654
3
+ size 37801344
L5/800M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e8b565a31d55d3f98915837d27ad2f2ec37134950d9e1f0bbe1ef98240fb532
3
+ size 24656
L5/900M/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/900M/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63541fc3c86ff94ab6bd4b10d8fcb6a638d1405fd1cc6bbb1afc00c5a500eba1
3
+ size 37801344
L5/900M/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931565aa35d62c1c011b7726d8f818a3b9f26e61732fb889e2e4f682b8e1ad39
3
+ size 24656
L5/cfg.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6974c7bd8502c276ad9b3363f9b4f3d2d9368a50878a9e77853562f0b620becb
3
+ size 37801344
L5/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e1bf4d35ccb0f088f422226609090061dc73c3488c1ab6a2a5c9c9f17139c1
3
+ size 24656