Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- L0/.DS_Store +0 -0
- L0/100M/cfg.json +1 -0
- L0/100M/sae_weights.safetensors +3 -0
- L0/100M/sparsity.safetensors +3 -0
- L0/200M/cfg.json +1 -0
- L0/200M/sae_weights.safetensors +3 -0
- L0/200M/sparsity.safetensors +3 -0
- L0/300M/cfg.json +1 -0
- L0/300M/sae_weights.safetensors +3 -0
- L0/300M/sparsity.safetensors +3 -0
- L0/400M/cfg.json +1 -0
- L0/400M/sae_weights.safetensors +3 -0
- L0/400M/sparsity.safetensors +3 -0
- L0/500M/cfg.json +1 -0
- L0/500M/sae_weights.safetensors +3 -0
- L0/500M/sparsity.safetensors +3 -0
- L0/600M/cfg.json +1 -0
- L0/600M/sae_weights.safetensors +3 -0
- L0/600M/sparsity.safetensors +3 -0
- L0/700M/cfg.json +1 -0
- L0/700M/sae_weights.safetensors +3 -0
- L0/700M/sparsity.safetensors +3 -0
- L0/800M/cfg.json +1 -0
- L0/800M/sae_weights.safetensors +3 -0
- L0/800M/sparsity.safetensors +3 -0
- L0/900M/cfg.json +1 -0
- L0/900M/sae_weights.safetensors +3 -0
- L0/900M/sparsity.safetensors +3 -0
- L0/cfg.json +1 -0
- L0/sae_weights.safetensors +3 -0
- L0/sparsity.safetensors +3 -0
- L1/.DS_Store +0 -0
- L1/100M/cfg.json +1 -0
- L1/100M/sae_weights.safetensors +3 -0
- L1/100M/sparsity.safetensors +3 -0
- L1/200M/cfg.json +1 -0
- L1/200M/sae_weights.safetensors +3 -0
- L1/200M/sparsity.safetensors +3 -0
- L1/300M/cfg.json +1 -0
- L1/300M/sae_weights.safetensors +3 -0
- L1/300M/sparsity.safetensors +3 -0
- L1/400M/cfg.json +1 -0
- L1/400M/sae_weights.safetensors +3 -0
- L1/400M/sparsity.safetensors +3 -0
- L1/500M/cfg.json +1 -0
- L1/500M/sae_weights.safetensors +3 -0
- L1/500M/sparsity.safetensors +3 -0
- L1/600M/cfg.json +1 -0
- L1/600M/sae_weights.safetensors +3 -0
.DS_Store
ADDED
Binary file (10.2 kB). View file
|
|
L0/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
L0/100M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/100M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fccab3d6a4655d7d9a28070802d5849bd449732c7f094a7a4707c7a21f85a7da
|
3 |
+
size 37801344
|
L0/100M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4f9dc061a141bff11c5c258dd875a50b00b010666fdcf6026593179c7673d43a
|
3 |
+
size 24656
|
L0/200M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/200M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09c286f10aedce685b6437d15d9c61087f222cc9c6b09838f05d89cb78bd7235
|
3 |
+
size 37801344
|
L0/200M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d21789328ae9fc2a63ec8e8faf24b5dbc19658b13f97baaa385c2ae378cbae1
|
3 |
+
size 24656
|
L0/300M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/300M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee5919165a9355c63f2efefdd6e5019e355a80e3357f3f15b1c704d5f65133d0
|
3 |
+
size 37801344
|
L0/300M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:80be15be93d330fc81936c5c6eae253b3d5f4e2ed36352f3428eba630f43b4e8
|
3 |
+
size 24656
|
L0/400M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/400M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d526a6a9271e9f5cbec6eb558bfae653b6a80d83797e574dae29468a315d61cb
|
3 |
+
size 37801344
|
L0/400M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8658d33a95393f5270361d27b3a920901fc1d92b194c6664d34ccc99422940d
|
3 |
+
size 24656
|
L0/500M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/500M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3981a91bd59dd240fb183b7d5ddfb976ee0a6c47117f9f8e2c7b2994397ed9c5
|
3 |
+
size 37801344
|
L0/500M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a5cefff7398f4abfb34b354c4a134ca227d754a690f8cbec435b82e5844d05a
|
3 |
+
size 24656
|
L0/600M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/600M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ec985f14abe659420046e2bc17193f9dcb584c117012d7dc189d3f68f7800fb5
|
3 |
+
size 37801344
|
L0/600M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:770c5bef03a7b7360361cc76446a9bc37c36ed23db0b548c2cb149d2da07a441
|
3 |
+
size 24656
|
L0/700M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/700M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53057b1bfbdeb962b16248c00808b53b0bd82c31d1e06055ec9828b95a93b6c7
|
3 |
+
size 37801344
|
L0/700M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40446dfac2931e9264cb5a7c0c0e04f190fd290acac4fb42721733fe239377c5
|
3 |
+
size 24656
|
L0/800M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/800M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d791c8e056991edc50031b4568f00616ebaeb32bd5054ce5f9904d6bb3df3649
|
3 |
+
size 37801344
|
L0/800M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1bb920eb73d4af9845d2f60a9987609e91b5f65ad52f762b2f39ab3bd4c7b716
|
3 |
+
size 24656
|
L0/900M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/900M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:342f2458e8c8a3162605ffe70a14efd59321ab153d2ba02dab30d2b3fbb8ff03
|
3 |
+
size 37801344
|
L0/900M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf563f58b90ed8e879518e2b502d41aa0423acdf8ec158364f7737f9c62ae958
|
3 |
+
size 24656
|
L0/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L0/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6acb7f40a9c8dd71edd3742ee221103ede3afb872a06b51ecd4560c1171e1502
|
3 |
+
size 37801344
|
L0/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e93cee41bb74dca9fd92ed2659b273a82d61bad27333b9929b8d99493e4b6d10
|
3 |
+
size 24656
|
L1/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
L1/100M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/100M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e78395d5410e0860eed2082fa36ea49dc2faeab0f44791a475ed6ab285c4817f
|
3 |
+
size 37801344
|
L1/100M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f780a71547cc3b7e730cf7bdb8366a3e142b8a45828fcabe3fb2fd7dfa21f255
|
3 |
+
size 24656
|
L1/200M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/200M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:76544a19b4eadde706709ea942ea05ce614480de937f0f27597032c6c8906c31
|
3 |
+
size 37801344
|
L1/200M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e49627664d08a1ced5f3517643a662b57e5b09f1dee19a4c6eb8ef921963184
|
3 |
+
size 24656
|
L1/300M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/300M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7572de6c43b8d37140aa71f743bfd23c184d92e8f171b30bbacbebe995779f4
|
3 |
+
size 37801344
|
L1/300M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c79d622b0cc13bff513aabb456229559f75a7e749f50073779f84722ba7e164c
|
3 |
+
size 24656
|
L1/400M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/400M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:139e44c33cf132b5953523ac55c83deaa48a7e7f911efe4a1d6fb21d6c8de8f8
|
3 |
+
size 37801344
|
L1/400M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d068f33713c2495a6a8e5ec6e6d769b1d3af7d6c395c6a6a74a2eae3c741730
|
3 |
+
size 24656
|
L1/500M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/500M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f38f266105cb2215752bc5df84e635b646f418fc9db323315e1d6134cce2396
|
3 |
+
size 37801344
|
L1/500M/sparsity.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f6310aaf90897b3e305cdac83e5c84b1d0cb4f8519ad54a6f3f7abc4119d8b8
|
3 |
+
size 24656
|
L1/600M/cfg.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
|
L1/600M/sae_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5a7d92ff32c6c32f8f1914652bab393d84a173cb55549cf80c1b8ccd6c262a25
|
3 |
+
size 37801344
|