|
{ |
|
"sae": { |
|
"expansion_factor": 16, |
|
"normalize_decoder": true, |
|
"num_latents": 0, |
|
"k": 128, |
|
"multi_topk": false, |
|
"jumprelu": false, |
|
"jumprelu_init_threshold": 0.001, |
|
"jumprelu_bandwidth": 0.001, |
|
"jumprelu_target_l0": null, |
|
"jumprelu_per_layer_l0": false, |
|
"init_enc_as_dec_transpose": true, |
|
"init_b_dec_as_zeros": false |
|
}, |
|
"batch_size": 8, |
|
"max_seq_len": 1024, |
|
"num_training_tokens": 1000000000, |
|
"grad_acc_steps": 1, |
|
"micro_acc_steps": 1, |
|
"adam_8bit": false, |
|
"adam_epsilon": 1e-08, |
|
"adam_betas": [ |
|
0.9, |
|
0.999 |
|
], |
|
"lr": null, |
|
"lr_scheduler_name": "constant", |
|
"lr_warmup_steps": 0.0, |
|
"l1_coefficient": 0.0, |
|
"l1_warmup_steps": 0.0, |
|
"use_l2_loss": false, |
|
"auxk_alpha": 0.03125, |
|
"dead_feature_threshold": 10000000, |
|
"hookpoints": [ |
|
"layers.0", |
|
"layers.1", |
|
"layers.2", |
|
"layers.3", |
|
"layers.4", |
|
"layers.5", |
|
"layers.6", |
|
"layers.7", |
|
"layers.8", |
|
"layers.9", |
|
"layers.10", |
|
"layers.11", |
|
"layers.12", |
|
"layers.13", |
|
"layers.14", |
|
"layers.15", |
|
"layers.16", |
|
"layers.17", |
|
"layers.18", |
|
"layers.19", |
|
"layers.20", |
|
"layers.21", |
|
"layers.22", |
|
"layers.23" |
|
], |
|
"layers": [ |
|
0, |
|
1, |
|
2, |
|
3, |
|
4, |
|
5, |
|
6, |
|
7, |
|
8, |
|
9, |
|
10, |
|
11, |
|
12, |
|
13, |
|
14, |
|
15, |
|
16, |
|
17, |
|
18, |
|
19, |
|
20, |
|
21, |
|
22, |
|
23 |
|
], |
|
"layer_stride": 1, |
|
"distribute_modules": true, |
|
"save_every": 100000, |
|
"normalize_activations": 1.0, |
|
"num_norm_estimation_tokens": 5000000, |
|
"clusters": { |
|
"k1-c0": [ |
|
0, |
|
1, |
|
2, |
|
3, |
|
4, |
|
5, |
|
6, |
|
7, |
|
8, |
|
9, |
|
10, |
|
11, |
|
12, |
|
13, |
|
14, |
|
15, |
|
16, |
|
17, |
|
18, |
|
19, |
|
20, |
|
21, |
|
22 |
|
], |
|
"k2-c0": [ |
|
0, |
|
1, |
|
2, |
|
3, |
|
4, |
|
5, |
|
6, |
|
7, |
|
8, |
|
9, |
|
10, |
|
11, |
|
12, |
|
13, |
|
14, |
|
15 |
|
], |
|
"k2-c1": [ |
|
16, |
|
17, |
|
18, |
|
19, |
|
20, |
|
21, |
|
22 |
|
], |
|
"k3-c0": [ |
|
7, |
|
8, |
|
9, |
|
10, |
|
11, |
|
12, |
|
13, |
|
14, |
|
15 |
|
], |
|
"k3-c1": [ |
|
0, |
|
1, |
|
2, |
|
3, |
|
4, |
|
5, |
|
6 |
|
], |
|
"k4-c0": [ |
|
11, |
|
12, |
|
13, |
|
14, |
|
15 |
|
], |
|
"k4-c1": [ |
|
7, |
|
8, |
|
9, |
|
10 |
|
], |
|
"k5-c0": [ |
|
19, |
|
20, |
|
21, |
|
22 |
|
], |
|
"k5-c1": [ |
|
16, |
|
17, |
|
18 |
|
], |
|
"k6-c0": [ |
|
3, |
|
4, |
|
5, |
|
6 |
|
], |
|
"k6-c1": [ |
|
0, |
|
1, |
|
2 |
|
], |
|
"k7-c0": [ |
|
13, |
|
14, |
|
15 |
|
], |
|
"k7-c1": [ |
|
11, |
|
12 |
|
], |
|
"k8-c0": [ |
|
3, |
|
4 |
|
], |
|
"k8-c1": [ |
|
5, |
|
6 |
|
], |
|
"k9-c0": [ |
|
21, |
|
22 |
|
], |
|
"k9-c1": [ |
|
19, |
|
20 |
|
], |
|
"k10-c0": [ |
|
7, |
|
8 |
|
], |
|
"k10-c1": [ |
|
9, |
|
10 |
|
], |
|
"k11-c0": [ |
|
17, |
|
18 |
|
], |
|
"k12-c0": [ |
|
14, |
|
15 |
|
], |
|
"k13-c0": [ |
|
1, |
|
2 |
|
], |
|
"layers.0": [ |
|
0 |
|
], |
|
"layers.1": [ |
|
1 |
|
], |
|
"layers.2": [ |
|
2 |
|
], |
|
"layers.3": [ |
|
3 |
|
], |
|
"layers.4": [ |
|
4 |
|
], |
|
"layers.5": [ |
|
5 |
|
], |
|
"layers.6": [ |
|
6 |
|
], |
|
"layers.7": [ |
|
7 |
|
], |
|
"layers.8": [ |
|
8 |
|
], |
|
"layers.9": [ |
|
9 |
|
], |
|
"layers.10": [ |
|
10 |
|
], |
|
"layers.11": [ |
|
11 |
|
], |
|
"layers.12": [ |
|
12 |
|
], |
|
"layers.13": [ |
|
13 |
|
], |
|
"layers.14": [ |
|
14 |
|
], |
|
"layers.15": [ |
|
15 |
|
], |
|
"layers.16": [ |
|
16 |
|
], |
|
"layers.17": [ |
|
17 |
|
], |
|
"layers.18": [ |
|
18 |
|
], |
|
"layers.19": [ |
|
19 |
|
], |
|
"layers.20": [ |
|
20 |
|
], |
|
"layers.21": [ |
|
21 |
|
], |
|
"layers.22": [ |
|
22 |
|
], |
|
"layers.23": [ |
|
23 |
|
] |
|
}, |
|
"cluster_hookpoints": { |
|
"k1-c0": [ |
|
"layers.0", |
|
"layers.1", |
|
"layers.2", |
|
"layers.3", |
|
"layers.4", |
|
"layers.5", |
|
"layers.6", |
|
"layers.7", |
|
"layers.8", |
|
"layers.9", |
|
"layers.10", |
|
"layers.11", |
|
"layers.12", |
|
"layers.13", |
|
"layers.14", |
|
"layers.15", |
|
"layers.16", |
|
"layers.17", |
|
"layers.18", |
|
"layers.19", |
|
"layers.20", |
|
"layers.21", |
|
"layers.22" |
|
], |
|
"k2-c0": [ |
|
"layers.0", |
|
"layers.1", |
|
"layers.2", |
|
"layers.3", |
|
"layers.4", |
|
"layers.5", |
|
"layers.6", |
|
"layers.7", |
|
"layers.8", |
|
"layers.9", |
|
"layers.10", |
|
"layers.11", |
|
"layers.12", |
|
"layers.13", |
|
"layers.14", |
|
"layers.15" |
|
], |
|
"k2-c1": [ |
|
"layers.16", |
|
"layers.17", |
|
"layers.18", |
|
"layers.19", |
|
"layers.20", |
|
"layers.21", |
|
"layers.22" |
|
], |
|
"k3-c0": [ |
|
"layers.7", |
|
"layers.8", |
|
"layers.9", |
|
"layers.10", |
|
"layers.11", |
|
"layers.12", |
|
"layers.13", |
|
"layers.14", |
|
"layers.15" |
|
], |
|
"k3-c1": [ |
|
"layers.0", |
|
"layers.1", |
|
"layers.2", |
|
"layers.3", |
|
"layers.4", |
|
"layers.5", |
|
"layers.6" |
|
], |
|
"k4-c0": [ |
|
"layers.11", |
|
"layers.12", |
|
"layers.13", |
|
"layers.14", |
|
"layers.15" |
|
], |
|
"k4-c1": [ |
|
"layers.7", |
|
"layers.8", |
|
"layers.9", |
|
"layers.10" |
|
], |
|
"k5-c0": [ |
|
"layers.19", |
|
"layers.20", |
|
"layers.21", |
|
"layers.22" |
|
], |
|
"k5-c1": [ |
|
"layers.16", |
|
"layers.17", |
|
"layers.18" |
|
], |
|
"k6-c0": [ |
|
"layers.3", |
|
"layers.4", |
|
"layers.5", |
|
"layers.6" |
|
], |
|
"k6-c1": [ |
|
"layers.0", |
|
"layers.1", |
|
"layers.2" |
|
], |
|
"k7-c0": [ |
|
"layers.13", |
|
"layers.14", |
|
"layers.15" |
|
], |
|
"k7-c1": [ |
|
"layers.11", |
|
"layers.12" |
|
], |
|
"k8-c0": [ |
|
"layers.3", |
|
"layers.4" |
|
], |
|
"k8-c1": [ |
|
"layers.5", |
|
"layers.6" |
|
], |
|
"k9-c0": [ |
|
"layers.21", |
|
"layers.22" |
|
], |
|
"k9-c1": [ |
|
"layers.19", |
|
"layers.20" |
|
], |
|
"k10-c0": [ |
|
"layers.7", |
|
"layers.8" |
|
], |
|
"k10-c1": [ |
|
"layers.9", |
|
"layers.10" |
|
], |
|
"k11-c0": [ |
|
"layers.17", |
|
"layers.18" |
|
], |
|
"k12-c0": [ |
|
"layers.14", |
|
"layers.15" |
|
], |
|
"k13-c0": [ |
|
"layers.1", |
|
"layers.2" |
|
], |
|
"layers.0": [ |
|
"layers.0" |
|
], |
|
"layers.1": [ |
|
"layers.1" |
|
], |
|
"layers.2": [ |
|
"layers.2" |
|
], |
|
"layers.3": [ |
|
"layers.3" |
|
], |
|
"layers.4": [ |
|
"layers.4" |
|
], |
|
"layers.5": [ |
|
"layers.5" |
|
], |
|
"layers.6": [ |
|
"layers.6" |
|
], |
|
"layers.7": [ |
|
"layers.7" |
|
], |
|
"layers.8": [ |
|
"layers.8" |
|
], |
|
"layers.9": [ |
|
"layers.9" |
|
], |
|
"layers.10": [ |
|
"layers.10" |
|
], |
|
"layers.11": [ |
|
"layers.11" |
|
], |
|
"layers.12": [ |
|
"layers.12" |
|
], |
|
"layers.13": [ |
|
"layers.13" |
|
], |
|
"layers.14": [ |
|
"layers.14" |
|
], |
|
"layers.15": [ |
|
"layers.15" |
|
], |
|
"layers.16": [ |
|
"layers.16" |
|
], |
|
"layers.17": [ |
|
"layers.17" |
|
], |
|
"layers.18": [ |
|
"layers.18" |
|
], |
|
"layers.19": [ |
|
"layers.19" |
|
], |
|
"layers.20": [ |
|
"layers.20" |
|
], |
|
"layers.21": [ |
|
"layers.21" |
|
], |
|
"layers.22": [ |
|
"layers.22" |
|
], |
|
"layers.23": [ |
|
"layers.23" |
|
] |
|
}, |
|
"hook": null, |
|
"keep_last_n_checkpoints": 4, |
|
"resume_from": null, |
|
"log_to_wandb": true, |
|
"run_name": "checkpoints-clusters/pythia-410m-topk", |
|
"wandb_log_frequency": 1 |
|
} |