diff --git a/small_attn_out/A0_N10_S-1.pt b/small_attn_out/A0_N10_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..7316e007baa6da6c1458e59115e95acef963759b --- /dev/null +++ b/small_attn_out/A0_N10_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7eab15370a0e2b344c1c29f8fd49040c6aae1c6af8b1d602dcd06ae065faf5d +size 66536 diff --git a/small_attn_out/A0_N10_S-10.pt b/small_attn_out/A0_N10_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5febcd388dcd4768310dc048477a838ad810ce5 --- /dev/null +++ b/small_attn_out/A0_N10_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6f810f34a386842076e3141e25e2544b1faa9055b7c686b364f733519876186 +size 66544 diff --git a/small_attn_out/A0_N10_S-10_config.json b/small_attn_out/A0_N10_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..84cf766b673579a51e234f2fe30532554f75ce19 --- /dev/null +++ b/small_attn_out/A0_N10_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-1_config.json b/small_attn_out/A0_N10_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0d3c86289b6151b91f4fe73686d24cacda252e8e --- /dev/null +++ b/small_attn_out/A0_N10_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-2.pt b/small_attn_out/A0_N10_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..9abd635173587165bd6abcd28d30b6f11e21540c --- /dev/null +++ b/small_attn_out/A0_N10_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ee9eae6dff27ee925cea6e8d2178379f1e987feee39caffc13fff9d42f370a6 +size 66536 diff --git a/small_attn_out/A0_N10_S-2_config.json b/small_attn_out/A0_N10_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..26880a8cf8d4c3e9c5eadb9a8d602af00a494a79 --- /dev/null +++ b/small_attn_out/A0_N10_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-3.pt b/small_attn_out/A0_N10_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..78132910397e6d1d76ee3fbd4c70c9c03d9dadef --- /dev/null +++ b/small_attn_out/A0_N10_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4fc4becc68bf6228a42ee2bd366d5f91ef2001b63c206633be97bfeb2e10059 +size 66536 diff --git a/small_attn_out/A0_N10_S-3_config.json b/small_attn_out/A0_N10_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c8041b5baf5903a2e42eb6502f63a140c1c049f5 --- /dev/null +++ b/small_attn_out/A0_N10_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-4.pt b/small_attn_out/A0_N10_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..967e4ba972eb7b25eaecb79c4621f3c543085d12 --- /dev/null +++ b/small_attn_out/A0_N10_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be10049b4b82e05f46726f6edade5b1d4687a210e1bf15a8402465d5ba4798b2 +size 66536 diff --git a/small_attn_out/A0_N10_S-4_config.json b/small_attn_out/A0_N10_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..078b62620c7da1f20a67680ced113e0346845f6e --- /dev/null +++ b/small_attn_out/A0_N10_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-5.pt b/small_attn_out/A0_N10_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae612a6f168b1d763ee4caacac0c4aebee7bef75 --- /dev/null +++ b/small_attn_out/A0_N10_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c1d0b36a9b9d34e36e0916da20966fd1da9ce1e9e2086a73b78e71ded8750eb +size 66536 diff --git a/small_attn_out/A0_N10_S-5_config.json b/small_attn_out/A0_N10_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f75938c1a06e92b5243c591ab385d27ade1d8d74 --- /dev/null +++ b/small_attn_out/A0_N10_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-6.pt b/small_attn_out/A0_N10_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..449573139bc27778a3cd2ea1b64a42107216666a --- /dev/null +++ b/small_attn_out/A0_N10_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452dd35ac9c316e65f8229ac38754df6c14c4863ce17534750c014bd17b0ef17 +size 66536 diff --git a/small_attn_out/A0_N10_S-6_config.json b/small_attn_out/A0_N10_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..134c22cf9309cac5b8c624c9fc8d2800c825ed17 --- /dev/null +++ b/small_attn_out/A0_N10_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-7.pt b/small_attn_out/A0_N10_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ed7c6c7bac0f3f1b3a5e17396e272fa8d48ac8a --- /dev/null +++ b/small_attn_out/A0_N10_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67b95bf4e3817c1014b4d6e8acaf8552a11b09847879ae8bf331e7c9618ca24 +size 66536 diff --git a/small_attn_out/A0_N10_S-7_config.json b/small_attn_out/A0_N10_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5b73d6d9eb8d1189ed8e5e8102df55c3fe4bbe9a --- /dev/null +++ b/small_attn_out/A0_N10_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-8.pt b/small_attn_out/A0_N10_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..83b1193d3e2b33033d63a012c3dadb3e25247662 --- /dev/null +++ b/small_attn_out/A0_N10_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f268a7bad875e1f4576b1dd70aa72288816c19af9c5e4b7953ea24e7703b3cc3 +size 66536 diff --git a/small_attn_out/A0_N10_S-8_config.json b/small_attn_out/A0_N10_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d2fe8b97f45164cd473d238747f079049726477f --- /dev/null +++ b/small_attn_out/A0_N10_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S-9.pt b/small_attn_out/A0_N10_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..853155302e117aa1f9a99fcf32edbebf91f1ad8b --- /dev/null +++ b/small_attn_out/A0_N10_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dded55f6bd91e9d247f1dcba66a158d775c40e5e65abdadaa19aa4c9765c41d +size 66536 diff --git a/small_attn_out/A0_N10_S-9_config.json b/small_attn_out/A0_N10_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2ce0cef74e4bad85b8a34eeb49cf56811d348e4 --- /dev/null +++ b/small_attn_out/A0_N10_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S0.pt b/small_attn_out/A0_N10_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..c06ff8bc0eda5a5edef331c4e5e49ed2dd219786 --- /dev/null +++ b/small_attn_out/A0_N10_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4101a7397e58fddf7893bc532b3bfbd462735a9da573e75813db1c3aac4dd5d9 +size 66528 diff --git a/small_attn_out/A0_N10_S0_config.json b/small_attn_out/A0_N10_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3fb230ef3d746444992c5acaef61fc1a4eb0f171 --- /dev/null +++ b/small_attn_out/A0_N10_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S0" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S1.pt b/small_attn_out/A0_N10_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..0d20b35b3d4f57520035afe7bc5dd840c060ca40 --- /dev/null +++ b/small_attn_out/A0_N10_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:063a221e678544f9cab588b29a23b841aa25ef18151d2b250afa70fa638448fb +size 66528 diff --git a/small_attn_out/A0_N10_S1_config.json b/small_attn_out/A0_N10_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a13e9240144d4c74e49e4cfc6bf789861581df6d --- /dev/null +++ b/small_attn_out/A0_N10_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N10_S2.pt b/small_attn_out/A0_N10_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..192fdaded2809b648b7c589a5850df0cc6c18d81 --- /dev/null +++ b/small_attn_out/A0_N10_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5759624ffaf188705eeb517f3d1aba51829b6fb2f820467f3886034d4d96a2 +size 66528 diff --git a/small_attn_out/A0_N10_S2_config.json b/small_attn_out/A0_N10_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a873a102d239961f324b4b71b475c72f27edef07 --- /dev/null +++ b/small_attn_out/A0_N10_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N10_S2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-1.pt b/small_attn_out/A0_N30_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b1b9f1a16cb37e8cacc6fe9d38be250d80a29b5 --- /dev/null +++ b/small_attn_out/A0_N30_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61091b4451d444d6435ecb62aaf0f88e72d8125faee7930a1085d35fa0402d77 +size 189480 diff --git a/small_attn_out/A0_N30_S-10.pt b/small_attn_out/A0_N30_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..138c15806cd187f31c70b0c16d147f98fa873adb --- /dev/null +++ b/small_attn_out/A0_N30_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24f6c8cc30c27eeba17e69391a4ca227f18417f747c9bf4a1f996a4295d150d +size 189488 diff --git a/small_attn_out/A0_N30_S-10_config.json b/small_attn_out/A0_N30_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..feeab8f42dfce96aa8e5d7e8f7500eb4516e4f61 --- /dev/null +++ b/small_attn_out/A0_N30_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-1_config.json b/small_attn_out/A0_N30_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11ea8ea64c92044742ddc9c43970698b66cb1c37 --- /dev/null +++ b/small_attn_out/A0_N30_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-2.pt b/small_attn_out/A0_N30_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..5366c4f979114d22b806fb8775e26c4d82260158 --- /dev/null +++ b/small_attn_out/A0_N30_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8909a9b7511ad2fcacbda96cadde537e92c96685459d629a8745a9cf892937 +size 189480 diff --git a/small_attn_out/A0_N30_S-2_config.json b/small_attn_out/A0_N30_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6fb10823b6c71bc7aa7a30900859a7002d8d42d7 --- /dev/null +++ b/small_attn_out/A0_N30_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-3.pt b/small_attn_out/A0_N30_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cc1b28f19e1a735444e18f7fd8b5c49240050be --- /dev/null +++ b/small_attn_out/A0_N30_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bbc479ca99c8fb5490bcf5cd4ef1e4f7cde5d4ebdba0c2dca06e3e182758ed1 +size 189480 diff --git a/small_attn_out/A0_N30_S-3_config.json b/small_attn_out/A0_N30_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ed4ef5c07caca0377ffbe0b8fc105a1c2f22eb5a --- /dev/null +++ b/small_attn_out/A0_N30_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-4.pt b/small_attn_out/A0_N30_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..36b1fbeccb41eaf4eccec30c5b0bbb7fc6dc0ff8 --- /dev/null +++ b/small_attn_out/A0_N30_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9b6217a97dac3560a2e2d38c7fd3bcb9c21bdc15a1228a611360d443d1e80a +size 189480 diff --git a/small_attn_out/A0_N30_S-4_config.json b/small_attn_out/A0_N30_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a774d0ecadd75185183eb8304df024c2cce9d599 --- /dev/null +++ b/small_attn_out/A0_N30_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-5.pt b/small_attn_out/A0_N30_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe1c21b4d67e04e6e7b142a7ea744f0e09261dc0 --- /dev/null +++ b/small_attn_out/A0_N30_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf31f75860167f052fbfb3cab6a8f589c3d51a5f85a6a8a1d4afe4f5c578a7d +size 189480 diff --git a/small_attn_out/A0_N30_S-5_config.json b/small_attn_out/A0_N30_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a6d7746b852b8ea615ca577f8ba47cd20fe17fa1 --- /dev/null +++ b/small_attn_out/A0_N30_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-6.pt b/small_attn_out/A0_N30_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa121c67e00ebd63783e51b30e1d588a273ae1c0 --- /dev/null +++ b/small_attn_out/A0_N30_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9499ea5e8bcf5f4d6ea0ddaf2ec675e41dd7d50ce5bf627800a0ebd030de668 +size 189480 diff --git a/small_attn_out/A0_N30_S-6_config.json b/small_attn_out/A0_N30_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..15ff285c0fa3080a2e31cb0a5267fa44b1689530 --- /dev/null +++ b/small_attn_out/A0_N30_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-7.pt b/small_attn_out/A0_N30_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..43b61b2d26d66a12e0700977aad668b393e437a8 --- /dev/null +++ b/small_attn_out/A0_N30_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb981c4e3393be41a5fc3a59d9c725031cdf15364e227d7d5da878bf35956c40 +size 189480 diff --git a/small_attn_out/A0_N30_S-7_config.json b/small_attn_out/A0_N30_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9ef531714742603ece68d87de6d8f504ef73fa89 --- /dev/null +++ b/small_attn_out/A0_N30_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-8.pt b/small_attn_out/A0_N30_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..1da660b9d7d1831e8096db823ec836f1c988d4c3 --- /dev/null +++ b/small_attn_out/A0_N30_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4bfafcba3c6662159d5b9be25bb6df54b5d0d3fef8e8e832de43f96f54ae8ea +size 189480 diff --git a/small_attn_out/A0_N30_S-8_config.json b/small_attn_out/A0_N30_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..60bff24ee6e56e8be0b4c343b0a45f3aba3afb1f --- /dev/null +++ b/small_attn_out/A0_N30_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S-9.pt b/small_attn_out/A0_N30_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..311f6b404be0c9ea7cc924815398efac2fec9f89 --- /dev/null +++ b/small_attn_out/A0_N30_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd1eeba259f8f159bb6240a917ac817b4484103b642ea12615fd120e30ec191 +size 189480 diff --git a/small_attn_out/A0_N30_S-9_config.json b/small_attn_out/A0_N30_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..113026ef4cb88beacc847b73bfd3bc2b10a33c62 --- /dev/null +++ b/small_attn_out/A0_N30_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S0.pt b/small_attn_out/A0_N30_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c5caea3948d2cf9c60377cd50ac38e0dd525f3b --- /dev/null +++ b/small_attn_out/A0_N30_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56905494df1647c62a5fef8507a76a706c17bcadb01c02a9b87526b92bc1dedb +size 189472 diff --git a/small_attn_out/A0_N30_S0_config.json b/small_attn_out/A0_N30_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7a8ed866426e5fc3e9f944c1fe4a521f6ff5bfab --- /dev/null +++ b/small_attn_out/A0_N30_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S0" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S1.pt b/small_attn_out/A0_N30_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fcf228bbc85014a73542e19a484c94a01515c8a --- /dev/null +++ b/small_attn_out/A0_N30_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84409741c16ea34fabff7392a941782b42c12174a7241632634005324205bcb +size 189472 diff --git a/small_attn_out/A0_N30_S1_config.json b/small_attn_out/A0_N30_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..13a387d4c041177877fa6fd70d8762cae45583bd --- /dev/null +++ b/small_attn_out/A0_N30_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S1" +} \ No newline at end of file diff --git a/small_attn_out/A0_N30_S2.pt b/small_attn_out/A0_N30_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..83240f0d7c2e22e3960eac0ad3f72d9876191e76 --- /dev/null +++ b/small_attn_out/A0_N30_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c885158f635ef75a81eeb1e9919d7a4e81f5f87da1dee9893b28ea151a5f34e +size 189472 diff --git a/small_attn_out/A0_N30_S2_config.json b/small_attn_out/A0_N30_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57db6a1a17166b2499b25271759610f326b57cba --- /dev/null +++ b/small_attn_out/A0_N30_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 0, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A0_N30_S2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-1.pt b/small_attn_out/A1_N10_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..72ec84aaa1f050996116489c94298b104f3a721c --- /dev/null +++ b/small_attn_out/A1_N10_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab7d4d05ef8fabf2e728a641144d2d3b2f52351d64dc26934fa7ac4f0e49397d +size 66536 diff --git a/small_attn_out/A1_N10_S-10.pt b/small_attn_out/A1_N10_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..dad22bb0d68b2e026e0247c043aeacdc2f3b2885 --- /dev/null +++ b/small_attn_out/A1_N10_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0783c4279d3ba1796a9d3440f4a5026502d3cacbec3c05850011d95bf667df +size 66544 diff --git a/small_attn_out/A1_N10_S-10_config.json b/small_attn_out/A1_N10_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c76131ba6e01e13b8efc6521a05e17cb2506cc00 --- /dev/null +++ b/small_attn_out/A1_N10_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-1_config.json b/small_attn_out/A1_N10_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e461e692580a44da674e52911c7796bc2189ea36 --- /dev/null +++ b/small_attn_out/A1_N10_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-2.pt b/small_attn_out/A1_N10_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..9beba833f9f13c330b6c295061fefac355c8e50b --- /dev/null +++ b/small_attn_out/A1_N10_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88be60998ace55119af4498bd6f5103cdffae6ec8716057596f4e429529616c3 +size 66536 diff --git a/small_attn_out/A1_N10_S-2_config.json b/small_attn_out/A1_N10_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4578e4b95e77beb664cbac5be26b50ef5c922fc --- /dev/null +++ b/small_attn_out/A1_N10_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-3.pt b/small_attn_out/A1_N10_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..7fd4f43643aa8d6eceed169086df52e8767c562f --- /dev/null +++ b/small_attn_out/A1_N10_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:217424bece781eb19461a71b63fbb2984125c4dda969db6b375df1cccf20ed89 +size 66536 diff --git a/small_attn_out/A1_N10_S-3_config.json b/small_attn_out/A1_N10_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..06bc8d68cd504e588eb624bdf7a398e4618fadac --- /dev/null +++ b/small_attn_out/A1_N10_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-4.pt b/small_attn_out/A1_N10_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..42522f16774b39ec22f22e99e00102b619104f80 --- /dev/null +++ b/small_attn_out/A1_N10_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19661a59f43c5f43ae9ba48d649a827d93f0817f66aac6fcc1837b52a0439f67 +size 66536 diff --git a/small_attn_out/A1_N10_S-4_config.json b/small_attn_out/A1_N10_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f5219af7ebee94b168acc6726844641812f518a9 --- /dev/null +++ b/small_attn_out/A1_N10_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-5.pt b/small_attn_out/A1_N10_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..693b3a1da05f282ed19e215459d76bf67200fa99 --- /dev/null +++ b/small_attn_out/A1_N10_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18eb705098fda031c0a7bb5c5b52efc8835b7d44fef9eeb6d5fd905558eb6d5c +size 66536 diff --git a/small_attn_out/A1_N10_S-5_config.json b/small_attn_out/A1_N10_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..68bfa9b4d3d0763416e28e701ed739c51dfe4eae --- /dev/null +++ b/small_attn_out/A1_N10_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-6.pt b/small_attn_out/A1_N10_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..979d60c09a416741d9c8bb034539796017dc462a --- /dev/null +++ b/small_attn_out/A1_N10_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a439844060bc9a4e57a8e0ce18f43e557b7e6206e83993b82ba20b5f414f2cf +size 66536 diff --git a/small_attn_out/A1_N10_S-6_config.json b/small_attn_out/A1_N10_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5a50826089a075edfa822941870c75c524444261 --- /dev/null +++ b/small_attn_out/A1_N10_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-7.pt b/small_attn_out/A1_N10_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ca54c35ff58fb450d69102f75193a2046ab464c --- /dev/null +++ b/small_attn_out/A1_N10_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a888f0b96c2ad8248153288b281e63fbde1b6cc4dfdb6234762a7682a1f8ef +size 66536 diff --git a/small_attn_out/A1_N10_S-7_config.json b/small_attn_out/A1_N10_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..35f50dc73a3504a28a43bb724fd0f72aeb87dd0b --- /dev/null +++ b/small_attn_out/A1_N10_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-8.pt b/small_attn_out/A1_N10_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b9284d8a012c27773aedefaa32feb262d3c9d73 --- /dev/null +++ b/small_attn_out/A1_N10_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5971b3da0953549caeb84654d948da869aef39ade5e969ce6211e54e226a12fc +size 66536 diff --git a/small_attn_out/A1_N10_S-8_config.json b/small_attn_out/A1_N10_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..34031b5c774a2de94cd4c1a5bc1e9eb512fc4f3a --- /dev/null +++ b/small_attn_out/A1_N10_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S-9.pt b/small_attn_out/A1_N10_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..5996d43799db236fba98236c9064d1dbd4520938 --- /dev/null +++ b/small_attn_out/A1_N10_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1178cdb9f15a460b1bf2c8dfa45b14c87bd6a3d012e996713ed24b7f9b1268 +size 66536 diff --git a/small_attn_out/A1_N10_S-9_config.json b/small_attn_out/A1_N10_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..eae9db2d731e61c5af6d80f0829fa6086cf9395d --- /dev/null +++ b/small_attn_out/A1_N10_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S0.pt b/small_attn_out/A1_N10_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..6809a3be046292c99e0949caa8096082c811cd71 --- /dev/null +++ b/small_attn_out/A1_N10_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1038116b43cb3e5f0330a1e7aecdf6a256f8ce1a81f4a28e53da10f24a850c44 +size 66528 diff --git a/small_attn_out/A1_N10_S0_config.json b/small_attn_out/A1_N10_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..341f09a28593484766850cab9fc12b1cfd9ed487 --- /dev/null +++ b/small_attn_out/A1_N10_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S0" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S1.pt b/small_attn_out/A1_N10_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4ba5ab6adc53b87167e1595a1ea528a316be705 --- /dev/null +++ b/small_attn_out/A1_N10_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24db4ce97a57dc82c4810a19963ba0ce017f5bc7688a4e05191727d44012504 +size 66528 diff --git a/small_attn_out/A1_N10_S1_config.json b/small_attn_out/A1_N10_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4f4a30a431c36085a985cbbcb6918469103dce0d --- /dev/null +++ b/small_attn_out/A1_N10_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N10_S2.pt b/small_attn_out/A1_N10_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..c829ba7eeac9a15e9db166c8b5c3e17bcd111d60 --- /dev/null +++ b/small_attn_out/A1_N10_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0b66d59c5078c82e6247298f62508e0f82f54242e65f1e3fce2facbd6322d2 +size 66528 diff --git a/small_attn_out/A1_N10_S2_config.json b/small_attn_out/A1_N10_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e1bcf7fa1706258795982f2f637435fd8a472214 --- /dev/null +++ b/small_attn_out/A1_N10_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 10, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N10_S2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-1.pt b/small_attn_out/A1_N30_S-1.pt new file mode 100644 index 0000000000000000000000000000000000000000..b49c347353be0c379d21f4fc993c153f46ca26fd --- /dev/null +++ b/small_attn_out/A1_N30_S-1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d559d310977ea56ee761ccb04fcde244e63db2470bc134f20cbb727743c75cdf +size 189480 diff --git a/small_attn_out/A1_N30_S-10.pt b/small_attn_out/A1_N30_S-10.pt new file mode 100644 index 0000000000000000000000000000000000000000..32c5200477820d5b1474adbc3f7840cc2f1ac65d --- /dev/null +++ b/small_attn_out/A1_N30_S-10.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03dd467b3c69bdef9684056555da3ddea0c6d82a3f1a91eb79f577a0c26ffdfd +size 189488 diff --git a/small_attn_out/A1_N30_S-10_config.json b/small_attn_out/A1_N30_S-10_config.json new file mode 100644 index 0000000000000000000000000000000000000000..608be472ce5f4428dea983bbd70f81d77739862d --- /dev/null +++ b/small_attn_out/A1_N30_S-10_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -10, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-10" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-1_config.json b/small_attn_out/A1_N30_S-1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9c6d62551c611799ba4a57b39e4aee8a2b3d1cd2 --- /dev/null +++ b/small_attn_out/A1_N30_S-1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-2.pt b/small_attn_out/A1_N30_S-2.pt new file mode 100644 index 0000000000000000000000000000000000000000..159558d3e50cabec3ec75679ebecce8c2175177b --- /dev/null +++ b/small_attn_out/A1_N30_S-2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e0614999274d89e476d6a3cb059818430261b0c5a2d1c5e335df784da47a05c +size 189480 diff --git a/small_attn_out/A1_N30_S-2_config.json b/small_attn_out/A1_N30_S-2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..79c53b0bf0eb1509318e8651dcf7fa40e29df2ff --- /dev/null +++ b/small_attn_out/A1_N30_S-2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-2" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-3.pt b/small_attn_out/A1_N30_S-3.pt new file mode 100644 index 0000000000000000000000000000000000000000..c694284ce48a7374aabb73caa6c9ad567900f20d --- /dev/null +++ b/small_attn_out/A1_N30_S-3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4337f24f1a6d085b0097ba313de8e602d21ad873488c472e4a978b5e78273b21 +size 189480 diff --git a/small_attn_out/A1_N30_S-3_config.json b/small_attn_out/A1_N30_S-3_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd6df09a284543a493305acb5c53ae979ef18579 --- /dev/null +++ b/small_attn_out/A1_N30_S-3_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -3, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-3" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-4.pt b/small_attn_out/A1_N30_S-4.pt new file mode 100644 index 0000000000000000000000000000000000000000..a558a08073e4753bd380e1d20a132a5dde955cd5 --- /dev/null +++ b/small_attn_out/A1_N30_S-4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9dc2c85256b4e88ae2c568b74c7a7e243e90d76172353b2a504fb45293d4315 +size 189480 diff --git a/small_attn_out/A1_N30_S-4_config.json b/small_attn_out/A1_N30_S-4_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94e311fce70f3cf7ee4b3ae0bab36d7622855fc8 --- /dev/null +++ b/small_attn_out/A1_N30_S-4_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -4, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-4" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-5.pt b/small_attn_out/A1_N30_S-5.pt new file mode 100644 index 0000000000000000000000000000000000000000..012c2bb0d366e02b0e62631c54943fb342fe5be2 --- /dev/null +++ b/small_attn_out/A1_N30_S-5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a30de7240841b1280a8c24e3cda6142c173357625a955b27a95e70ae8674a87 +size 189480 diff --git a/small_attn_out/A1_N30_S-5_config.json b/small_attn_out/A1_N30_S-5_config.json new file mode 100644 index 0000000000000000000000000000000000000000..423d5503df90f0bcc942aafbf1d3caa5b0e872b5 --- /dev/null +++ b/small_attn_out/A1_N30_S-5_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -5, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-5" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-6.pt b/small_attn_out/A1_N30_S-6.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a48c5994f15f650140fb65932ee58dbd56bfaa5 --- /dev/null +++ b/small_attn_out/A1_N30_S-6.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:699112b875231d5e253fca507fe1c5e16eac9df90213843346b227de07090851 +size 189480 diff --git a/small_attn_out/A1_N30_S-6_config.json b/small_attn_out/A1_N30_S-6_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee017ed4ecf4cdaa1b2226cc4329f91f3f463fb --- /dev/null +++ b/small_attn_out/A1_N30_S-6_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -6, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-6" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-7.pt b/small_attn_out/A1_N30_S-7.pt new file mode 100644 index 0000000000000000000000000000000000000000..16bb25706a523cd8faaebbc004063107d0d04819 --- /dev/null +++ b/small_attn_out/A1_N30_S-7.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81e17cb0373673a0b981c394641babf12c5a78ae12e95bf949596936506856ee +size 189480 diff --git a/small_attn_out/A1_N30_S-7_config.json b/small_attn_out/A1_N30_S-7_config.json new file mode 100644 index 0000000000000000000000000000000000000000..022c294403574e36c7ec6efa3844573d029c89bb --- /dev/null +++ b/small_attn_out/A1_N30_S-7_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -7, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-7" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-8.pt b/small_attn_out/A1_N30_S-8.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce264cca82ca090f9d8e44d55084d610cd472d34 --- /dev/null +++ b/small_attn_out/A1_N30_S-8.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a5c7903f23b09f0a77b962017f54ca58d43f8b8a7d2a645b09f050b804bf31 +size 189480 diff --git a/small_attn_out/A1_N30_S-8_config.json b/small_attn_out/A1_N30_S-8_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f0fe70f12dbc645a2a9d01edd47d53e709630a89 --- /dev/null +++ b/small_attn_out/A1_N30_S-8_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -8, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-8" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S-9.pt b/small_attn_out/A1_N30_S-9.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e94dc000b32426a0c9216bc0b6413be2892a3d3 --- /dev/null +++ b/small_attn_out/A1_N30_S-9.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5871dc73a15434f65a7abe97d757ac48345dffba74819c0f252369556f9eaaa6 +size 189480 diff --git a/small_attn_out/A1_N30_S-9_config.json b/small_attn_out/A1_N30_S-9_config.json new file mode 100644 index 0000000000000000000000000000000000000000..15cac57e687459397b2286d1dfe0e2f6a15a977a --- /dev/null +++ b/small_attn_out/A1_N30_S-9_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": -9, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S-9" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S0.pt b/small_attn_out/A1_N30_S0.pt new file mode 100644 index 0000000000000000000000000000000000000000..e18d84122e89ad86195dc9d032ee51109d9a43ee --- /dev/null +++ b/small_attn_out/A1_N30_S0.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22ad03c5f1ceb3263a1d67b90aac99ff33aeddac0ada8eb8b2a17a686897811b +size 189472 diff --git a/small_attn_out/A1_N30_S0_config.json b/small_attn_out/A1_N30_S0_config.json new file mode 100644 index 0000000000000000000000000000000000000000..849988ac2fce98fb763b0b7e490a5a3d6ed633ee --- /dev/null +++ b/small_attn_out/A1_N30_S0_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 0, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S0" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S1.pt b/small_attn_out/A1_N30_S1.pt new file mode 100644 index 0000000000000000000000000000000000000000..862606b728516d1f09b7adb2ec174546a6c4f4de --- /dev/null +++ b/small_attn_out/A1_N30_S1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97974d3bdd45a1d1667f0005ec50e76a96cdf69971c98642ffdeef07eb06e73c +size 189472 diff --git a/small_attn_out/A1_N30_S1_config.json b/small_attn_out/A1_N30_S1_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9872487db413a4a2afd25a4f6fd3f5d8493e6a82 --- /dev/null +++ b/small_attn_out/A1_N30_S1_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 1, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S1" +} \ No newline at end of file diff --git a/small_attn_out/A1_N30_S2.pt b/small_attn_out/A1_N30_S2.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d00af8c59ffeffee9f8af719df1a3f258c7fd7 --- /dev/null +++ b/small_attn_out/A1_N30_S2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e342df1e7e417b7afb05c4787999db2d883b7ffc41dccef21921caab1687deb3 +size 189472 diff --git a/small_attn_out/A1_N30_S2_config.json b/small_attn_out/A1_N30_S2_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4f6ac90a2ffb11b1a4dc4234f733eda8903190 --- /dev/null +++ b/small_attn_out/A1_N30_S2_config.json @@ -0,0 +1,38 @@ +{ + "n_features": 30, + "d_model": 768, + "lr_exp": -10, + "disable_comet": false, + "per_neuron_reinit_interval": 0, + "reservoir_time_discount": 0.995, + "reinit_interval": 800, + "max_reinit_neurons": 5000, + "reservoir_size": 5000, + "n_piles": 292, + "log_interval": 200, + "reinit_input_norm": "target_scaled", + "reinit_input": "error", + "reinit_norm_alpha": 0.3, + "data_loc": "attn_data", + "reinit_threshold": -6, + "scheduler": "wsd", + "layer_idx": 1, + "l1_exp": 2, + "neuron_reinit_percent": 0.85, + "beta1": 1, + "beta2": 4, + "reinit_target": "error", + "sparse_adam": false, + "run_template": "A{layer_idx}_N{n_features}_S{l1_exp}", + "project_name": "small_attn", + "decoder_bias": true, + "l1_beta": 0.99, + "alt_sparsity_loss": "log", + "l1_ratio": 1, + "l1_p": 0, + "optimizer": "sparse_adam", + "model_type": "attn_out", + "adam_beta1": 0.5, + "adam_beta2": 0.9375, + "run_name": "A1_N30_S2" +} \ No newline at end of file