Marcus2112 committed
Commit 2c5f076 · verified · 1 Parent(s): 9f5eba6

Upload folder using huggingface_hub

checkpoint-1024/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "/home/vault/c106fa/c106fa11/minipile_marcus/regular/pythia1.4b_dedup_untrained",
+   "architectures": [
+     "GPTNeoXForCausalLM"
+   ],
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.1,
+   "eos_token_id": 0,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "gpt_neox",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "partial_rotary_factor": 0.25,
+   "rope_scaling": null,
+   "rope_theta": 10000,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 0.25,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.47.1",
+   "use_cache": true,
+   "use_parallel_residual": true,
+   "vocab_size": 50304
+ }
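
A minimal sketch of consuming this config with transformers, assuming the `checkpoint-1024` folder from this commit has been downloaded locally (any Hub repo id would work the same way):

```python
# Load the GPTNeoX config and model saved in this checkpoint folder.
from transformers import AutoConfig, GPTNeoXForCausalLM

config = AutoConfig.from_pretrained("checkpoint-1024")
assert config.model_type == "gpt_neox"
assert config.hidden_size == 2048 and config.num_hidden_layers == 24

# from_pretrained resolves the two safetensors shards listed below
# via model.safetensors.index.json automatically.
model = GPTNeoXForCausalLM.from_pretrained("checkpoint-1024")
print(sum(p.numel() for p in model.parameters()))  # ~1.4B parameters
```
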
checkpoint-1024/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "transformers_version": "4.47.1"
+ }
checkpoint-1024/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4831f2865cac402dbbf5d0cb1043c06f45dddf4b4dc5b9607866f458fb5f686
+ size 4977966760
checkpoint-1024/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8bd81362c3715a9fa27fb35b320fc0061032b7ec51e441df30a17e7606f4815
+ size 680658856
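
These are Git LFS pointer files: the `oid` line records the SHA-256 of the actual shard, so a download can be checked against it. A sketch of that check, assuming the shards have been pulled into `checkpoint-1024/` (chunked reading keeps memory flat for the ~5 GB shard):

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    # Stream the file through SHA-256 one megabyte at a time.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

expected = "b4831f2865cac402dbbf5d0cb1043c06f45dddf4b4dc5b9607866f458fb5f686"
assert sha256_of("checkpoint-1024/model-00001-of-00002.safetensors") == expected
```
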
checkpoint-1024/model.safetensors.index.json ADDED
@@ -0,0 +1,299 @@
+ {
+   "metadata": {
+     "total_size": 5658591232
+   },
+   "weight_map": {
+     "embed_out.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.embed_in.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.final_layer_norm.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.final_layer_norm.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.0.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.23.attention.dense.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.attention.dense.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.attention.query_key_value.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.attention.query_key_value.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.input_layernorm.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.post_attention_layernorm.bias": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+     "gpt_neox.layers.3.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.attention.dense.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.attention.dense.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.input_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+     "gpt_neox.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors"
+   }
+ }
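
A sketch of how this index is consumed: `weight_map` names the shard that stores each parameter, so one tensor can be read without touching the other shard. Assumes the checkpoint folder is local; the shape comment follows from `hidden_size: 2048` in the config above.

```python
import json
from safetensors import safe_open

with open("checkpoint-1024/model.safetensors.index.json") as f:
    index = json.load(f)

name = "gpt_neox.layers.23.attention.dense.weight"
shard = index["weight_map"][name]  # -> "model-00002-of-00002.safetensors"
with safe_open(f"checkpoint-1024/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)
print(tensor.shape)  # expected (2048, 2048): the attention output projection
```
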
checkpoint-1024/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff907a7b81887585d69839615ef952ca1038c8f0e722f0672adebad773928bdc
+ size 11317433298
checkpoint-1024/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee31c34fa1004b0e8d3e73f2c24d25dc2bd16a545fb0f5eba40502426686703e
+ size 15984
checkpoint-1024/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5afa94de38aab896e6b809e82ac1f7be77ec803161b05f7897e959d56250deba
+ size 15984
checkpoint-1024/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5004fab2ff664143e70ef16a0556e48c83fdad5c8c7cbea017b8a1a7e43496a0
+ size 15984
checkpoint-1024/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97b302d0abf96b5fd918db631fb30db7e78a16e2763cc51bdad8cac5e6e5695c
+ size 15984
checkpoint-1024/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e3021f42768f056ae4c06a24a27f0dd34528360a3d6a473027eb0bd052fdea1
+ size 15984
checkpoint-1024/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5911d37d531a36b3b7d9b95b56b5038ca2bbf3493ee8178e2330790edfdc02c1
+ size 15984
checkpoint-1024/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a43190c9afb367a5134733a294d4f31a56cedacb61492d97fd97bb2087ece22
+ size 15984
checkpoint-1024/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78b12c90cb0f68fb4d5c5a30dad19b050bee259ad88ca59573636d9b477fc828
+ size 15984
checkpoint-1024/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a580f0f1f55f914888f4d2b939067dfc5732198cab8e074ca0bbd27cb611eda
+ size 16976119594
checkpoint-1024/trainer_state.json ADDED
@@ -0,0 +1,827 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.048128,
+   "eval_steps": 100,
+   "global_step": 1024,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.01024,
+       "grad_norm": 269.9034729003906,
+       "learning_rate": 0.0001999995200527669,
+       "loss": 624.6766,
+       "step": 10
+     },
+     {
+       "epoch": 0.02048,
+       "grad_norm": 73.23262786865234,
+       "learning_rate": 0.000199941931959037,
+       "loss": 544.9223,
+       "step": 20
+     },
+     {
+       "epoch": 0.03072,
+       "grad_norm": 95.47492218017578,
+       "learning_rate": 0.00019978841775475367,
+       "loss": 511.6501,
+       "step": 30
+     },
+     {
+       "epoch": 0.04096,
+       "grad_norm": 145.1549530029297,
+       "learning_rate": 0.00019953912478568305,
+       "loss": 497.5438,
+       "step": 40
+     },
+     {
+       "epoch": 0.0512,
+       "grad_norm": 150.53515625,
+       "learning_rate": 0.00019919429232781712,
+       "loss": 497.208,
+       "step": 50
+     },
+     {
+       "epoch": 0.06144,
+       "grad_norm": 151.51736450195312,
+       "learning_rate": 0.0001987542513577122,
+       "loss": 496.9036,
+       "step": 60
+     },
+     {
+       "epoch": 0.07168,
+       "grad_norm": 159.39337158203125,
+       "learning_rate": 0.0001982771584048096,
+       "loss": 489.2719,
+       "step": 70
+     },
+     {
+       "epoch": 0.08192,
+       "grad_norm": 148.92176818847656,
+       "learning_rate": 0.00019765746006440455,
+       "loss": 482.9451,
+       "step": 80
+     },
+     {
+       "epoch": 0.09216,
+       "grad_norm": 139.00486755371094,
+       "learning_rate": 0.0001970195706599109,
+       "loss": 476.9848,
+       "step": 90
+     },
+     {
+       "epoch": 0.1024,
+       "grad_norm": 157.25439453125,
+       "learning_rate": 0.00019622236172137374,
+       "loss": 471.4595,
+       "step": 100
+     },
+     {
+       "epoch": 0.1024,
+       "eval_loss": 7.320011615753174,
+       "eval_runtime": 3.4958,
+       "eval_samples_per_second": 143.029,
+       "eval_steps_per_second": 9.154,
+       "step": 100
+     },
+     {
+       "epoch": 0.11264,
+       "grad_norm": 141.9667510986328,
+       "learning_rate": 0.0001953327967844356,
+       "loss": 468.9138,
+       "step": 110
+     },
+     {
+       "epoch": 0.12288,
+       "grad_norm": 99.36428833007812,
+       "learning_rate": 0.0001943517296699384,
+       "loss": 468.5555,
+       "step": 120
+     },
+     {
+       "epoch": 0.13312,
+       "grad_norm": 98.90169525146484,
+       "learning_rate": 0.00019328010202420258,
+       "loss": 463.3139,
+       "step": 130
+     },
+     {
+       "epoch": 0.14336,
+       "grad_norm": 77.31971740722656,
+       "learning_rate": 0.00019211894241521758,
+       "loss": 458.0901,
+       "step": 140
+     },
+     {
+       "epoch": 0.1536,
+       "grad_norm": 134.48403930664062,
+       "learning_rate": 0.0001908693653454033,
+       "loss": 454.8131,
+       "step": 150
+     },
+     {
+       "epoch": 0.16384,
+       "grad_norm": 95.27815246582031,
+       "learning_rate": 0.00018953257018189024,
+       "loss": 454.0167,
+       "step": 160
+     },
+     {
+       "epoch": 0.17408,
+       "grad_norm": 87.9055404663086,
+       "learning_rate": 0.00018810984000534458,
+       "loss": 449.3531,
+       "step": 170
+     },
+     {
+       "epoch": 0.18432,
+       "grad_norm": 116.1905288696289,
+       "learning_rate": 0.00018660254037844388,
+       "loss": 447.6146,
+       "step": 180
+     },
+     {
+       "epoch": 0.19456,
+       "grad_norm": 81.48722076416016,
+       "learning_rate": 0.00018501211803518468,
+       "loss": 450.6066,
+       "step": 190
+     },
+     {
+       "epoch": 0.2048,
+       "grad_norm": 75.57894897460938,
+       "learning_rate": 0.00018334009949228061,
+       "loss": 448.9498,
+       "step": 200
+     },
+     {
+       "epoch": 0.2048,
+       "eval_loss": 6.992556571960449,
+       "eval_runtime": 3.4911,
+       "eval_samples_per_second": 143.219,
+       "eval_steps_per_second": 9.166,
+       "step": 200
+     },
+     {
+       "epoch": 0.21504,
+       "grad_norm": 160.8426513671875,
+       "learning_rate": 0.00018158808958398338,
+       "loss": 449.3857,
+       "step": 210
+     },
+     {
+       "epoch": 0.22528,
+       "grad_norm": 86.6707992553711,
+       "learning_rate": 0.00017975776992173344,
+       "loss": 449.0133,
+       "step": 220
+     },
+     {
+       "epoch": 0.23552,
+       "grad_norm": 65.60407257080078,
+       "learning_rate": 0.00017785089728011798,
+       "loss": 446.8142,
+       "step": 230
+     },
+     {
+       "epoch": 0.24576,
+       "grad_norm": 125.04796600341797,
+       "learning_rate": 0.00017586930191068655,
+       "loss": 446.0437,
+       "step": 240
+     },
+     {
+       "epoch": 0.256,
+       "grad_norm": 89.98313903808594,
+       "learning_rate": 0.00017381488578524173,
+       "loss": 445.3744,
+       "step": 250
+     },
+     {
+       "epoch": 0.26624,
+       "grad_norm": 269.5317077636719,
+       "learning_rate": 0.00017168962077029147,
+       "loss": 446.719,
+       "step": 260
+     },
+     {
+       "epoch": 0.27648,
+       "grad_norm": 73.03639221191406,
+       "learning_rate": 0.00016949554673441534,
+       "loss": 448.0971,
+       "step": 270
+     },
+     {
+       "epoch": 0.28672,
+       "grad_norm": 122.57962799072266,
+       "learning_rate": 0.00016723476959036083,
+       "loss": 448.991,
+       "step": 280
+     },
+     {
+       "epoch": 0.29696,
+       "grad_norm": 81.79669189453125,
+       "learning_rate": 0.0001649094592737497,
+       "loss": 444.1866,
+       "step": 290
+     },
+     {
+       "epoch": 0.3072,
+       "grad_norm": 74.33326721191406,
+       "learning_rate": 0.00016252184766033342,
+       "loss": 436.623,
+       "step": 300
+     },
+     {
+       "epoch": 0.3072,
+       "eval_loss": 6.777511119842529,
+       "eval_runtime": 3.4227,
+       "eval_samples_per_second": 146.083,
+       "eval_steps_per_second": 9.349,
+       "step": 300
+     },
+     {
+       "epoch": 0.31744,
+       "grad_norm": 113.46410369873047,
+       "learning_rate": 0.0001600742264237979,
+       "loss": 435.7422,
+       "step": 310
+     },
+     {
+       "epoch": 0.32768,
+       "grad_norm": 99.42645263671875,
+       "learning_rate": 0.00015756894483617267,
+       "loss": 439.4858,
+       "step": 320
+     },
+     {
+       "epoch": 0.33792,
+       "grad_norm": 328.0025634765625,
+       "learning_rate": 0.0001550084075129563,
+       "loss": 447.5792,
+       "step": 330
+     },
+     {
+       "epoch": 0.34816,
+       "grad_norm": 82.54906463623047,
+       "learning_rate": 0.00015239507210512194,
+       "loss": 446.5024,
+       "step": 340
+     },
+     {
+       "epoch": 0.3584,
+       "grad_norm": 62.32942581176758,
+       "learning_rate": 0.00014973144694021876,
+       "loss": 437.9146,
+       "step": 350
+     },
+     {
+       "epoch": 0.36864,
+       "grad_norm": 64.61022186279297,
+       "learning_rate": 0.00014702008861483266,
+       "loss": 430.4142,
+       "step": 360
+     },
+     {
+       "epoch": 0.37888,
+       "grad_norm": 133.777587890625,
+       "learning_rate": 0.00014426359954071796,
+       "loss": 428.6971,
+       "step": 370
+     },
+     {
+       "epoch": 0.38912,
+       "grad_norm": 218.512939453125,
+       "learning_rate": 0.00014146462544695426,
+       "loss": 435.1475,
+       "step": 380
+     },
+     {
+       "epoch": 0.39936,
+       "grad_norm": 125.56941986083984,
+       "learning_rate": 0.00013862585284052714,
+       "loss": 445.5835,
+       "step": 390
+     },
+     {
+       "epoch": 0.4096,
+       "grad_norm": 109.06041717529297,
+       "learning_rate": 0.00013575000642776893,
+       "loss": 446.3095,
+       "step": 400
+     },
+     {
+       "epoch": 0.4096,
+       "eval_loss": 6.905622959136963,
+       "eval_runtime": 3.4269,
+       "eval_samples_per_second": 145.903,
+       "eval_steps_per_second": 9.338,
+       "step": 400
+     },
+     {
+       "epoch": 0.41984,
+       "grad_norm": 69.12989044189453,
+       "learning_rate": 0.0001328398464991355,
+       "loss": 438.9709,
+       "step": 410
+     },
+     {
+       "epoch": 0.43008,
+       "grad_norm": 68.11474609375,
+       "learning_rate": 0.00012989816627982848,
+       "loss": 432.2964,
+       "step": 420
+     },
+     {
+       "epoch": 0.44032,
+       "grad_norm": 65.17674255371094,
+       "learning_rate": 0.00012692778924880603,
+       "loss": 428.2125,
+       "step": 430
+     },
+     {
+       "epoch": 0.45056,
+       "grad_norm": 114.95523834228516,
+       "learning_rate": 0.0001239315664287558,
+       "loss": 426.8882,
+       "step": 440
+     },
+     {
+       "epoch": 0.4608,
+       "grad_norm": 185.0157470703125,
+       "learning_rate": 0.00012091237364963071,
+       "loss": 435.8043,
+       "step": 450
+     },
+     {
+       "epoch": 0.47104,
+       "grad_norm": 92.59754180908203,
+       "learning_rate": 0.00011787310878837422,
+       "loss": 440.9751,
+       "step": 460
+     },
+     {
+       "epoch": 0.48128,
+       "grad_norm": 75.24162292480469,
+       "learning_rate": 0.00011481668898748475,
+       "loss": 439.3276,
+       "step": 470
+     },
+     {
+       "epoch": 0.49152,
+       "grad_norm": 55.42325210571289,
+       "learning_rate": 0.00011174604785508813,
+       "loss": 432.4603,
+       "step": 480
+     },
+     {
+       "epoch": 0.50176,
+       "grad_norm": 62.27671813964844,
+       "learning_rate": 0.00010866413264920678,
+       "loss": 427.5299,
+       "step": 490
+     },
+     {
+       "epoch": 0.512,
+       "grad_norm": 65.43367767333984,
+       "learning_rate": 0.00010557390144892684,
+       "loss": 425.4595,
+       "step": 500
+     },
+     {
+       "epoch": 0.512,
+       "eval_loss": 6.613161087036133,
+       "eval_runtime": 3.4182,
+       "eval_samples_per_second": 146.277,
+       "eval_steps_per_second": 9.362,
+       "step": 500
+     },
+     {
+       "epoch": 0.52224,
+       "grad_norm": 175.58470153808594,
+       "learning_rate": 0.0001024783203151793,
+       "loss": 425.5378,
+       "step": 510
+     },
+     {
+       "epoch": 0.53248,
+       "grad_norm": 199.92291259765625,
+       "learning_rate": 9.938036044386005e-05,
+       "loss": 431.3893,
+       "step": 520
+     },
+     {
+       "epoch": 0.54272,
+       "grad_norm": 212.65650939941406,
+       "learning_rate": 9.628299531402117e-05,
+       "loss": 443.9659,
+       "step": 530
+     },
+     {
+       "epoch": 0.55296,
+       "grad_norm": 93.11270141601562,
+       "learning_rate": 9.318919783387094e-05,
+       "loss": 443.3476,
+       "step": 540
+     },
+     {
+       "epoch": 0.5632,
+       "grad_norm": 93.02433013916016,
+       "learning_rate": 9.010193748732155e-05,
+       "loss": 438.1048,
+       "step": 550
+     },
+     {
+       "epoch": 0.57344,
+       "grad_norm": 72.0661849975586,
+       "learning_rate": 8.702417748382385e-05,
+       "loss": 431.1463,
+       "step": 560
+     },
+     {
+       "epoch": 0.58368,
+       "grad_norm": 67.0578842163086,
+       "learning_rate": 8.395887191422397e-05,
+       "loss": 427.2931,
+       "step": 570
+     },
+     {
+       "epoch": 0.59392,
+       "grad_norm": 85.532958984375,
+       "learning_rate": 8.090896291537273e-05,
+       "loss": 424.9293,
+       "step": 580
+     },
+     {
+       "epoch": 0.60416,
+       "grad_norm": 72.48572540283203,
+       "learning_rate": 7.787737784620803e-05,
+       "loss": 424.9051,
+       "step": 590
+     },
+     {
+       "epoch": 0.6144,
+       "grad_norm": 237.96592712402344,
+       "learning_rate": 7.486702647802213e-05,
+       "loss": 425.6438,
+       "step": 600
+     },
+     {
+       "epoch": 0.6144,
+       "eval_loss": 6.683709621429443,
+       "eval_runtime": 3.436,
+       "eval_samples_per_second": 145.519,
+       "eval_steps_per_second": 9.313,
+       "step": 600
+     },
+     {
+       "epoch": 0.62464,
+       "grad_norm": 178.17239379882812,
+       "learning_rate": 7.188079820160904e-05,
+       "loss": 432.3896,
+       "step": 610
+     },
+     {
+       "epoch": 0.63488,
+       "grad_norm": 84.38874053955078,
+       "learning_rate": 6.892155925397436e-05,
+       "loss": 434.9848,
+       "step": 620
+     },
+     {
+       "epoch": 0.64512,
+       "grad_norm": 66.67383575439453,
+       "learning_rate": 6.59921499672677e-05,
+       "loss": 433.8923,
+       "step": 630
+     },
+     {
+       "epoch": 0.65536,
+       "grad_norm": 74.11187744140625,
+       "learning_rate": 6.309538204257977e-05,
+       "loss": 430.2817,
+       "step": 640
+     },
+     {
+       "epoch": 0.6656,
+       "grad_norm": 95.32003784179688,
+       "learning_rate": 6.02340358512196e-05,
+       "loss": 427.1533,
+       "step": 650
+     },
+     {
+       "epoch": 0.67584,
+       "grad_norm": 71.91348266601562,
+       "learning_rate": 5.7410857766062966e-05,
+       "loss": 425.3034,
+       "step": 660
+     },
+     {
+       "epoch": 0.68608,
+       "grad_norm": 95.72642517089844,
+       "learning_rate": 5.4628557525532976e-05,
+       "loss": 425.3343,
+       "step": 670
+     },
+     {
+       "epoch": 0.69632,
+       "grad_norm": 161.08612060546875,
+       "learning_rate": 5.188980563274315e-05,
+       "loss": 426.5362,
+       "step": 680
+     },
+     {
+       "epoch": 0.70656,
+       "grad_norm": 130.4775848388672,
+       "learning_rate": 4.9197230792299195e-05,
+       "loss": 431.4921,
+       "step": 690
+     },
+     {
+       "epoch": 0.7168,
+       "grad_norm": 102.47798919677734,
+       "learning_rate": 4.6553417387219886e-05,
+       "loss": 432.9831,
+       "step": 700
+     },
+     {
+       "epoch": 0.7168,
+       "eval_loss": 6.725553512573242,
+       "eval_runtime": 3.4338,
+       "eval_samples_per_second": 145.61,
+       "eval_steps_per_second": 9.319,
+       "step": 700
+     },
+     {
+       "epoch": 0.72704,
+       "grad_norm": 73.72420501708984,
+       "learning_rate": 4.421777466693434e-05,
+       "loss": 431.4859,
+       "step": 710
+     },
+     {
+       "epoch": 0.73728,
+       "grad_norm": 83.63558197021484,
+       "learning_rate": 4.167355837898584e-05,
+       "loss": 428.698,
+       "step": 720
+     },
+     {
+       "epoch": 0.74752,
+       "grad_norm": 69.27027893066406,
+       "learning_rate": 3.918532488602094e-05,
+       "loss": 428.0623,
+       "step": 730
+     },
+     {
+       "epoch": 0.75776,
+       "grad_norm": 107.68405151367188,
+       "learning_rate": 3.675546244046228e-05,
+       "loss": 425.6424,
+       "step": 740
+     },
+     {
+       "epoch": 0.768,
+       "grad_norm": 96.42312622070312,
+       "learning_rate": 3.438630326912414e-05,
+       "loss": 425.8188,
+       "step": 750
+     },
+     {
+       "epoch": 0.77824,
+       "grad_norm": 116.1615982055664,
+       "learning_rate": 3.208012133469799e-05,
+       "loss": 425.9528,
+       "step": 760
+     },
+     {
+       "epoch": 0.78848,
+       "grad_norm": 91.75414276123047,
+       "learning_rate": 2.9839130153161154e-05,
+       "loss": 426.8583,
+       "step": 770
+     },
+     {
+       "epoch": 0.79872,
+       "grad_norm": 79.31800079345703,
+       "learning_rate": 2.766548066920338e-05,
+       "loss": 425.4576,
+       "step": 780
+     },
+     {
+       "epoch": 0.80896,
+       "grad_norm": 108.06861114501953,
+       "learning_rate": 2.5561259191710407e-05,
+       "loss": 425.0249,
+       "step": 790
+     },
+     {
+       "epoch": 0.8192,
+       "grad_norm": 86.25403594970703,
+       "learning_rate": 2.3528485391286147e-05,
+       "loss": 426.0778,
+       "step": 800
+     },
+     {
+       "epoch": 0.8192,
+       "eval_loss": 6.630011558532715,
+       "eval_runtime": 3.4108,
+       "eval_samples_per_second": 146.591,
+       "eval_steps_per_second": 9.382,
+       "step": 800
+     },
+     {
+       "epoch": 0.82944,
+       "grad_norm": 80.1161117553711,
+       "learning_rate": 2.1569110361735677e-05,
+       "loss": 426.9529,
+       "step": 810
+     },
+     {
+       "epoch": 0.83968,
+       "grad_norm": 92.57079315185547,
+       "learning_rate": 2e-05,
+       "loss": 425.7674,
+       "step": 820
+     },
+     {
+       "epoch": 0.84992,
+       "grad_norm": 76.28582000732422,
+       "learning_rate": 2e-05,
+       "loss": 425.3365,
+       "step": 830
+     },
+     {
+       "epoch": 0.86016,
+       "grad_norm": 111.58943176269531,
+       "learning_rate": 2e-05,
+       "loss": 424.6131,
+       "step": 840
+     },
+     {
+       "epoch": 0.8704,
+       "grad_norm": 158.44044494628906,
+       "learning_rate": 2e-05,
+       "loss": 425.0354,
+       "step": 850
+     },
+     {
+       "epoch": 0.88064,
+       "grad_norm": 101.99372100830078,
+       "learning_rate": 2e-05,
+       "loss": 424.9413,
+       "step": 860
+     },
+     {
+       "epoch": 0.89088,
+       "grad_norm": 140.2552490234375,
+       "learning_rate": 2e-05,
+       "loss": 426.5773,
+       "step": 870
+     },
+     {
+       "epoch": 0.90112,
+       "grad_norm": 117.06301879882812,
+       "learning_rate": 2e-05,
+       "loss": 427.2063,
+       "step": 880
+     },
+     {
+       "epoch": 0.91136,
+       "grad_norm": 147.27670288085938,
+       "learning_rate": 2e-05,
+       "loss": 427.2577,
+       "step": 890
+     },
+     {
+       "epoch": 0.9216,
+       "grad_norm": 109.34888458251953,
+       "learning_rate": 2e-05,
+       "loss": 428.4192,
+       "step": 900
+     },
+     {
+       "epoch": 0.9216,
+       "eval_loss": 6.663826942443848,
+       "eval_runtime": 3.429,
+       "eval_samples_per_second": 145.817,
+       "eval_steps_per_second": 9.332,
+       "step": 900
+     },
+     {
+       "epoch": 0.93184,
+       "grad_norm": 145.62522888183594,
+       "learning_rate": 2e-05,
+       "loss": 428.9891,
+       "step": 910
+     },
+     {
+       "epoch": 0.94208,
+       "grad_norm": 90.70750427246094,
+       "learning_rate": 2e-05,
+       "loss": 429.0984,
+       "step": 920
+     },
+     {
+       "epoch": 0.95232,
+       "grad_norm": 92.83578491210938,
+       "learning_rate": 2e-05,
+       "loss": 429.0002,
+       "step": 930
+     },
+     {
+       "epoch": 0.96256,
+       "grad_norm": 125.1180648803711,
+       "learning_rate": 2e-05,
+       "loss": 428.4883,
+       "step": 940
+     },
+     {
+       "epoch": 0.9728,
+       "grad_norm": 132.3828125,
+       "learning_rate": 2e-05,
+       "loss": 428.6993,
+       "step": 950
+     },
+     {
+       "epoch": 0.98304,
+       "grad_norm": 100.2248306274414,
+       "learning_rate": 2e-05,
+       "loss": 429.164,
+       "step": 960
+     },
+     {
+       "epoch": 0.99328,
+       "grad_norm": 112.38407897949219,
+       "learning_rate": 2e-05,
+       "loss": 430.1383,
+       "step": 970
+     },
+     {
+       "epoch": 1.003072,
+       "grad_norm": 104.0753173828125,
+       "learning_rate": 2e-05,
+       "loss": 410.847,
+       "step": 980
+     },
+     {
+       "epoch": 1.013312,
+       "grad_norm": 137.40553283691406,
+       "learning_rate": 2e-05,
+       "loss": 431.1839,
+       "step": 990
+     },
+     {
+       "epoch": 1.023552,
+       "grad_norm": 191.53709411621094,
+       "learning_rate": 2e-05,
+       "loss": 431.9959,
+       "step": 1000
+     },
+     {
+       "epoch": 1.023552,
+       "eval_loss": 6.718299865722656,
+       "eval_runtime": 3.4132,
+       "eval_samples_per_second": 146.49,
+       "eval_steps_per_second": 9.375,
+       "step": 1000
+     },
+     {
+       "epoch": 1.033792,
+       "grad_norm": 188.12156677246094,
+       "learning_rate": 2e-05,
+       "loss": 432.0317,
+       "step": 1010
+     },
+     {
+       "epoch": 1.044032,
+       "grad_norm": 119.64492797851562,
+       "learning_rate": 2e-05,
+       "loss": 432.8214,
+       "step": 1020
+     }
+   ],
+   "logging_steps": 10,
+   "max_steps": 1024,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 2,
+   "save_steps": 1024,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 9.134583528711782e+18,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
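
A sketch of pulling the loss curves out of this file: entries in `log_history` carry `loss` for training steps and `eval_loss` for the evaluations run every 100 steps (`eval_steps` above). Plain JSON parsing; no extra dependencies.

```python
import json

with open("checkpoint-1024/trainer_state.json") as f:
    state = json.load(f)

# Training entries have "loss"; evaluation entries have "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train[-1])                        # (1020, 432.8214)
print(min(evals, key=lambda t: t[1]))   # best eval_loss: (500, 6.613161087036133)
```
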
checkpoint-1024/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f05808bb06f5a381500cd04f06a922b537630b6f18d2cfd53d4617345e2fb08c
+ size 5368
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "/home/vault/c106fa/c106fa11/minipile_marcus/regular/pythia1.4b_dedup_untrained",
+   "architectures": [
+     "GPTNeoXForCausalLM"
+   ],
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.1,
+   "eos_token_id": 0,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.0,
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "gpt_neox",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "partial_rotary_factor": 0.25,
+   "rope_scaling": null,
+   "rope_theta": 10000,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 0.25,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.47.1",
+   "use_cache": true,
+   "use_parallel_residual": true,
+   "vocab_size": 50304
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "transformers_version": "4.47.1"
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4831f2865cac402dbbf5d0cb1043c06f45dddf4b4dc5b9607866f458fb5f686
+ size 4977966760
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8bd81362c3715a9fa27fb35b320fc0061032b7ec51e441df30a17e7606f4815
+ size 680658856
model.safetensors.index.json ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ {
+ "metadata": {
+ "total_size": 5658591232
+ },
+ "weight_map": {
+ "embed_out.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.embed_in.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.final_layer_norm.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.final_layer_norm.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.0.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.22.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.22.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.23.attention.dense.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.attention.dense.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.attention.query_key_value.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.attention.query_key_value.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.input_layernorm.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.mlp.dense_4h_to_h.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.mlp.dense_4h_to_h.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.mlp.dense_h_to_4h.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.mlp.dense_h_to_4h.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.post_attention_layernorm.bias": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+ "gpt_neox.layers.3.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.attention.dense.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.attention.dense.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.attention.query_key_value.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.attention.query_key_value.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.input_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.mlp.dense_4h_to_h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.mlp.dense_4h_to_h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.mlp.dense_h_to_4h.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.mlp.dense_h_to_4h.weight": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.post_attention_layernorm.bias": "model-00001-of-00002.safetensors",
+ "gpt_neox.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors"
+ }
+ }
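The weight_map above assigns each tensor to one of the two shards: everything sits in model-00001 except embed_out.weight, the final layer norm, layer 22's mlp.dense_4h_to_h, and all of layer 23, which overflow into model-00002. Note that metadata.total_size counts tensor bytes only, so it is slightly smaller than the sum of the two shard file sizes, which also include the safetensors headers. A minimal sketch (not the uploader's code; it assumes the snapshot has been downloaded and the script runs inside that folder) of reassembling the full state dict from this index:

import json
from pathlib import Path

from safetensors.torch import load_file

index = json.loads(Path("model.safetensors.index.json").read_text())

state_dict = {}
for shard in sorted(set(index["weight_map"].values())):
    # Each shard holds a disjoint subset of the tensors named in weight_map.
    state_dict.update(load_file(shard))

# Every tensor listed in the index should now be present exactly once.
assert set(state_dict) == set(index["weight_map"])

In practice, transformers' AutoModelForCausalLM.from_pretrained performs this same index-driven sharded loading automatically.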
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f05808bb06f5a381500cd04f06a922b537630b6f18d2cfd53d4617345e2fb08c
+ size 5368
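training_args.bin is, by transformers.Trainer convention, a pickled TrainingArguments object (only 5368 bytes here). A hedged sketch for inspecting it locally: the path and the printed fields are assumptions (their values are not shown in this diff), and because the file is a pickle, weights_only=False is required on recent PyTorch, so only load it from a source you trust.

import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size)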