rokjaer committed on
Commit 3c1f709 · verified · 1 Parent(s): bd19f16

Update config.json

Files changed (1)
  1. config.json +52 -45
config.json CHANGED
@@ -1,56 +1,63 @@
  {
    "architectures": [
-     "ScriptableCrammedBERT"
    ],
-   "num_transformer_layers": 16,
    "hidden_size": 768,
    "intermed_size": 3072,
-   "hidden_dropout_prob": 0.1,
    "norm": "LayerNorm",
    "norm_eps": 1e-12,
    "norm_scheme": "pre",
-   "nonlin": "GELUglu",
-   "tie_weights": true,
-   "decoder_bias": false,
-   "sparse_prediction": 0.25,
-   "loss": "cross-entropy",
    "objective_layout": "MLM",
-   "embedding": {
-     "vocab_size": null,
-     "pos_embedding": "scaled-sinusoidal",
-     "dropout_prob": 0.1,
-     "pad_token_id": 0,
-     "max_seq_length": 128,
-     "embedding_dim": 768,
-     "normalization": true,
-     "stable_low_precision": false
-   },
-   "attention": {
-     "type": "self-attention",
-     "causal_attention": false,
-     "num_attention_heads": 12,
-     "dropout_prob": 0.1,
-     "skip_output_projection": false,
-     "qkv_bias": false,
-     "rotary_embedding": false,
-     "seq_op_in_fp32": false,
-     "sequence_op": "torch-softmax"
-   },
-   "init": {
-     "type": "normal",
-     "std": 0.02
-   },
-   "ffn_layer_frequency": 1,
    "skip_head_transform": true,
-   "use_bias": false,
-   "final_norm": true,
-   "num_labels": 0,
-   "classification_head": {
-     "pooler": "avg",
-     "include_ff_layer": true,
-     "head_dim": 1024,
-     "nonlin": "Tanh",
-     "classifier_dropout": 0.1
-   },
-   "attn_implementation": null
  }
 
  {
+   "arch": {
      "architectures": [
+       "ScriptableCrammedBERT"
      ],
+     "attention": {
+       "causal_attention": false,
+       "dropout_prob": 0.1,
+       "num_attention_heads": 12,
+       "qkv_bias": false,
+       "rotary_embedding": false,
+       "seq_op_in_fp32": false,
+       "sequence_op": "torch-softmax",
+       "skip_output_projection": false,
+       "type": "self-attention"
+     },
+     "classification_head": {
+       "classifier_dropout": 0.1,
+       "head_dim": 1024,
+       "include_ff_layer": true,
+       "nonlin": "Tanh",
+       "pooler": "zero_index"
+     },
+     "decoder_bias": false,
+     "embedding": {
+       "dropout_prob": 0.1,
+       "embedding_dim": 768,
+       "max_seq_length": 128,
+       "normalization": true,
+       "pad_token_id": 0,
+       "pos_embedding": "scaled-sinusoidal",
+       "stable_low_precision": false,
+       "vocab_size": 32768
+     },
+     "ffn_layer_frequency": 1,
+     "final_norm": true,
+     "hidden_dropout_prob": 0.1,
      "hidden_size": 768,
+     "init": {
+       "std": 0.02,
+       "type": "normal"
+     },
      "intermed_size": 3072,
+     "loss": "cross-entropy",
+     "nonlin": "GELUglu",
      "norm": "LayerNorm",
      "norm_eps": 1e-12,
      "norm_scheme": "pre",
+     "num_labels": null,
+     "num_transformer_layers": 16,
      "objective_layout": "MLM",
      "skip_head_transform": true,
+     "sparse_prediction": 0.25,
+     "tie_weights": true,
+     "use_bias": false
+   },
+   "architectures": [
+     "ScriptableLMForPreTraining"
+   ],
+   "model_type": "crammedBERT",
+   "torch_dtype": "float32",
+   "transformers_version": "4.29.2"
  }
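
For reference, a minimal sketch (not part of this commit) of how the updated config could be consumed through the transformers auto classes. The repository id below is a placeholder, and the calls assume the repo also ships the custom crammedBERT modeling code with an auto_map entry (neither is shown in this diff), which is why trust_remote_code=True is passed.

from transformers import AutoConfig, AutoModelForMaskedLM

# Placeholder repository id -- substitute the actual model repo.
repo_id = "your-namespace/your-crammedbert-checkpoint"

# The "model_type": "crammedBERT" and top-level "architectures" fields added in
# this commit are what the auto classes rely on to dispatch to the custom code.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)

# The backbone hyperparameters now live in the nested "arch" dict.
print(config.arch["num_transformer_layers"], config.arch["hidden_size"])

model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)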