{
    "ngpus": 32,
    "tokens": 50257,
    "gpt_dir": "assets/gpt2-large",
    "outdir": "../output",
    "training": {
        "batch_size": 512,
        "accum": 1,
        "n_iters": 1000001,
        "snapshot_freq": 50000,
        "log_freq": 50,
        "eval_freq": 100,
        "snapshot_freq_for_preemption": 10000,
        "weight": "standard",
        "snapshot_sampling": false,
        "ema": 0.9999,
        "loss_type": "t_DCE"
    },
    "data": {
        "train": "openwebtext",
        "valid": "wikitext103",
        "cache_dir": "data"
    },
    "noise": {
        "type": "loglinear",
        "sigma_min": 0.0001,
        "sigma_max": 20
    },
    "sampling": {
        "predictor": "euler",
        "steps": 1024
    },
    "eval": {
        "batch_size": 512,
        "perplexity": true,
        "perplexity_batch_size": 16
    },
    "optim": {
        "weight_decay": 0.03,
        "optimizer": "AdamW",
        "lr": 0.0003,
        "beta1": 0.9,
        "beta2": 0.999,
        "eps": 1e-08,
        "warmup": 2500,
        "grad_clip": 1.0
    },
    "model": {
        "name": "small_wotsm",
        "type": "ddit_wot",
        "hidden_size": 768,
        "cond_dim": 128,
        "length": 1024,
        "n_blocks": 12,
        "n_heads": 12,
        "dropout": 0.02,
        "use_checkpoint": false,
        "dtype": "float16"
    }
}