{ "data": { "cache_dir": "data", "train": "openwebtext", "valid": "wikitext103" }, "eval": { "batch_size": 512 }, "graph": { "file": "data", "type": "simplex" }, "model": { "cond_dim": 128, "dropout": 0.1, "embedding": 0, "hidden_size": 768, "length": 1024, "n_blocks": 12, "n_heads": 12, "name": "small", "scale_by_sigma": false, "score_strategy": 0, "type": "ddit" }, "ngpus": 16, "noise": { "sigma_max": 20, "sigma_min": 0.0001, "type": "geometric" }, "optim": { "beta1": 0.9, "beta2": 0.999, "eps": 1e-08, "grad_clip": 1.0, "lr": 0.0003, "optimizer": "AdamW", "warmup": 2500, "weight_decay": 0 }, "sampling": { "corr_steps": 1, "corrector": "none", "noise_removal": false, "predictor": "euler", "roc": 0.01, "steps": 1000 }, "tokens": 50257, "training": { "accum": 1, "batch_size": 512, "ema": 0.9999, "eval_freq": 100, "log_freq": 50, "n_iters": 1300001, "snapshot_freq": 10000, "snapshot_freq_for_preemption": 10000, "snapshot_sampling": true } }