# File size: 3,599 Bytes (viewer id: 2705b16)
# Top-level run settings.
# Run name.
name: comma_v0p1_yolooooo
# Output directory for the run; the checkpoint path set in this file is a
# subdirectory of it.
dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
# Top-level seed (the data and model sections carry their own `seed` keys).
seed: 777
# Presumably the number of gradient-accumulation micro-steps — confirm.
grad_acc_steps: 4
# Presumably how often (in steps) garbage collection is triggered — confirm.
gc_collect_freq: 1000
# Left unset (null).
probe_freq: null
# Presumably the total number of training steps — confirm.
steps: 500000
# Data-loading section: dataset location, per-source weights, batching
# parameters and the tokenizer. Child keys are nested under `data` so that
# `seed` and `tokenizer.name` do not collide with the top-level `seed` and
# `name` keys.
data:
  root_dir: /scratch/craffel/lingua/data/
  # Numeric weight per data source (presumably sampling proportions for
  # sources found under root_dir — confirm against the data loader).
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  batch_size: 2
  seq_len: 4096
  n_views: 2
  # Data-section seed; distinct from the top-level `seed`.
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  # Tokenizer selection: implementation name plus the path to its file.
  tokenizer:
    name: tiktoken
    path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken
# Optimizer and learning-rate schedule settings. All keys are nested under
# `optim` so the mapping is not left empty.
optim:
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  # Presumably the gradient-clipping threshold — confirm.
  clip: 1.0
  # Schedule selection; the remaining keys are schedule parameters
  # (which ones apply to `cosine` depends on the consumer — confirm).
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Model architecture settings. Keys are nested under `model`; its `seed` is
# a separate key from the top-level `seed`.
model:
  dim: 4096
  n_layers: 32
  # null: left for the consumer to derive (presumably dim / n_heads — confirm).
  head_dim: null
  n_heads: 32
  # null: left unset (presumably defaults to n_heads — confirm).
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  # String value, not a boolean.
  init_std_factor: disabled
  # Same value as data.seq_len in this file.
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null
# Distributed-execution settings (parallelism degrees, compilation, dtypes).
# Keys are nested under `distributed` so the mapping is not left empty.
distributed:
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regular expression; single-quoted so the backslashes stay literal.
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# Environment variables (presumably exported for the training processes —
# confirm). Numeric-looking values are quoted so they stay strings.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpointing settings. `dump` and `eval` each carry their own
# `every`/`keep` pair, so they are nested mappings; `checkpoint.eval` is a
# different key from the top-level `eval` section.
checkpoint:
  # Presumably: save every 10000 steps, keep = -1 meaning keep all — confirm.
  dump:
    every: 10000
    keep: -1
  # Presumably: evaluation checkpoints every 2000 steps, keeping 3 — confirm.
  eval:
    every: 2000
    keep: 3
  path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false
# Profiler settings. Keys are nested under `profiling` so the mapping is not
# left empty.
profiling:
  run: true
  # Relative folder name (presumably resolved under the run's dump_dir —
  # confirm).
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Logging settings. Keys are nested under `logging` so the mapping is not
# left empty.
logging:
  freq: 1
  acc_freq: null
  # null: no wandb settings provided.
  wandb: null
# Top-level setting (presumably the number of GPUs used for asynchronous
# evaluation jobs — confirm).
async_eval_gpus: 8
# Evaluation settings: the harness task list and generator options.
eval:
  harness:
    # Each task is either a bare task name or a mapping with `task` plus
    # extra options; the mapping entries here set
    # dataset_kwargs.trust_remote_code to true.
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - task: copa
        dataset_kwargs:
          trust_remote_code: true
      - mmlu
      - mmlu_pro
  generator:
    max_tokens: 8192
    dtype: bf16