---
# Experiment identity and top-level training-loop control.
name: comma_v0p1_yolooooo
# All logs and checkpoints for this run land under this directory.
dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
seed: 777
# Gradient-accumulation steps per optimizer step.
grad_acc_steps: 4
# Force a host-side garbage collection every N steps -- presumably to bound
# host memory growth; TODO confirm against the trainer.
gc_collect_freq: 1000
probe_freq: null
# Total optimizer steps for the full run.
steps: 500000
data:
  root_dir: /scratch/craffel/lingua/data/
  # Mixture sampling weight per source; the 31 weights sum to ~1.0.
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  # Micro-batch size per loader (accumulated grad_acc_steps times) --
  # presumably per-device; verify against the trainer.
  batch_size: 2
  # Matches model.max_seqlen below.
  seq_len: 4096
  n_views: 2
  # Data-order seed, independent of the top-level run seed.
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken
optim:
  # Adam-style hyperparameters (beta1/beta2/epsilon below).
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  # Gradient-norm clipping threshold.
  clip: 1.0
  # Cosine schedule with 2000 warmup steps; lr_min_ratio presumably sets the
  # floor as lr * ratio -- confirm in the scheduler implementation.
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  # annealing_step/decay_fraction/exp_factor apply to other scheduler modes --
  # presumably unused with scheduler: cosine; verify before relying on them.
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
model:
  # Transformer shape: width 4096, 32 layers, 32 attention heads.
  dim: 4096
  n_layers: 32
  # null presumably means head_dim is derived (dim / n_heads) -- confirm.
  head_dim: null
  n_heads: 32
  # null: no separate KV-head count configured here.
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  # FFN width rounded to a multiple of this value.
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled
  # Matches data.seq_len above (both 4096).
  max_seqlen: 4096
  # Weight-init seed, independent of the run and data seeds.
  seed: 42
  vocab_size: 64256
  weight_tying: false
  # null: full attention, no sliding-window limit.
  sliding_window: null
distributed:
  # 64-way data-parallel replication, no tensor parallelism.
  # NOTE(review): dp_shard: 1 alongside fsdp_type: full_shard means no
  # parameter sharding actually happens within a replica -- confirm intended.
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  # null: float8 training disabled; the filter below only applies when a
  # recipe is set.
  float8_recipe: null
  # Regex selecting per-layer modules for float8. Single-quoted so the
  # backslashes are unambiguously literal to any YAML parser or formatter
  # (the parsed string is unchanged from the previous plain scalar).
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
env:
  # Environment variables for the training processes. Numeric values are
  # quoted so they remain strings when this file is re-serialized.
  MKL_SERVICE_FORCE_INTEL: GNU
  # Pin BLAS/OpenMP to one thread per process to avoid CPU oversubscription.
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
checkpoint:
  dump:
    # Persist a checkpoint every 10k steps; keep: -1 presumably retains all
    # of them -- confirm the sentinel's meaning in the checkpoint manager.
    every: 10000
    keep: -1
  eval:
    # Evaluation checkpoints every 2k steps, keeping only the last 3.
    every: 2000
    keep: 3
  path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
  # No initial checkpoint: training starts from scratch.
  init_ckpt_path: null
  continue_training_from_init: false
profiling:
  run: true
  # Relative to dump_dir -- presumably; verify where the profiler resolves it.
  trace_folder: profiling
  # Memory-snapshot and torch-profiler windows (warmup steps, then captured steps).
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
logging:
  # Log metrics every step.
  freq: 1
  acc_freq: null
  # null presumably disables Weights & Biases -- confirm in logger setup.
  wandb: null
# GPUs set aside for evaluation running alongside training.
async_eval_gpus: 8
eval:
  harness:
    # Task list in lm-evaluation-harness style: entries are either bare task
    # names or mappings that pass extra dataset kwargs.
    tasks:
    - hellaswag
    # NOTE(review): trust_remote_code: true lets the Hugging Face dataset
    # script run arbitrary code at load time -- audit these three datasets.
    - task: boolq
      dataset_kwargs:
        trust_remote_code: true
    - piqa
    - task: social_iqa
      dataset_kwargs:
        trust_remote_code: true
    - winogrande
    - openbookqa
    - arc_easy
    - arc_challenge
    - race
    - commonsense_qa
    - task: copa
      dataset_kwargs:
        trust_remote_code: true
    - mmlu
    - mmlu_pro
  generator:
    max_tokens: 8192
    dtype: bf16