# File size: 3,599 Bytes

# 2705b16

# Top-level run settings: run name, output directory, seed, and step-related counters.
# NOTE(review): the meaning of each key is inferred from its name; confirm against the
# training code that consumes this file.
name: comma_v0p1_yolooooo
# checkpoint.path (/fsx/craffel/lingua_logs/comma_v0p1/checkpoints) sits under this directory.
dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/
# Differs from data.seed and model.seed, which are both 42.
seed: 777
# Presumably gradient-accumulation steps. If data.batch_size is per rank and there are
# dp_replicate x dp_shard = 64 ranks, one optimizer step would cover
# 2 x 4096 x 4 x 64 = 2,097,152 tokens -- TODO confirm.
grad_acc_steps: 4
gc_collect_freq: 1000
# null: no value is set for this option.
probe_freq: null
steps: 500000
# Data-loading settings: dataset root, per-source weights, batching, and tokenizer.
data:
  # Under /scratch, unlike the /fsx paths used for dump_dir, the tokenizer, and checkpoints.
  root_dir: /scratch/craffel/lingua/data/
  # 31 named sources, each mapped to a float weight. The weights sum to ~1.0.
  # Entries are mostly, but not strictly, in descending weight order
  # (e.g. wikiteam precedes doab, stackv2_html precedes usgpo).
  # NOTE(review): presumably sampling probabilities, and entry order may matter for
  # seeded sampling -- confirm before reordering.
  sources:
    peS2o: 0.274065475510351
    stackexchange: 0.134617935796937
    stackv2_edu: 0.127770669195666
    cccc: 0.0871992270000557
    wikimedia: 0.0861800315862719
    github_archive: 0.0606452345122248
    uspto: 0.0413469377516883
    pubmed: 0.0367902799837971
    arxiv_papers: 0.0292395449667613
    caselaw_access_project: 0.0193875362722656
    wikiteam: 0.0137485410839637
    doab: 0.0180439781895451
    uk_hansard: 0.0144498535570883
    pre_1929_books: 0.0115755547988338
    ubuntu_irc: 0.00794254267719456
    regulations: 0.00762583706405442
    data_provenance_initiative: 0.00512264496834867
    project_gutenberg: 0.00502100654070129
    youtube: 0.00465917165839394
    arxiv_abstracts: 0.00359635066160403
    stackv2_html: 0.00225924255952781
    usgpo: 0.00226024581728848
    library_of_congress: 0.00222469340783564
    biodiversity_heritage_library: 0.00221737524370278
    pressbooks: 0.000865101033213598
    libretexts: 0.00054149556727006
    news: 0.000372716196818104
    foodista: 0.000125363443065615
    oercommons: 7.78696843693821e-05
    python_enhancement_proposals: 1.69983991984805e-05
    public_domain_review: 1.05448719635173e-05
  # Presumably sequences per rank per micro-batch -- TODO confirm.
  batch_size: 2
  # Same value as model.max_seqlen (4096).
  seq_len: 4096
  n_views: 2
  # Same value as model.seed (42); the top-level seed is 777.
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    # NOTE(review): model.vocab_size is 64256; verify it agrees with this tokenizer file.
    path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken
# Optimizer and learning-rate schedule settings.
# NOTE(review): beta1/beta2/epsilon/weight_decay suggest an Adam-family optimizer, but the
# optimizer class is not named in this file -- confirm in the training code.
optim:
  lr: 0.001
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  # Presumably the gradient-clipping threshold -- TODO confirm.
  clip: 1.0
  # Selected schedule. Several schedule-related keys follow (cycle_length, cosine_theta,
  # annealing_step, decay_fraction, exp_factor); which of them the cosine schedule
  # actually reads cannot be determined from this file.
  scheduler: cosine
  # Presumably counted in optimizer steps (out of steps: 500000) -- TODO confirm.
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
# Model architecture settings (width, depth, attention heads, positional/rope and init options).
model:
  dim: 4096
  n_layers: 32
  # null here; with dim 4096 and n_heads 32 this is presumably derived as 4096 / 32 = 128
  # -- TODO confirm.
  head_dim: null
  n_heads: 32
  # null here; presumably falls back to n_heads -- TODO confirm.
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  # Plain string value "disabled" (not a boolean).
  init_std_factor: disabled
  # Same value as data.seq_len (4096).
  max_seqlen: 4096
  # Same value as data.seed (42); the top-level seed is 777.
  seed: 42
  # NOTE(review): should agree with the tokenizer at data.tokenizer.path -- verify.
  vocab_size: 64256
  weight_tying: false
  sliding_window: null
# Distributed-training settings: parallelism sizes, compilation, and numeric precision.
distributed:
  # dp_shard x dp_replicate x tp_size = 1 x 64 x 1 = 64.
  # NOTE(review): fsdp_type below is full_shard while dp_shard is 1; confirm how parameters
  # are sharded (if at all) with this combination.
  dp_shard: 1
  dp_replicate: 64
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  # Same dtype string as eval.generator.dtype (bf16).
  model_dtype: bf16
  # null: no float8 recipe is selected.
  float8_recipe: null
  # Regex matching the literal text "layers.", one or more digits, then ".".
  # Presumably only consulted when float8_recipe is set -- TODO confirm.
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
# String-valued settings keyed by upper-case names; presumably exported as process
# environment variables before training starts -- TODO confirm.
# Numeric-looking values are quoted so they load as strings rather than integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
# Checkpointing settings: two cadences (dump and eval), the output path, and init options.
checkpoint:
  dump:
    # Presumably in training steps: 500000 / 10000 = 50 dump points over the run.
    every: 10000
    # NOTE(review): -1 presumably means "keep every checkpoint" -- confirm.
    keep: -1
  eval:
    every: 2000
    # Presumably only the 3 most recent eval checkpoints are retained -- TODO confirm.
    keep: 3
  # Located under the top-level dump_dir (/fsx/craffel/lingua_logs/comma_v0p1/).
  path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints
  # null: no initial checkpoint path is configured.
  init_ckpt_path: null
  continue_training_from_init: false
# Profiler settings; profiling is switched on for this run (run: true).
profiling:
  run: true
  # Relative path; presumably resolved against dump_dir -- TODO confirm.
  trace_folder: profiling
  # Presumably warmup/step counts for memory and performance profiling windows, in
  # training steps -- TODO confirm.
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
# Logging settings.
logging:
  # Presumably "log every N steps", i.e. every step here -- TODO confirm.
  freq: 1
  acc_freq: null
  # null: no wandb configuration is provided in this file.
  wandb: null
# Evaluation settings: GPU count for async evaluation, the task list, and generator options.
# Presumably the number of GPUs used for asynchronous evaluation jobs -- TODO confirm.
async_eval_gpus: 8
eval:
  harness:
    # 13 tasks. Entries are either a bare task name or a mapping with `task` plus
    # `dataset_kwargs`; boolq, social_iqa and copa use the mapping form in order to set
    # trust_remote_code: true.
    # NOTE(review): task names look like lm-evaluation-harness tasks -- confirm.
    tasks:
    - hellaswag
    - task: boolq
      dataset_kwargs:
        trust_remote_code: true
    - piqa
    - task: social_iqa
      dataset_kwargs:
        trust_remote_code: true
    - winogrande
    - openbookqa
    - arc_easy
    - arc_challenge
    - race
    - commonsense_qa
    - task: copa
      dataset_kwargs:
        trust_remote_code: true
    - mmlu
    - mmlu_pro
  generator:
    # NOTE(review): 8192 is twice model.max_seqlen (4096); confirm this is intended and
    # what limit this value actually controls.
    max_tokens: 8192
    # Same dtype string as distributed.model_dtype (bf16).
    dtype: bf16