# File size: 3,852 Bytes
# 90fc0ea
# lightning.pytorch==2.4.0.dev20240728
# Global RNG seed for reproducible runs (applied by LightningCLI before fit).
seed_everything: 123
trainer:
  accelerator: gpu
  strategy:
    # DeepSpeed ZeRO stage 2: shards optimizer state and gradients across ranks.
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      # NOTE(review): optimizer offload is disabled, yet
      # offload_optimizer_device and the NVMe/AIO settings below are still
      # set — presumably ignored while this is false; confirm intent.
      offload_optimizer: false
      # NOTE(review): DeepSpeed parameter offload normally requires ZeRO
      # stage 3; with stage: 2 this flag may have no effect — confirm.
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      # AIO settings used by NVMe offload paths.
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      # Python logging level 30 == WARNING.
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      # Loss-scaling settings (fp16 machinery; presumably inert under
      # bf16-true precision — TODO confirm).
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      # Activation-checkpointing options (all disabled here).
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  # 8 GPUs on a single node.
  devices: 8
  num_nodes: 1
  # True bfloat16 weights, not mixed precision.
  precision: bf16-true
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  # Skip the pre-fit sanity validation pass.
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  # Optimizer steps once every 8 batches (gradient accumulation).
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    # Thought-generation settings: 2 sampled thoughts of 8 tokens each,
    # evaluated over a 4-token lookahead window (presumably a
    # Quiet-STaR-style setup — verify against the model implementation).
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    # Sampling settings used during thought generation.
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
  checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
  # NOTE(review): weight_decay and the learning rate are duplicated in
  # optimizer.init_args below — keep the two in sync.
  weight_decay: 0.001
  warmup_steps: 20
  policy_weight: 1.0
  init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
      - 0.9
      - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  # No LR scheduler configured; warmup_steps above is presumably handled
  # by the LightningModule itself — confirm.
  scheduler: null
# No checkpoint to resume training from.
ckpt_path: null
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        # Same directory as model.checkpoint_dir — keep the two in sync.
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    # Per-device batch size (effective batch grows with device count and
    # any gradient accumulation configured on the trainer).
    batch_size: 1
    # Matches model.config.train_max_length (120) — keep the two in sync.
    max_seq_length: 120
    num_samples: 2048
    # -100 is presumably the CrossEntropyLoss ignore index — confirm in
    # the data module.
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1