# NOTE(review): the original dump carried a binary/size banner ("File size:
# 3,852 Bytes"), a hash fragment ("90fc0ea"), a line-number dump, and had lost
# all indentation. The nesting below is reconstructed from the LightningCLI
# config schema (seed_everything / trainer / model / ckpt_path / data, with
# class_path + init_args sub-mappings) — verify against the generating run.
---
# lightning.pytorch==2.4.0.dev20240728
seed_everything: 123
trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      # "auto" is the literal string Lightning expects here (plain scalar
      # parses as str, no quoting needed)
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
    checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
  # NOTE(review): the following four keys could belong under `config:`
  # instead of directly under `model:` — the original indentation was lost;
  # confirm against the LightningModule's __init__ signature.
  weight_decay: 0.001
  warmup_steps: 20
  policy_weight: 1.0
  init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
        - 0.9
        - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  scheduler: null
ckpt_path: null
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1