diff --git a/sbatch_4b284b84b10c4pyseed1.sh b/sbatch_4b284b84b10c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..a0d88cecadc82e691d41d2e0124eef99f5d42198 --- /dev/null +++ b/sbatch_4b284b84b10c4pyseed1.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b10c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b10c4py.txt +# "train: 0.1 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.9 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + 
--tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b10c4pyseed2.sh b/sbatch_4b284b84b10c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..2f6b8366ae8c85f3648a89d828dd78c0e026ffdf --- /dev/null +++ b/sbatch_4b284b84b10c4pyseed2.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b10c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b10c4py.txt +# "train: 0.1 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.9 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo 
"Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b10c4pyseed3.sh b/sbatch_4b284b84b10c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..d350d7e496a9e858876b45d506c7371bd142b647 --- /dev/null +++ b/sbatch_4b284b84b10c4pyseed3.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b10c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out 
logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b10c4py.txt +# "train: 0.1 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.9 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git 
a/sbatch_4b284b84b10c4pyseed4.sh b/sbatch_4b284b84b10c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..e9a97261e5514694b2be8d94fecf5aa10f96ff42 --- /dev/null +++ b/sbatch_4b284b84b10c4pyseed4.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b10c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b10c4py.txt +# "train: 0.1 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.9 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 
5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b20c4pyseed1.sh b/sbatch_4b284b84b20c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..3469a6c9680bfe704a528c8da2b23c31abeb7bf3 --- /dev/null +++ b/sbatch_4b284b84b20c4pyseed1.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b20c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b20c4py.txt +# "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model 
$NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b20c4pyseed2.sh b/sbatch_4b284b84b20c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..a1ff37f3efa3f1aaf1c03a66b7d1eb42484191e1 --- /dev/null +++ b/sbatch_4b284b84b20c4pyseed2.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b20c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s 
$SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b20c4py.txt +# "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b20c4pyseed3.sh 
b/sbatch_4b284b84b20c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..e4b69794729377aede4262b3e3b40cf787efeb88 --- /dev/null +++ b/sbatch_4b284b84b20c4pyseed3.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b20c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b20c4py.txt +# "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard 
\ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b20c4pyseed4.sh b/sbatch_4b284b84b20c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..5106de9f28e81332e37b600a30e231d65d5483e5 --- /dev/null +++ b/sbatch_4b284b84b20c4pyseed4.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b20c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b20c4py.txt +# "train: 0.2 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.8 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE 
kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b30c4pyseed1.sh b/sbatch_4b284b84b30c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..7d106afd542bfe6469828c18dadca867d25bff09 --- /dev/null +++ b/sbatch_4b284b84b30c4pyseed1.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b30c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + 
+KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b30c4py.txt +# "train: 0.3 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.7 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b30c4pyseed2.sh b/sbatch_4b284b84b30c4pyseed2.sh new file 
mode 100644 index 0000000000000000000000000000000000000000..e64e33e29cef38ccad1036295e31d37bafadd85d --- /dev/null +++ b/sbatch_4b284b84b30c4pyseed2.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b30c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b30c4py.txt +# "train: 0.3 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.7 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b30c4pyseed3.sh b/sbatch_4b284b84b30c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..95c14435c47228ba4527df54a71cc859f6561371 --- /dev/null +++ b/sbatch_4b284b84b30c4pyseed3.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b30c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b30c4py.txt +# "train: 0.3 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.7 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers 
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b30c4pyseed4.sh b/sbatch_4b284b84b30c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..04abd95f9879228236e5fb053de16c0bcdb290b7 --- /dev/null +++ b/sbatch_4b284b84b30c4pyseed4.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b30c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b30c4py.txt +# "train: 0.3 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.7 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b40c4pyseed1.sh b/sbatch_4b284b84b40c4pyseed1.sh new file mode 100644 index 
0000000000000000000000000000000000000000..5c53359ab610d817b555f66252efc57a6f8b8a34 --- /dev/null +++ b/sbatch_4b284b84b40c4pyseed1.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b40c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b40c4py.txt +# "train: 0.4 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.6 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b40c4pyseed2.sh b/sbatch_4b284b84b40c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..f23d243bcf8f5fd93405c0f15a70fab51c1ab496 --- /dev/null +++ b/sbatch_4b284b84b40c4pyseed2.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b40c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b40c4py.txt +# "train: 0.4 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.6 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b40c4pyseed3.sh b/sbatch_4b284b84b40c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..4713a82259d0d81470f1f5a96173b9a205f96dee --- /dev/null +++ b/sbatch_4b284b84b40c4pyseed3.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b40c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b40c4py.txt +# "train: 0.4 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.6 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b40c4pyseed4.sh b/sbatch_4b284b84b40c4pyseed4.sh new file mode 100644 index
0000000000000000000000000000000000000000..236dd72fe4f556a6a110f1e8c250d93432f4d5bb --- /dev/null +++ b/sbatch_4b284b84b40c4pyseed4.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b40c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b40c4py.txt +# "train: 0.4 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.6 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b50c4pyseed1.sh b/sbatch_4b284b84b50c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..40d7bf6e5c92e646f6f452531d8e2b79655644e9 --- /dev/null +++ b/sbatch_4b284b84b50c4pyseed1.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b50c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b50c4py.txt +# "train: 0.5 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.5 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b50c4pyseed2.sh b/sbatch_4b284b84b50c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..443ed346c0a3cb0524f72d984378061773eb6403 --- /dev/null +++ b/sbatch_4b284b84b50c4pyseed2.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b50c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b50c4py.txt +# "train: 0.5 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.5 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b50c4pyseed3.sh b/sbatch_4b284b84b50c4pyseed3.sh new file mode 100644 index
0000000000000000000000000000000000000000..c8e3b201d5e506fa8bc3b8708545c316ab7f82d0 --- /dev/null +++ b/sbatch_4b284b84b50c4pyseed3.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b50c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b50c4py.txt +# "train: 0.5 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.5 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b50c4pyseed4.sh b/sbatch_4b284b84b50c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..6010791bb273bdf6e19618e822acf4ede3f21876 --- /dev/null +++ b/sbatch_4b284b84b50c4pyseed4.sh @@ -0,0 +1,165 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b50c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b50c4py.txt +# "train: 0.5 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.5 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b60c4pyseed1.sh b/sbatch_4b284b84b60c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..6af97ad47da89c696d23e5c7b253fd468ae82d2f --- /dev/null +++ b/sbatch_4b284b84b60c4pyseed1.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b60c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b60c4py.txt +# "train: 0.6 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.4 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b60c4pyseed2.sh b/sbatch_4b284b84b60c4pyseed2.sh new file mode 100644 index
0000000000000000000000000000000000000000..4c2b8fb7e5f53c14f7cf7754875b9302f9118088 --- /dev/null +++ b/sbatch_4b284b84b60c4pyseed2.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b60c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b60c4py.txt +# "train: 0.6 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.4 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b60c4pyseed3.sh b/sbatch_4b284b84b60c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..560c1914c6c712f43c4e418a88016231b97cca87 --- /dev/null +++ b/sbatch_4b284b84b60c4pyseed3.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b60c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b60c4py.txt +# "train: 0.6 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.4 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b60c4pyseed4.sh b/sbatch_4b284b84b60c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..8adaf9d1c6f77295fb25a15e6fd90179ece39d2d --- /dev/null +++ b/sbatch_4b284b84b60c4pyseed4.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b60c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b60c4py.txt +# "train: 0.6 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.4 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b70c4pyseed1.sh b/sbatch_4b284b84b70c4pyseed1.sh new file mode 100644 index
0000000000000000000000000000000000000000..e258c3c4bf4c2314dc4ccda622f2c5041dc3ac27 --- /dev/null +++ b/sbatch_4b284b84b70c4pyseed1.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b70c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b70c4py.txt +# "train: 0.7 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.3 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b70c4pyseed2.sh b/sbatch_4b284b84b70c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..2f91afa525a06842eaf7cd1057aac2df7d950fc5 --- /dev/null +++ b/sbatch_4b284b84b70c4pyseed2.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b70c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b70c4py.txt +# "train: 0.7 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.3 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b70c4pyseed3.sh b/sbatch_4b284b84b70c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..911498c1588727a4bac56702709a30baf6d435d5 --- /dev/null +++ b/sbatch_4b284b84b70c4pyseed3.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b70c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b70c4py.txt +# "train: 0.7 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.3 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b70c4pyseed4.sh b/sbatch_4b284b84b70c4pyseed4.sh new file mode 100644 index
0000000000000000000000000000000000000000..a902769e0ca93bafbcdad9464631628c6e52c3cf --- /dev/null +++ b/sbatch_4b284b84b70c4pyseed4.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b70c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b70c4py.txt +# "train: 0.7 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.3 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b80c4pyseed1.sh b/sbatch_4b284b84b80c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..441672407de2d5c1d5ca953a99babb2cbdbab3b9 --- /dev/null +++ b/sbatch_4b284b84b80c4pyseed1.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b80c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b80c4py.txt +# "train: 0.8 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.2 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b80c4pyseed2.sh b/sbatch_4b284b84b80c4pyseed2.sh new file mode 100644 index 0000000000000000000000000000000000000000..405599db6b24a4d51df436234e9405257a3e8818 --- /dev/null +++ b/sbatch_4b284b84b80c4pyseed2.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b80c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b80c4py.txt +# "train: 0.8 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.2 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b80c4pyseed3.sh b/sbatch_4b284b84b80c4pyseed3.sh new file mode 100644 index
0000000000000000000000000000000000000000..408414a50d4e3332e91d7efb3fa434edb4cc99d6 --- /dev/null +++ b/sbatch_4b284b84b80c4pyseed3.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b80c4pyseed3 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b80c4py.txt +# "train: 0.8 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.2 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b80c4pyseed4.sh b/sbatch_4b284b84b80c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..1481d71aaab25ccd1a931dce7a2909a877c5d103 --- /dev/null +++ b/sbatch_4b284b84b80c4pyseed4.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b80c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b80c4py.txt +# "train: 0.8 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.2 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b90c4pyseed1.sh b/sbatch_4b284b84b90c4pyseed1.sh new file mode 100644 index 0000000000000000000000000000000000000000..c6ec29e87c93f9350694d24ff3860ca054a6f0ee --- /dev/null +++ b/sbatch_4b284b84b90c4pyseed1.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b90c4pyseed1 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b90c4py.txt +# "train: 0.9 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.1 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 1 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b90c4pyseed2.sh b/sbatch_4b284b84b90c4pyseed2.sh new file mode 100644 index
0000000000000000000000000000000000000000..a2792c4626920a85aa2b51ccee5b98a466fe0ee4 --- /dev/null +++ b/sbatch_4b284b84b90c4pyseed2.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b90c4pyseed2 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b90c4py.txt +# "train: 0.9 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.1 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + 
--log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 2 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b90c4pyseed3.sh b/sbatch_4b284b84b90c4pyseed3.sh new file mode 100644 index 0000000000000000000000000000000000000000..564830a767b796b0855ea6f45039e1fa4ac1762f --- /dev/null +++ b/sbatch_4b284b84b90c4pyseed3.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b90c4pyseed3 + +# if run without sbatch, invoke here
+if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT +CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b90c4py.txt +# "train: 0.9 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.1 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers
$NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 3 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/sbatch_4b284b84b90c4pyseed4.sh b/sbatch_4b284b84b90c4pyseed4.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d9e9c6aa8d988c75799d53d52f7b3104dfc5aeb --- /dev/null +++ b/sbatch_4b284b84b90c4pyseed4.sh @@ -0,0 +1,164 @@ +#!/bin/bash +#SBATCH --exclude=nid007571,nid007112,nid006774,nid007502,nid007506,nid007507,nid005145,nid006692,nid007218,nid007123,nid006124,nid006123,nid007496,nid007237,nid006852,nid007206,nid006947,nid007212,nid006977,nid007222,nid005444,nid007219,nid007493,nid007221,nid005300,nid005619,nid006118,nid005203,nid006113,nid006481,nid007077,nid005208,nid005207,nid005879,nid005901 +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=40 +#SBATCH --mem=256G +#SBATCH -p standard-g +#SBATCH -t 48:00:00 +#SBATCH --gpus-per-node=mi250:8 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +VARIANT=4b284b84b90c4pyseed4 + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest.out and logs/latest.err +ln -f -s $SLURM_JOB_ID.out logs/latest.out +ln -f -s $SLURM_JOB_ID.err logs/latest.err + +KILL_SWITCH_PATH=kill-switch-$VARIANT 
+CHECKPOINT_PATH=checkpoints_$VARIANT +TENSORBOARD_PATH=tensorboard_$VARIANT + +# Data +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +TRAIN_DATA_PATH=train84b90c4py.txt +# "train: 0.9 0:1 /scratch/project_462000119/data/c4_subsampled/gpt2tok_c4_en_84B_text_document, 0.1 0:1 /scratch/project_462000119/data/python/gpt2tok_python_84B_content_document" +VALID_DATA_PATH=valc4py.txt +# "validation: 1.0 0:1 /scratch/project_462000119/data/c4_validation/gpt2tok_c4validation_rerun_text_document" "validation_python: 1.0 0.95:1 /scratch/project_462000119/data/python/gpt2tok_python_content_document" + +PP_SIZE=1 +TP_SIZE=2 + +MICRO_BATCH_SIZE=2 +GRADIENT_ACCUMULATION_STEPS=1 +WORLD_SIZE=$((SLURM_GPUS_ON_NODE*SLURM_JOB_NUM_NODES)) +GLOBAL_BATCH_SIZE=$((MICRO_BATCH_SIZE*WORLD_SIZE*GRADIENT_ACCUMULATION_STEPS)) + +# Model parameters +source model_params.sh +MODEL_PARAM=("${PARAM_4516M[@]}") +NHIDDEN=${MODEL_PARAM[0]} +FFN_HIDDEN_SIZE=${MODEL_PARAM[1]} +KV_SIZE=${MODEL_PARAM[2]} +NHEADS=${MODEL_PARAM[3]} +NLAYERS=${MODEL_PARAM[4]} +SEQ_LEN=2048 + +echo "Model parameters: d_model $NHIDDEN ffw_size $FFN_HIDDEN_SIZE kv_size $KV_SIZE n_heads $NHEADS n_layers $NLAYERS" + +SAVE_INTERVAL=10000 + +# Tokens: 84_000_000_000 +# -> Samples: 41_015_625.0 +TRAIN_SAMPLES=41_015_625 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 2e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples $TRAIN_SAMPLES \ + --lr-warmup-samples 410_156 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --kv-channels $KV_SIZE \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples $TRAIN_SAMPLES \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --clip-grad 1.0 \ + --kill-switch-path $KILL_SWITCH_PATH \ + --bf16 \ + $OPTIMIZER_ARGS \ + " + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 5000 \ + --eval-iters 10 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat <<EOF > $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + +CMD=" \ + Megatron-DeepSpeed/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --train-weighted-split-paths-path $TRAIN_DATA_PATH \ + --valid-weighted-split-paths-path $VALID_DATA_PATH \ + --data-impl mmap \ + $DEEPSPEED_ARGS \ + --seed 4 \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +# bash launch_srun.sh $CMD +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756022.nid007048.61333.0
b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756022.nid007048.61333.0 new file mode 100644 index 0000000000000000000000000000000000000000..897ba13b9cb94e65fb2112eba57111412af59be5 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756022.nid007048.61333.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67bc92ca2ba02857de441b9e358b7c0c23a3e7e4d20c822a50f4bb41c035023 +size 19996 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756633.nid006995.89119.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756633.nid006995.89119.0 new file mode 100644 index 0000000000000000000000000000000000000000..9247fe9518278a8aa3664a5c1a4080cd87e60506 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683756633.nid006995.89119.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0333176e62af9844e2b9390582b8da6291daa668b619e3105c0df66e4961820 +size 113224610 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683928600.nid007131.83743.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683928600.nid007131.83743.0 new file mode 100644 index 0000000000000000000000000000000000000000..4efd0d616ff7be47e9727860f40b1408cdc6d14a --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683928600.nid007131.83743.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5e6659617a6329ad7bdfe6575be5e875bdff89b84f293b3299f6c735079f4d +size 30411691 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683974921.nid006671.116485.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683974921.nid006671.116485.0 new file mode 100644 index 0000000000000000000000000000000000000000..29d2a79d3f807359ef5bdd61ba0f990fa51c2531 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683974921.nid006671.116485.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c44a801835b83dc6284b98fee71d9599e2b78bddd36d063132446d76881a74f0 +size 40 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683978316.nid006500.80744.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683978316.nid006500.80744.0 new file mode 100644 index 0000000000000000000000000000000000000000..18f5e5b06498eba939945834d1ad43bc0bcf0a03 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed1/events.out.tfevents.1683978316.nid006500.80744.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5aad21bc9610e6b6c11f76b3b02aa534d7d3a6b8787b9f3ba60a32cb5fd98db +size 216295 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683756022.nid005878.51908.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683756022.nid005878.51908.0 new file mode 100644 index 0000000000000000000000000000000000000000..f4a6d2f466161a434070a5e270af2bfece0afd5b --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683756022.nid005878.51908.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62915fb10088dff87555a23741c81f59215fb2f8c6e29c529d3858a7f393696 +size 113531666 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683928065.nid006518.16895.0 
b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683928065.nid006518.16895.0 new file mode 100644 index 0000000000000000000000000000000000000000..2633ad44adaccae97a85804d0aec1ac0034080be --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683928065.nid006518.16895.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0736d9395b5347605e7b441f2eae463bc34465241b388aa5fe225eb35ed502f1 +size 30104633 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683973531.nid007019.108443.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683973531.nid007019.108443.0 new file mode 100644 index 0000000000000000000000000000000000000000..1e3618a57042f06cfe9219b4b7ae38b80b532f79 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683973531.nid007019.108443.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f873da0be24a74805be74b1468d4068c6196cedcf95532dd401243b8e463e4c +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683978316.nid006848.10586.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683978316.nid006848.10586.0 new file mode 100644 index 0000000000000000000000000000000000000000..748ae56c1ac8b6044f0e92c26fdc58974f94ab4e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed2/events.out.tfevents.1683978316.nid006848.10586.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39618b3ba29c6ecf0c6a27d79d2a9a07f1afbc2010f478accc507e7e4bdad8f4 +size 216295 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683756022.nid006518.29112.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683756022.nid006518.29112.0 new file mode 100644 index 0000000000000000000000000000000000000000..778b64afe9376105624518eebc2bfba858500eb4 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683756022.nid006518.29112.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f479f2724d0af83fbe9116768d933689c9102159cf721f25daed543a3a8c339e +size 113815604 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683928065.nid005878.6854.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683928065.nid005878.6854.0 new file mode 100644 index 0000000000000000000000000000000000000000..f2978626edd55448f33ef852a517834b8741169c --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683928065.nid005878.6854.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf920e80db0cc8fbdd8691551a242c53a2a4e9aaddd8a7cc5dea7e663d7af02d +size 29820697 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683974293.nid006598.20858.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683974293.nid006598.20858.0 new file mode 100644 index 0000000000000000000000000000000000000000..aae175a715fd046219bf201d6f5e21f84b68fb91 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683974293.nid006598.20858.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e21748690b8a869904b54a47db62873fe4b586067ed450219a6b926f5b511d +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683978316.nid007019.25418.0 
b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683978316.nid007019.25418.0 new file mode 100644 index 0000000000000000000000000000000000000000..4e95225aff2ffe738301eed03e0a83a29cea4534 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed3/events.out.tfevents.1683978316.nid007019.25418.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0b8256e9a31b5290f0eb83c30f3aa5fb604a96a61631d1b13667d5d2edbb1e +size 216295 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683756022.nid006586.32254.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683756022.nid006586.32254.0 new file mode 100644 index 0000000000000000000000000000000000000000..9140f6d43115da14925d19fe87693616c4024e28 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683756022.nid006586.32254.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a32662873dc5f8d16b35c82bd2970c50aef0dc39a6a1b110a0b03c3dda84dca +size 113637713 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683928007.nid006586.119158.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683928007.nid006586.119158.0 new file mode 100644 index 0000000000000000000000000000000000000000..3e649aba78c8916668057da3a8991f8c6b285dfd --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683928007.nid006586.119158.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f73905b9fffaa2561f5af6c0ee8cfcc909af15aab54d4b3f645fc274935bb7 +size 29998588 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973531.nid006518.41695.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973531.nid006518.41695.0 new file mode 100644 index 0000000000000000000000000000000000000000..207e2642250961aac989e59e5536fad862fd30b1 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973531.nid006518.41695.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9efa6c2ec9f50a6b5caad5ceb1ad750cfe4400ccbbf29e76ed9b109d3272643b +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973842.nid006518.47196.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973842.nid006518.47196.0 new file mode 100644 index 0000000000000000000000000000000000000000..f2c65d1eebd8b01d1cecf8044f41853f552c3bb5 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683973842.nid006518.47196.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9626e2cea087e864f55a45d250854c57f75a31aaa27936bba9524fe4f36d5c15 +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683978316.nid006906.17222.0 b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683978316.nid006906.17222.0 new file mode 100644 index 0000000000000000000000000000000000000000..37d8b7b14902a1988d6610e15dafd0d06341420d --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b10c4pyseed4/events.out.tfevents.1683978316.nid006906.17222.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d147cb246cfa1ed682c0b26c71e1204eacbd12a44b0aa78097fe3a865dd056 +size 216295 diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683756022.nid007019.95019.0 
new file mode 100644
index 0000000000000000000000000000000000000000..cb6feeacaffb133921c050292cf2d261ee0cbc5c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683756022.nid007019.95019.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b618a67347a72ce1bc03cfbe7e9c53383ce127d39653a74ac0536eb7fb9467b1
+size 113730956
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683928091.nid007019.83767.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683928091.nid007019.83767.0
new file mode 100644
index 0000000000000000000000000000000000000000..c7b704b267e13f5478601366c2d584962e62e543
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683928091.nid007019.83767.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0979b1f4bfba91bdc6c00a14edac95dc46df9da567d9a130f4652746c651cfec
+size 29905345
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683973371.nid005878.30890.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683973371.nid005878.30890.0
new file mode 100644
index 0000000000000000000000000000000000000000..6b397a4a725507598803554cf28af47b82e14db4
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed1/events.out.tfevents.1683973371.nid005878.30890.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c905392a14f9f7ba58751eb9a63549358d33b30ce41ad1645ede90e04dfa06f
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1683841277.nid006823.121017.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1683841277.nid006823.121017.0
new file mode 100644
index 0000000000000000000000000000000000000000..b179cb9795436b1c1cec829adc8f23e748aaacc7
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1683841277.nid006823.121017.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1336a2f48464892a331d5a3d3df5cc97bababa0d8d8c9c770bfe917a579eb31f
+size 113628228
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684014113.nid005608.17070.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684014113.nid005608.17070.0
new file mode 100644
index 0000000000000000000000000000000000000000..822d9721b446fe94a5b99ba1947cafe1f3ecbf9c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684014113.nid005608.17070.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e433a09e0a3c1c07c3eb9fad8f1d36d845017624eb179b4d1325fe1e40c3a598
+size 36146407
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684068700.nid005386.27271.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684068700.nid005386.27271.0
new file mode 100644
index 0000000000000000000000000000000000000000..ce0c3bb54c8741fe0c0cebc465aa961353413b82
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684068700.nid005386.27271.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18556196a9fdb23632241e450599f27274e1c3af265bf393b6e50891018a66c2
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684069021.nid005608.100235.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684069021.nid005608.100235.0
new file mode 100644
index 0000000000000000000000000000000000000000..3f4c16774bcfd692ee1505163b7dd4e35b6602d3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed2/events.out.tfevents.1684069021.nid005608.100235.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:267a000c6fa7e466e900c02907ab750f5fbc5a9b1882547b5b9ce0348bda7c64
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683756022.nid006436.6585.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683756022.nid006436.6585.0
new file mode 100644
index 0000000000000000000000000000000000000000..b7d775189b2ad90616392ccc36b69c96c348db44
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683756022.nid006436.6585.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95d1634ec49daf3599830f13e6aa7d208dc829be9fc9a691cdd4798ab522cb6b
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683763006.nid006643.129715.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683763006.nid006643.129715.0
new file mode 100644
index 0000000000000000000000000000000000000000..70a6d8f387c8d81db127df35cb7a13fbb64ef7f0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683763006.nid006643.129715.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c46105b78e25216a942fa8c9ab68f813da123abbc3c583fef2cbd019ee9be26e
+size 109292870
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683928773.nid006643.48244.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683928773.nid006643.48244.0
new file mode 100644
index 0000000000000000000000000000000000000000..7ff5aa62c17daf87741114a2848e5e39c3c092c3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed3/events.out.tfevents.1683928773.nid006643.48244.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0808a8dae26f6e579d75cbcdf7b97dc53cfbe7aaed6602126b14ab2bd8b8fdb7
+size 34343431
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683756022.nid006691.100882.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683756022.nid006691.100882.0
new file mode 100644
index 0000000000000000000000000000000000000000..f5a62cf0bdf9139992869351559ccffa00ec2ffe
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683756022.nid006691.100882.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a40e8e142be4b4ea4b5ad0f4deac17d00d6ca6e53373b146318857d2768b9f8
+size 113107478
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683928091.nid006691.57208.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683928091.nid006691.57208.0
new file mode 100644
index 0000000000000000000000000000000000000000..76579c65a52c8812055cc8661a433506c32d1fee
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683928091.nid006691.57208.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:330b5037477c4170a5fe2562b590a8a51bc4d5c76cceb1b06ad56dad4ae200ec
+size 30528823
diff --git a/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683974452.nid006691.87332.0 b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683974452.nid006691.87332.0
new file mode 100644
index 0000000000000000000000000000000000000000..df68e903eb8dcdaf79817e917a83ea05902ae0a5
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b20c4pyseed4/events.out.tfevents.1683974452.nid006691.87332.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3414f6dee1689ef1400b8b7c9898d7060a9e5f805a1dd5c4106a2fbf4fb1e04
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683756071.nid006387.50977.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683756071.nid006387.50977.0
new file mode 100644
index 0000000000000000000000000000000000000000..fc1818cfedbc04bb405a60ff784545b6a31dbc23
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683756071.nid006387.50977.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb158281b646952ce96d76531c7a1dfff772b831a274d90d9c4c581130715e2e
+size 113436704
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683928167.nid006387.43703.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683928167.nid006387.43703.0
new file mode 100644
index 0000000000000000000000000000000000000000..de7aa2e4a9962379b6e225c38918150c7aab46b5
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683928167.nid006387.43703.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ba83a232975f364072f1e9fd0388d7d585f2be6cb3cda6ef72d13feeaae376b
+size 30199596
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683973842.nid005608.81778.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683973842.nid005608.81778.0
new file mode 100644
index 0000000000000000000000000000000000000000..86930e03a69e06dd9781b4b66a59aa99a06f017d
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed1/events.out.tfevents.1683973842.nid005608.81778.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c0211576b2cc307ec54146d3196b3486b4dfe73f798be9c98d360e1e3be7b05
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683756071.nid005322.12196.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683756071.nid005322.12196.0
new file mode 100644
index 0000000000000000000000000000000000000000..60a18782f7e9e96763f019116f826d79f0bbf6ed
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683756071.nid005322.12196.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0614ba1b0f8ce610a0cf28deccecd176fda33c1606649452c6d7a3c7a30ade73
+size 113265689
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683928092.nid005322.101841.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683928092.nid005322.101841.0
new file mode 100644
index 0000000000000000000000000000000000000000..2e0cc13f52796c1b0d1445e58e28134c8a84cd5f
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683928092.nid005322.101841.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e361170123ff0fc380fe9311520dd57d052761bf3710b47de14d6c0edbf2635
+size 30370612
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974011.nid005322.127920.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974011.nid005322.127920.0
new file mode 100644
index 0000000000000000000000000000000000000000..963ff30d316a12acdd88896d60bbfba3f528f50c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974011.nid005322.127920.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ca9bcbbe96e25b3cc22b53797e59145cd1db446c31d16b393f617bca36fec14
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974293.nid007005.96298.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974293.nid007005.96298.0
new file mode 100644
index 0000000000000000000000000000000000000000..08dfff3dfe954fbd5c77c0851b6d5e83820a4d67
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed2/events.out.tfevents.1683974293.nid007005.96298.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2660796a888209cbce38ecd9b107b255ed45f47dd23acf90eced0366c790ee83
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683756785.nid007067.28515.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683756785.nid007067.28515.0
new file mode 100644
index 0000000000000000000000000000000000000000..5f0072c996fa6094887a8a1e8f220f221e46c290
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683756785.nid007067.28515.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ca1f024b8598e8f43732691c6a4afdaf0739ddfc0be4b1c29842ce6459da31c
+size 90668414
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683928169.nid007067.13645.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683928169.nid007067.13645.0
new file mode 100644
index 0000000000000000000000000000000000000000..56211683f97c0a36953f11672929a520898ba654
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1683928169.nid007067.13645.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66e7e94c5c41b65cb5e92183f1f45a0bdb195f99b5399d3ac6cc0f0f04464812
+size 52967887
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035117.nid005842.84783.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035117.nid005842.84783.0
new file mode 100644
index 0000000000000000000000000000000000000000..0d634e030f0d7134f4c66c7078614c50b553e1e2
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035117.nid005842.84783.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3608c86c32bb73cd84b27ebe2ff26c8f27d2d34a1fe5d250dcbbf1d361446fe
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035432.nid005842.93703.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035432.nid005842.93703.0
new file mode 100644
index 0000000000000000000000000000000000000000..b1cbd36fb251f6e9cb071ed0096d5801e4102ae0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed3/events.out.tfevents.1684035432.nid005842.93703.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d104ad339f3c9a41ae64c04d0ae6de64ccc3d3d83961dbd617cda1c281c386c
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683756785.nid007035.10756.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683756785.nid007035.10756.0
new file mode 100644
index 0000000000000000000000000000000000000000..6f8cf90c20cfb9ff09ebebf853481d064275c925
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683756785.nid007035.10756.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c451c417f1d555be5936e8bc7f024decc9d97ea1ec97fd7e7b84a15da3952e2a
+size 113195564
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683928168.nid007035.92078.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683928168.nid007035.92078.0
new file mode 100644
index 0000000000000000000000000000000000000000..711aa1085a3bdf1539065f7bf04e7a754262d310
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683928168.nid007035.92078.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac5fd1d1b0b4078354040017d964022d65ba2bec1bf753790e5320046fd08b83
+size 30440737
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974183.nid006518.56357.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974183.nid006518.56357.0
new file mode 100644
index 0000000000000000000000000000000000000000..4329fb557a53de22f6037093aa3659afb4a8d920
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974183.nid006518.56357.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48d73b3a97e8aab12da691079d5a94edc1d4d14eb9611a1c5a150b2e265cfbe8
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974488.nid006518.61660.0 b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974488.nid006518.61660.0
new file mode 100644
index 0000000000000000000000000000000000000000..05df972ff6c40301cce7523482f4dcc935c6357b
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b30c4pyseed4/events.out.tfevents.1683974488.nid006518.61660.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7855508c6b9e7e74b78d92746de898251b4b8c39f210c394b85b122e2f010b5
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683756785.nid006368.30413.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683756785.nid006368.30413.0
new file mode 100644
index 0000000000000000000000000000000000000000..8aa5d098f5ee2d03a30041fbee2c001e8dfe1610
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683756785.nid006368.30413.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ba4ba8c1a529d4d05b7fff201dfe12190518838d3af89ef08ec8707b4c17b4d
+size 113192126
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683928169.nid006368.116696.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683928169.nid006368.116696.0
new file mode 100644
index 0000000000000000000000000000000000000000..0333757962c118d5ec059f3b3eb18efd58479d3b
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683928169.nid006368.116696.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:edcb4429cd954028d813f099c9cbf6b8140751808f2fb9adc38b3d4087545372
+size 30444175
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974110.nid006368.12166.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974110.nid006368.12166.0
new file mode 100644
index 0000000000000000000000000000000000000000..e2805c8670593caadfae22652b4366a24561e723
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974110.nid006368.12166.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86e149c93bdcdf68718f6660ba1903275413f1022f030828ec337603362ecfb0
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974400.nid006368.17695.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974400.nid006368.17695.0
new file mode 100644
index 0000000000000000000000000000000000000000..3edb0541d33c259d6cc00b043d744b1dbbb52dd6
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed1/events.out.tfevents.1683974400.nid006368.17695.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3536d7fcf0c38ba6297033608af72f511ee63f7c10f48621ac1792316610348
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683756785.nid006848.95802.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683756785.nid006848.95802.0
new file mode 100644
index 0000000000000000000000000000000000000000..2f0732f47593e34e7fcb8a12967a199b7598b85e
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683756785.nid006848.95802.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:731febfd319d48cf6d18626670a10ac90c8711145e71bb83f926da9b97b9f20a
+size 112689218
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683928168.nid006848.79440.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683928168.nid006848.79440.0
new file mode 100644
index 0000000000000000000000000000000000000000..6de90201889c9bf2912728bb630495660e63dd3e
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683928168.nid006848.79440.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3aceb80fc097e62a6211cfa5185f9aa68adb6d1add8b27b8eed3b9a9473b5878
+size 30947083
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975192.nid006518.72674.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975192.nid006518.72674.0
new file mode 100644
index 0000000000000000000000000000000000000000..6ff1ddd9687ad654cfe2d50862f7dc42ef782a46
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975192.nid006518.72674.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9de986a4c73357cb181f9ec5f98884ba070e3f6dc6fbed1016445e4fb4f696d
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975467.nid006518.77944.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975467.nid006518.77944.0
new file mode 100644
index 0000000000000000000000000000000000000000..4fbbdf7119f4307a6e38a7ec6d91fb9b47427a63
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed2/events.out.tfevents.1683975467.nid006518.77944.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18db9cfe5594d58c39cc406836ff7991c27ed62291e993c3a50f03faf62b7875
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683756785.nid007236.84849.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683756785.nid007236.84849.0
new file mode 100644
index 0000000000000000000000000000000000000000..d0822c09ac264360a80d1f470de1248db90adf21
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683756785.nid007236.84849.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd74e258a091ba21cf993a38f1b274914c653dc17f50f2a7bed1e9c9306c6ceb
+size 47047555
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683959527.nid006037.101508.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683959527.nid006037.101508.0
new file mode 100644
index 0000000000000000000000000000000000000000..5b7208fcb439ba04bf6941bf6f98558d86b2775c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1683959527.nid006037.101508.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be5bb3ee15dabf61de7ff00621e71054b0febad1094861147ad4e125b5684ee4
+size 107941510
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122309.nid006037.61329.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122309.nid006037.61329.0
new file mode 100644
index 0000000000000000000000000000000000000000..8caab74cb3dcca5ce6b4add0409d7bf75094f5e3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122309.nid006037.61329.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78eee6723644b668bc1c9f7f2de90372e29d6492a3dc8583da28a96961ca28fa
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122637.nid005976.60301.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122637.nid005976.60301.0
new file mode 100644
index 0000000000000000000000000000000000000000..354b970584fa16d8a68942d918cff5960b9c8c4a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122637.nid005976.60301.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d0546ec9128e6a8ffbbb956a228f9a7148e4b83f4d42c7a76dc5f83fe703780
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122989.nid006037.70859.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122989.nid006037.70859.0
new file mode 100644
index 0000000000000000000000000000000000000000..44a625d626295367301772b9b4229eccbd8bed87
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed3/events.out.tfevents.1684122989.nid006037.70859.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d30070994b868531c99b5eaee7d44c22eb0ea3821a0a9d10299a785a99050a64
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683756785.nid006245.65368.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683756785.nid006245.65368.0
new file mode 100644
index 0000000000000000000000000000000000000000..63096f9a3eb722311a8b1987948891739d8ae8ed
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683756785.nid006245.65368.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e8c9060f70ab96b23ff7526b05e7ae80fc89b9e3b2d492cbbbe21e4cb4e3352
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683758855.nid006245.80136.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683758855.nid006245.80136.0
new file mode 100644
index 0000000000000000000000000000000000000000..d239e35f9739ed681cb63992cc501f95607c2d40
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683758855.nid006245.80136.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b172afcbe9c377d14e0a68631af66a00791ffe930ffd9adb5814b4bd8a94de7
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683761860.nid006353.97530.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683761860.nid006353.97530.0
new file mode 100644
index 0000000000000000000000000000000000000000..2773c26ca163507ea186424b65670661a10af8a3
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683761860.nid006353.97530.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6782b47f21c588028e34d706d820a7126aa867b304a5076d22448225187f0c80
+size 107814911
diff --git a/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683995643.nid006741.12263.0 b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683995643.nid006741.12263.0
new file mode 100644
index 0000000000000000000000000000000000000000..af9c3fb80914850d33cc817c826af6cdf8d9435e
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b40c4pyseed4/events.out.tfevents.1683995643.nid006741.12263.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d2ee0f65307b811c94c3f4595fc1df150965242983c37436f444cd1dcb9589
+size 35821390
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683756785.nid006598.37611.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683756785.nid006598.37611.0
new file mode 100644
index 0000000000000000000000000000000000000000..37ad2374040c5a96bf164f7c45feb776f5b72e36
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683756785.nid006598.37611.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42dc96596170bee218da16674ad1788f012031225919e2f40f48f31f89d547c3
+size 113280212
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683928269.nid006598.124121.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683928269.nid006598.124121.0
new file mode 100644
index 0000000000000000000000000000000000000000..aa90ca0f5707a791cb4a779ff13e31609fab05e4
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683928269.nid006598.124121.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f22ccb73f2075c083dd2bc7d0861b5af8104931ce749e42a26222d8e6c3af73
+size 30356088
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974208.nid006906.98764.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974208.nid006906.98764.0
new file mode 100644
index 0000000000000000000000000000000000000000..68eb7327d96a1f389b42ace649acc7be4ba49439
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974208.nid006906.98764.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:969b3c22793c70188eec47832c32dd498a7c0342aba9bb8470014fe2b20db047
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974488.nid005608.96015.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974488.nid005608.96015.0
new file mode 100644
index 0000000000000000000000000000000000000000..77076fd9e64d7572c31fd992e37d0233011dfed2
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed1/events.out.tfevents.1683974488.nid005608.96015.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbadef6c890b362be83e3385337fcba23aba7de4f849bef03aef5f79d2a4bde8
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683756785.nid006454.29972.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683756785.nid006454.29972.0
new file mode 100644
index 0000000000000000000000000000000000000000..2f0244a646f3a9deaa6e542e1a32622366d33574
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683756785.nid006454.29972.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1948b0c19e1711e5df5bcbad150944c2a5683cd91948c453d4fea004a012902e
+size 113193845
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683933342.nid007121.44982.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683933342.nid007121.44982.0
new file mode 100644
index 0000000000000000000000000000000000000000..d587159d72e92cd948c76e2cd4b29c372226efe7
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683933342.nid007121.44982.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7155e5b015d0b2ea79a97be668291501968454e2c61bb8b554ebaccaa44470a5
+size 30442456
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979297.nid006500.90953.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979297.nid006500.90953.0
new file mode 100644
index 0000000000000000000000000000000000000000..b8e2b80ffb765c7bd4378988ff3daa075682e287
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979297.nid006500.90953.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:878fa803e6272fb5b121daa9287a31f8a6f8755fa16c7798eeeca400600c3fdd
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979588.nid006175.89991.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979588.nid006175.89991.0
new file mode 100644
index 0000000000000000000000000000000000000000..6bdfc144e454a4ba4673a0091edfe59aa4d8c6f5
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed2/events.out.tfevents.1683979588.nid006175.89991.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bbf3ba6a849715d3729b53ca1fb4f3c35231349e27019d896f42b22a7a17e17
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683756785.nid006906.84696.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683756785.nid006906.84696.0
new file mode 100644
index 0000000000000000000000000000000000000000..5023a9e5e73f0b6af32a531b596ae93b9dd379ef
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683756785.nid006906.84696.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55e571a7289af6b133a7210ddc8a759dd012a73fdf385dfc0e41cc771332dfd8
+size 112578014
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683933342.nid006575.28541.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683933342.nid006575.28541.0
new file mode 100644
index 0000000000000000000000000000000000000000..d807b0086b53c2d25e77b5f1e4e03c4053e1fd24
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683933342.nid006575.28541.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44f4b5899b2d36b2033de9c9a65319e6522cfd6ce493553592b1882eced0e389
+size 31058287
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683980228.nid005815.118183.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683980228.nid005815.118183.0
new file mode 100644
index 0000000000000000000000000000000000000000..183aecda96c5c70e1bad8b403cfe513cf21f17eb
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed3/events.out.tfevents.1683980228.nid005815.118183.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8220f3f763bb15f8ef8cd35f8898ea1486052ad59273d60bf6d92f8643c2c12c
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683756785.nid006159.94330.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683756785.nid006159.94330.0
new file mode 100644
index 0000000000000000000000000000000000000000..a5a5e730eb3892be6bc9d97cc625e3208d765f8c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683756785.nid006159.94330.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e95ca2d44ad6539e3dc11ee4eadf09ac99220253acf247351a42121880e7a10d
+size 113192126
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683928355.nid006454.115744.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683928355.nid006454.115744.0
new file mode 100644
index 0000000000000000000000000000000000000000..414062ca73a2762f0884eeda68a2fb3d4d8e394a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683928355.nid006454.115744.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df6fd098e4f475e9db87e4250e7042f9f5c91f75f7bfacb40015e6384a96b9c
+size 30444175
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974362.nid005322.7273.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974362.nid005322.7273.0
new file mode 100644
index 0000000000000000000000000000000000000000..f9b54d60da539ce547e602eb8b8a05f3161660fd
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974362.nid005322.7273.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3111e941b25f125c7b290c01264720725e807827f830b3d9ce6b7df549fe67a
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974626.nid006531.29097.0 b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974626.nid006531.29097.0
new file mode 100644
index 0000000000000000000000000000000000000000..6720d7f12795d148ad2b6d2c5948b5f40ee9595a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b50c4pyseed4/events.out.tfevents.1683974626.nid006531.29097.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:292418434eea1edc6079df12500cfa3e98bcb960bd0e6095fdff09e42882a012
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683756785.nid006703.6467.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683756785.nid006703.6467.0
new file mode 100644
index 0000000000000000000000000000000000000000..c50d796677af4de6554193338a2b5fb65a50b471
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683756785.nid006703.6467.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:516fdd5f5ad32f9246620366d4444f89d6eaf28bfc4a6fb8d366ad7c6d80533d
+size 112548968
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683928355.nid006703.95013.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683928355.nid006703.95013.0
new file mode 100644
index 0000000000000000000000000000000000000000..25f22286e2c42a58eec3287864a8045d5604a70c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683928355.nid006703.95013.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adddddafad5b1465eef7a85dff22fefd29700b5d126116f9ed4072d1bfdcfaa1
+size 31087333
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683975439.nid007019.3666.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683975439.nid007019.3666.0
new file mode 100644
index 0000000000000000000000000000000000000000..f9975f2f771494c4c1b34602fc4ccc1120d4c16f
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed1/events.out.tfevents.1683975439.nid007019.3666.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebcb6f2f2901a67a58cd121c2f7f0e9c2242405c927cafcb564fde75bba5ce29
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683756785.nid007048.71183.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683756785.nid007048.71183.0
new file mode 100644
index 0000000000000000000000000000000000000000..ee9a63f835ab494e4ce53e9e3575c87cdcaf2110
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683756785.nid007048.71183.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2aa25b12b2f0ef044adca84f6077d117a725082245ae8b3edb931c41f90eaec
+size 113263970
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683928355.nid006906.72116.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683928355.nid006906.72116.0
new file mode 100644
index 0000000000000000000000000000000000000000..93c097753a2843ce079608a79eb9d71929fed949
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683928355.nid006906.72116.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7e51c86d80b2a4cca38b1865d63e7d3ffced53b8c500365ba7d76e62791d192
+size 30372331
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974183.nid005608.90695.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974183.nid005608.90695.0
new file mode 100644
index 0000000000000000000000000000000000000000..4209e0bcfc284e41ed599908670f9df905af7517
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974183.nid005608.90695.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0d5c7b20cc0901591b6d6efa4521ae6df8ab2da863e9ba4e5b9bda8f7bc5b4f
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974488.nid005878.43858.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974488.nid005878.43858.0
new file mode 100644
index 0000000000000000000000000000000000000000..b1ce3b1a060861a942fe81e77fdae4c838f847b0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed2/events.out.tfevents.1683974488.nid005878.43858.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88cac4988d8d8abaf91325ad41e2b64f45ee2b23b8bffbabde41d4ad9486de34
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683756785.nid006112.91270.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683756785.nid006112.91270.0
new file mode 100644
index 0000000000000000000000000000000000000000..d0045e2c25ebffa5d302faca4b19fd33cc1d98a0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683756785.nid006112.91270.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8dbf352310957e5ac4034c4ce57a7d2fcca2d657e813dcecae1811d45ca50d9
+size 112592537
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683928355.nid006915.106150.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683928355.nid006915.106150.0
new file mode 100644
index 0000000000000000000000000000000000000000..c95c28ca1007c9e4fbefe511e01af02a79607379
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683928355.nid006915.106150.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25be5ce9c0e4da5cb3e301b302f4cb8dd27608b0de2148272dd9dcc2135b1db9
+size 31043764
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975388.nid006175.36022.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975388.nid006175.36022.0
new file mode 100644
index 0000000000000000000000000000000000000000..375c8319131467c8e99c27110719f5cc0693f759
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975388.nid006175.36022.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1033040eb4f2e8d8af6bb8a8c1f641a79ed7bb7035b28a5031fcabc5abdb9548
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975690.nid006647.127430.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975690.nid006647.127430.0
new file mode 100644
index 0000000000000000000000000000000000000000..2d9df47919bfff4e44e0b03f905bd4fb499e01a7
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed3/events.out.tfevents.1683975690.nid006647.127430.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31d6dab97443342d620fa0c377cf94de26b5a059bbe8509a2498f7d7dac94f72
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683756785.nid006480.41957.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683756785.nid006480.41957.0
new file mode 100644
index 0000000000000000000000000000000000000000..9df2b8944315d27064f895949fd3da67c3cf8cc5
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683756785.nid006480.41957.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54e482ab12a892cd3e8a98358bb81bb24bdfdbc337e6e7398178d83add860bd5
+size 113520581
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683928375.nid007048.25018.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683928375.nid007048.25018.0
new file mode 100644
index 0000000000000000000000000000000000000000..34fc902dad221ce1f3b3333b2e8c1eb260ed8726
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683928375.nid007048.25018.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e132cff8d6a286ea0eb28a94f15ceada033b5d7cc56ddce0a2e8219339e7238
+size 30115720
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683973923.nid007005.87354.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683973923.nid007005.87354.0
new file mode 100644
index 0000000000000000000000000000000000000000..18c0515bc20134e0a31345617c712599d8953158
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683973923.nid007005.87354.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8eaa4dc92818d1536e8dcc3c1272da0cd74e2a05702e7dccef0fd09e0329af54
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683974183.nid007035.121489.0 b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683974183.nid007035.121489.0
new file mode 100644
index 0000000000000000000000000000000000000000..2bdd9c704235802561e97ee94d3d06772351dc72
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b60c4pyseed4/events.out.tfevents.1683974183.nid007035.121489.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02e208d6c5a57be65bc5e9ef039c4c1fe87eb88f697a4ba9974af81938381c11
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683757092.nid007122.17875.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683757092.nid007122.17875.0
new file mode 100644
index 0000000000000000000000000000000000000000..a15ce27f3ad7417b6483d214e4b3724e5d4f37c8
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683757092.nid007122.17875.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ff4fdcbbe0782c3d4ea9a066cbd40d28c2d402fe9f97437d90535b6eb573dd1
+size 36753193
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683815243.nid007122.82070.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683815243.nid007122.82070.0
new file mode 100644
index 0000000000000000000000000000000000000000..1efbef77b080f79c682b6dd3704bba01ea8b154d
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683815243.nid007122.82070.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e9c648d62f7ab1f7e65cdecc6ba0d845c0219d363abc72b70f7931e032635fe
+size 6747813
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683826192.nid006755.7896.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683826192.nid006755.7896.0
new file mode 100644
index 0000000000000000000000000000000000000000..8e623fe644a5e0126a9a17c6df710e2d910b4287
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683826192.nid006755.7896.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95292e5e9cd8ec83932f1ab2559faccf989c8551bc16d7058882d8f849776c33
+size 107941510
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683989512.nid005976.2606.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683989512.nid005976.2606.0
new file mode 100644
index 0000000000000000000000000000000000000000..64ee37d14a927db5d144b16c278a31011e3bee3a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed1/events.out.tfevents.1683989512.nid005976.2606.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a985a61f27d0646eb478675792fed19c90bfd6dc1a69d0bfcfe677d5748df0d8
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683757092.nid007076.26934.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683757092.nid007076.26934.0
new file mode 100644
index 0000000000000000000000000000000000000000..c16ea1344283e3dc666c38233183fc30b59678eb
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683757092.nid007076.26934.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b4df3e38dd918e1bab99541c4d1c675a6a37ddac89895ae1e4f638ecea182e0
+size 112173506
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683928415.nid006767.127973.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683928415.nid006767.127973.0
new file mode 100644
index 0000000000000000000000000000000000000000..0fbd93951f639c1649bf1351cf5a4f0bd1a9aadd
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683928415.nid006767.127973.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:179c5b47ad730a037bbd513e5cadcf645c5ad0e96fa3ed3917cecc38825d5dea
+size 31462795
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976409.nid006848.121247.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976409.nid006848.121247.0
new file mode 100644
index 0000000000000000000000000000000000000000..03f7921e82bb9f4daf9a3d61b90c5420e6dacdd6
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976409.nid006848.121247.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70165e13a7510e759b3b9d92444e3ac6bd8ddf85c2c48ef59cdfc5fadde7d34a
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976714.nid006500.57058.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976714.nid006500.57058.0
new file mode 100644
index 0000000000000000000000000000000000000000..31f577a60992c5ecd03fa3f1349ac13d5668af91
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed2/events.out.tfevents.1683976714.nid006500.57058.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63c6b29111fae034f2e4b1e31d3a44448676a9e35fa4b038c50df4c1680ebffe
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683757092.nid006768.124317.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683757092.nid006768.124317.0
new file mode 100644
index 0000000000000000000000000000000000000000..1863abc3dd206a3ca583daa0f345761d1f1f7e2a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683757092.nid006768.124317.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3353aa9dcd1fe177873167939b805f0a7de04e63c27eafd2d76433e70c12181
+size 112908188
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683928375.nid006480.130442.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683928375.nid006480.130442.0
new file mode 100644
index 0000000000000000000000000000000000000000..38b1fe9b468ffd13194c41d158047a87033d2618
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683928375.nid006480.130442.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15fd9bbafeba7d93dae87549d92f241e5eb126de1cd6f520258f69a54740aab3
+size 30728112
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974713.nid006480.28035.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974713.nid006480.28035.0
new file mode 100644
index 0000000000000000000000000000000000000000..fa3fbeb54a8ce88ea556ddb804228b30e89b3593
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974713.nid006480.28035.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b630ea79fa2df1ae6b7a876d962e33356307f07398f837a61554a49494f49dce
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974921.nid006500.33092.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974921.nid006500.33092.0
new file mode 100644
index 0000000000000000000000000000000000000000..bf67655ab8256dd9ec18e040f95a928c213f9235
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed3/events.out.tfevents.1683974921.nid006500.33092.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79629d6e9e9ba109e7b195c253b10266b177634e337af85d979e8df5c7307eb5
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683757092.nid006443.98262.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683757092.nid006443.98262.0
new file mode 100644
index 0000000000000000000000000000000000000000..ea17b07563c0206d9b94c9d5a7c77fcdc5b286a0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683757092.nid006443.98262.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:022a4bf1b4dc4e7cdc0acbcac439c0a8467e12e6db7357eadd6b33728fd38838
+size 113281931
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683937613.nid006472.54032.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683937613.nid006472.54032.0
new file mode 100644
index 0000000000000000000000000000000000000000..977ae50b4bd2f7330fb38716147409ca5150148c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683937613.nid006472.54032.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f064192901e29a00b2e23bb0e5c43524fc3b5a4cad58e6c0f6c5917d7f165a6
+size 30354370
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983649.nid005976.85672.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983649.nid005976.85672.0
new file mode 100644
index 0000000000000000000000000000000000000000..b56c614a434a912a7e51b7708445a4113053454a
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983649.nid005976.85672.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:446764d61bcbb8741071aff1f423bf5a2ee507d156afae2a7c35a6f6dd65a1fe
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983977.nid005976.91479.0 b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983977.nid005976.91479.0
new file mode 100644
index 0000000000000000000000000000000000000000..f7e701cf188ceb6da9487e302f5ff53034f38fd0
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b70c4pyseed4/events.out.tfevents.1683983977.nid005976.91479.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35a794efc31271d0533ba172276d87c6858b2f9281c411f2b0ef7b02c393894c
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683757092.nid007131.96964.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683757092.nid007131.96964.0
new file mode 100644
index 0000000000000000000000000000000000000000..3ab4e09ef32f7cc21d5d2883ee63be84c6df9054
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683757092.nid007131.96964.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:982da350e5598e2573a71185ba5eae4d368581a3b8513e8dd68b2e08a83df36e
+size 111203612
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683928578.nid006937.28397.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683928578.nid006937.28397.0
new file mode 100644
index 0000000000000000000000000000000000000000..69143fb1f44e79bfc257124305c31a6f879b6142
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683928578.nid006937.28397.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d28d64ea16d07856c9508324620267957bdbd9d196869a75ce4270436a2eb100
+size 32432689
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683977530.nid006500.72000.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683977530.nid006500.72000.0
new file mode 100644
index 0000000000000000000000000000000000000000..ebbfec48bf78b6d2b25ad0c33d9cee963fcaab9d
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed1/events.out.tfevents.1683977530.nid006500.72000.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e62901cb97392578448e7ba628e37f43e168c615816978d439bd3375ada28bbe
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683757092.nid006937.74303.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683757092.nid006937.74303.0
new file mode 100644
index 0000000000000000000000000000000000000000..21fe8d0ec14d763153cffd8d03d1b80a075def52
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683757092.nid006937.74303.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b136786d960c5db3ae41f7a3143c9ab90fbff6e318efad3ba9bb9e3fcb59188c
+size 110291810
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683928552.nid007076.113818.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683928552.nid007076.113818.0
new file mode 100644
index 0000000000000000000000000000000000000000..816ecf2f1497931a7ce95ac3de0254d634ab4acf
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683928552.nid007076.113818.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f450a3d87a22d6c1e3b6c0c4e2f4e50b3253a1b07e299e21cb7619f9951abfd
+size 6200475
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683939978.nid007076.59364.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683939978.nid007076.59364.0
new file mode 100644
index 0000000000000000000000000000000000000000..9e43a93fb7046d395bfa93d6237f9044799508a5
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683939978.nid007076.59364.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5fee94a78e0066616469f0cc2bb48785acb9b0a1e56ee8dcc3c011257148fd2
+size 33344491
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683990712.nid005976.13811.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683990712.nid005976.13811.0
new file mode 100644
index 0000000000000000000000000000000000000000..b21b8dc42870a58395017cca3750033ad0bf795b
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed2/events.out.tfevents.1683990712.nid005976.13811.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26d9723268ab227237be1a3b15d2131210a3c97a20eba94970d17e551da2afa5
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683757092.nid007239.113250.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683757092.nid007239.113250.0
new file mode 100644
index 0000000000000000000000000000000000000000..066a9a3b4e990efc5aeff503c88153366f52b641
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683757092.nid007239.113250.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c8af0e7eff25e77837b9fee637a97f419cb604bff231c5445258b82b9421fd7
+size 111588440
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683928579.nid006608.51282.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683928579.nid006608.51282.0
new file mode 100644
index 0000000000000000000000000000000000000000..1284d951aaa4ebfda7e3ade11e4bd6c8adf2609c
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683928579.nid006608.51282.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84d5b500a327076da697199598f646c76d827baf56540e414aad7f32efea5a29
+size 32047861
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977239.nid006500.66520.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977239.nid006500.66520.0
new file mode 100644
index 0000000000000000000000000000000000000000..ff1658a71a6bb6fe8aac9aa6be14a0d7560b2ff6
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977239.nid006500.66520.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:315022f79cd4b2dafc5db86b636994e2f229aa980f0e7d85b5ce8ffde6ea0d11
+size 22933
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977530.nid006848.1513.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977530.nid006848.1513.0
new file mode 100644
index 0000000000000000000000000000000000000000..5c3a80f91f182b2d7b4969e2bf2dd6f421532ff2
--- /dev/null
+++ b/tensorboard/tensorboard_4b284b84b80c4pyseed3/events.out.tfevents.1683977530.nid006848.1513.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e05447e75a39bd2845df9749ac3b13de3a5bcc32eb6f070fa2603bbd47933e0
+size 40
diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683757092.nid007068.74992.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683757092.nid007068.74992.0
b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683757092.nid007068.74992.0 new file mode 100644 index 0000000000000000000000000000000000000000..93ec53ec2a1b372e386178083dba6e98486d7801 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683757092.nid007068.74992.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9b6128f23801baa972c3daef3f0079d51939dd9e51bc0080dd2334e2597325f +size 112638773 diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683928601.nid006861.43876.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683928601.nid006861.43876.0 new file mode 100644 index 0000000000000000000000000000000000000000..dc017ba7c994b77708fd716607e44b7d9912823e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683928601.nid006861.43876.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba321f7cbd12ca2fdbda99c3df80c97950940b760f48a96482055a862f8c138a +size 30997528 diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975439.nid006619.45199.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975439.nid006619.45199.0 new file mode 100644 index 0000000000000000000000000000000000000000..64abb129dd09c8a02f8d2e38190504801a92c698 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975439.nid006619.45199.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a7143720e1ab99a75c0cce9a77af65445c723738ce07aceb3c8d24fd8ca99ed +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975690.nid005486.94515.0 b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975690.nid005486.94515.0 new file mode 100644 index 0000000000000000000000000000000000000000..0e915afbd640115734fcec9a8add8cd4d6dc869a --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b80c4pyseed4/events.out.tfevents.1683975690.nid005486.94515.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cc4d2dd48573330359e32cdbeda7a1ef7666c663354bbf17ecc973b0d5c13c +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683760252.nid005976.48400.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683760252.nid005976.48400.0 new file mode 100644 index 0000000000000000000000000000000000000000..07882077ddbdcd52aa0f97d60fee0cda575c7b3e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683760252.nid005976.48400.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c54b51718deac9f7ba927dedee55da599b3ed71bdd5e98fa088d84e8461b201c +size 109190261 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683928843.nid006353.21667.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683928843.nid006353.21667.0 new file mode 100644 index 0000000000000000000000000000000000000000..4b39e3342161ef5c8e2a2a78d7b34b405ce11853 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683928843.nid006353.21667.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e51db4ada04964ad47bee28be0d1aa3189491ee0f3dcf5d05153444941bbed9 +size 34446040 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683981891.nid007127.12414.0 
b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683981891.nid007127.12414.0 new file mode 100644 index 0000000000000000000000000000000000000000..b83a603e7ab1063dbc98e0aa7d6a89adbb1ffe22 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683981891.nid007127.12414.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb7f6079fed1a7d0fc38a6dbf57a794bbe70dff3342e27d03cc7fb693a61a0b8 +size 40 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683982063.nid006848.69099.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683982063.nid006848.69099.0 new file mode 100644 index 0000000000000000000000000000000000000000..6862455b7c05793a62c9a6348ab44e2917b3bb9e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed1/events.out.tfevents.1683982063.nid006848.69099.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88a003f59ffd6de49d2cb0022972f826a32b2b1cbee3e33a0c0befe31d3241cf +size 40 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683760252.nid007191.60298.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683760252.nid007191.60298.0 new file mode 100644 index 0000000000000000000000000000000000000000..241e2a216cc2cb81f443c48c4bde2a70af12ebdb --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683760252.nid007191.60298.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36c74fabb4b3e9873078cbfc672a2e06b65ccc00362ba877c8e2d6f891f8fac +size 108659078 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683928843.nid007191.26713.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683928843.nid007191.26713.0 new file mode 100644 index 0000000000000000000000000000000000000000..5b8b6bdd3f3372e06cf7c01375ff3a31adaee2ea --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683928843.nid007191.26713.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a7a41ea7bd84b0575ba67b4c8d400386571b5a8ee72fad86295139f4bef1e54 +size 34977223 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683981891.nid006906.55655.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683981891.nid006906.55655.0 new file mode 100644 index 0000000000000000000000000000000000000000..cbad1efa216226f31506f4c9bec35dfa7252f24f --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683981891.nid006906.55655.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:478a10a6d837b6ef921391b9a563d5a4a60f9da727512754929ce667af03673a +size 40 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683982063.nid006500.13113.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683982063.nid006500.13113.0 new file mode 100644 index 0000000000000000000000000000000000000000..02b07488547d7cb98e121d549609f7999a76483e --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed2/events.out.tfevents.1683982063.nid006500.13113.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48128a5d19d9a80099a8040971515f67295676ff1f81e5dcdf057658a0644dd8 +size 40 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683760252.nid006895.88968.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683760252.nid006895.88968.0 new 
file mode 100644 index 0000000000000000000000000000000000000000..dac0b40acc33f929ee3c2fc5dd483ea29b209525 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683760252.nid006895.88968.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e549893bcca06718b75eb10fcfde01b8b6356df6f140ae7714d39a6645847ca5 +size 108588953 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683928843.nid005976.119471.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683928843.nid005976.119471.0 new file mode 100644 index 0000000000000000000000000000000000000000..03fae9ba31b79079f5bfc934207f4144967a67b7 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683928843.nid005976.119471.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f33a206a904c28677949a07e4c3b89fd4d7df4ab4503046ffbbb0497e127f04f +size 35047348 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981720.nid007019.75939.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981720.nid007019.75939.0 new file mode 100644 index 0000000000000000000000000000000000000000..f64ecc28a303e3b73696d7461f3736e280db98f0 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981720.nid007019.75939.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cda0799603a7bb8f8013e962dd3c3c6a574a34b4ac7f0d1c6650c916c2d18fd +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981999.nid005976.62833.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981999.nid005976.62833.0 new file mode 100644 index 0000000000000000000000000000000000000000..49f6b9a597f7b7f33005c18ee9a5081fde1ab787 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed3/events.out.tfevents.1683981999.nid005976.62833.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ea86a3bde442cbc98ef7609f3a6217444ddb273a0a0d0ad0f0e06f71060340 +size 40 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683760252.nid005684.97093.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683760252.nid005684.97093.0 new file mode 100644 index 0000000000000000000000000000000000000000..3e6ad1656558e3a7ada3891e607e81d11696aec9 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683760252.nid005684.97093.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f01538f4fad536c7735352faae6e2bce3b7a56097bd0d194c0dd086fc9e9d887 +size 108822446 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683928725.nid005684.33165.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683928725.nid005684.33165.0 new file mode 100644 index 0000000000000000000000000000000000000000..3e7d9f208ee6a95b1ef072ccd5ec7f47b7345a70 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683928725.nid005684.33165.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25e39522067cfdaba85b53c4d242e0befdcbb73a72321e2213d0b0285ac3fbb0 +size 34813854 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981481.nid005684.102018.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981481.nid005684.102018.0 new file mode 100644 index 
0000000000000000000000000000000000000000..b31e76a897a7ada292b8020b7f6697be3e26cded --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981481.nid005684.102018.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9bb977ff2cd0df92ad635183bc1a41b7987cdfdee9134ca1a76cf2691b1f37d +size 22933 diff --git a/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981720.nid006848.63340.0 b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981720.nid006848.63340.0 new file mode 100644 index 0000000000000000000000000000000000000000..8425b73df549e5322be420034b82f97d52d60564 --- /dev/null +++ b/tensorboard/tensorboard_4b284b84b90c4pyseed4/events.out.tfevents.1683981720.nid006848.63340.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a1c1568ce50bad10a38686a397e75fa1476b51e2ed6bc94053d81f8ec331c02 +size 22933