#!/bin/bash
#SBATCH --job-name=deep_vae_pretrain
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32 # number of CPU cores per task
#SBATCH --gres=gpu:1 # number of gpus
#SBATCH -o xxx/outputs/deep_vae/logs/slurm/%x-%j.log
#SBATCH -e xxx/outputs/deep_vae/logs/slurm/%x-%j.err
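# The extra space after '#' keeps sbatch from parsing these optional directives: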
# SBATCH --requeue
# SBATCH --qos=preemptive
set -x -e
ulimit -s unlimited
echo "START TIME: $(date)"
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# export MASTER_ADDR=127.0.0.1
export MASTER_PORT=$(( RANDOM % 10000 + 50000 ))
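# Per-GPU micro batch size and DeepSpeed ZeRO stage used in the generated config below.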
MICRO_BATCH_SIZE=64
ZERO_STAGE=0
ROOT_PATH=xxxx
config_json=${ROOT_PATH}/job_out/ds_config.json
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
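# Write the DeepSpeed JSON config that the Lightning DeepSpeed strategy will read.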
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
  "steps_per_print": 100,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "contiguous_gradients": false,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 50000000,
    "allgather_bucket_size": 500000000
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-5,
      "betas": [0.9, 0.95],
      "eps": 1e-8,
      "weight_decay": 1e-2
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 5e-6,
      "warmup_max_lr": 1e-5
    }
  },
  "zero_allow_untested_optimizer": false,
  "fp16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "activation_checkpointing": {
    "partition_activations": false,
    "contiguous_memory_optimization": false
  },
  "wall_clock_breakdown": false
}
EOT
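# Point Lightning's DeepSpeed strategy at the generated config and cache
# compiled torch/DeepSpeed extensions in a writable directory.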
export PL_DEEPSPEED_CONFIG_PATH=$config_json
export TORCH_EXTENSIONS_DIR=~/tmp
# NOTE both encoder and decoder use the same model
GPT2_MODEL_PATH=xxx
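# VAE hyper-parameters: 32-dim latent space; beta_kl_constraints_start/stop and
# beta_n_cycles presumably drive cyclical annealing of the KL weight from 1e-5 up to 1.0.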
VAE_ARGS="
--gpt2_model_path $GPT2_MODEL_PATH \
--latent_dim 32 \
--beta_kl_constraints_start 1e-5 \
--beta_kl_constraints_stop 1. \
--beta_n_cycles 40 \
"
CHECKPOINT_SAVE_PATH=${ROOT_PATH}/checkpoints
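# Lightning ModelCheckpoint settings: keep the single best checkpoint by
# validation reconstruction loss, checked every 1000 training steps, weights only.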
MODEL_CHECKPOINT_ARGS="\
--monitor val_recon_loss \
--save_top_k 1 \
--mode min \
--every_n_train_steps 1000 \
--save_weights_only True \
--dirpath $CHECKPOINT_SAVE_PATH \
--filename checkpoint-{epoch}-{step}-filenum_20_dim_32_beta_1e-5_1_zh_finance \
"
TRAINER_ARGS="
--max_epochs 40 \
--gpus 1 \
--num_nodes 1 \
--precision 16 \
--val_check_interval 1000 \
--learning_rate 5e-5 \
--warmup_steps 10000 \
--weight_decay 0.01 \
--default_root_dir ${ROOT_PATH} \
--log_every_n_steps 50 \
--strategy deepspeed_stage_2 \
"
# --strategy deepspeed_stage_2 \
# note we use wudao optimus instead of recreating a deepVAE dataset
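# Dataloader settings: per-split batch sizes, worker count, and the dataset name.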
DATA_ARGS="
--train_batchsize $MICRO_BATCH_SIZE \
--eval_batchsize $MICRO_BATCH_SIZE \
--test_batchsize $MICRO_BATCH_SIZE \
--num_workers 32 \
--ds_name zh_finance
"
# --ds_name wudao_tdvae, ner_re_data, zh_finance
# --CVAE
SCRIPTS_PATH=xxx/fengshen/examples/pretrain_vae
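# Assemble the full training command from the argument groups above.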
export CMD=" \
$SCRIPTS_PATH/pretrain_deep_vae.py \
$TRAINER_ARGS \
$MODEL_CHECKPOINT_ARGS \
$VAE_ARGS \
$DATA_ARGS \
"
# srun python $CMD
# python -m debugpy --listen 5678 --wait-for-client $CMD
python $CMD