#!/bin/bash

#SBATCH --job-name=deep_vae_pretrain
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=32           # CPU cores per task (matches --num_workers below)
#SBATCH --gres=gpu:1                 # number of gpus
#SBATCH -o xxx/outputs/deep_vae/logs/slurm/%x-%j.log
#SBATCH -e xxx/outputs/deep_vae/logs/slurm/%x-%j.err
# SBATCH --requeue
# SBATCH --qos=preemptive
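# (the two directives above are disabled by the space after '#'; drop the space to re-enable them)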

set -x -e

ulimit -s unlimited
echo "START TIME: $(date)"

export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
# export MASTER_ADDR=127.0.0.1
export MASTER_PORT=$((RANDOM % 10000 + 50000))
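# The port above is drawn from [50000, 59999] so concurrent jobs sharing a node are
# unlikely to collide (assumption: nothing else reserves that range).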

MICRO_BATCH_SIZE=64
ZERO_STAGE=0

ROOT_PATH=xxxx
config_json=${ROOT_PATH}/job_out/ds_config.json

# DeepSpeed derives gradient_accumulation_steps (GAS) from the global batch size (GBS) via set_train_batch_size()
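# Worked example with hypothetical numbers: a global batch size of 256 with a micro
# batch of 64 on 1 GPU gives GAS = 256 / (64 * 1) = 4.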
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE},
  "steps_per_print": 100,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE,
    "contiguous_gradients": false,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 50000000,
    "allgather_bucket_size": 500000000
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-5,
      "betas": [
        0.9,
        0.95
      ],
      "eps": 1e-8,
      "weight_decay": 1e-2
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params":{
      "warmup_min_lr": 5e-6,
      "warmup_max_lr": 1e-5
    }
  },
  "zero_allow_untested_optimizer": false,
  "fp16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "activation_checkpointing": {
    "partition_activations": false,
    "contiguous_memory_optimization": false
  },
  "wall_clock_breakdown": false
}
EOT
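# The export below lets Lightning's DeepSpeed strategy pick up the config file written above;
# TORCH_EXTENSIONS_DIR points DeepSpeed's JIT-compiled op build cache at a writable location.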
export PL_DEEPSPEED_CONFIG_PATH=$config_json
export TORCH_EXTENSIONS_DIR=~/tmp

# NOTE: both the encoder and the decoder use the same GPT-2 model
GPT2_MODEL_PATH=xxx
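# The beta_* flags below presumably implement cyclical annealing of the KL weight:
# beta ramps from 1e-5 up to 1.0, repeated over 40 cycles (inferred from the flag names).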
VAE_ARGS="
    --gpt2_model_path $GPT2_MODEL_PATH \
    --latent_dim 32 \
    --beta_kl_constraints_start 1e-5 \
    --beta_kl_constraints_stop 1. \
    --beta_n_cycles 40 \
"


CHECKPOINT_SAVE_PATH=${ROOT_PATH}/checkpoints
MODEL_CHECKPOINT_ARGS="\
        --monitor val_recon_loss \
        --save_top_k 1 \
        --mode min \
        --every_n_train_steps 1000 \
        --save_weights_only True \
        --dirpath $CHECKPOINT_SAVE_PATH \
        --filename checkpoint-{epoch}-{step}-filenum_20_dim_32_beta_1e-5_1_zh_finance \
        "

TRAINER_ARGS="
    --max_epochs 40 \
    --gpus 1 \
    --num_nodes 1 \
    --precision 16 \
    --val_check_interval 1000 \
    --learning_rate 5e-5 \
    --warmup_steps 10000 \
    --weight_decay 0.01 \
    --default_root_dir ${ROOT_PATH} \
    --log_every_n_steps 50 \
    --strategy deepspeed_stage_2 \
"
# --strategy deepspeed_stage_2 \

# NOTE: we reuse the WuDao Optimus dataset instead of recreating a DeepVAE-specific dataset
DATA_ARGS="
    --train_batchsize $MICRO_BATCH_SIZE \
    --eval_batchsize $MICRO_BATCH_SIZE \
    --test_batchsize $MICRO_BATCH_SIZE \
    --num_workers 32 \
    --ds_name zh_finance 
"
# available --ds_name values: wudao_tdvae, ner_re_data, zh_finance
# --CVAE    (optional flag; presumably enables the conditional-VAE variant)
SCRIPTS_PATH=xxx/fengshen/examples/pretrain_vae

export CMD=" \
    $SCRIPTS_PATH/pretrain_deep_vae.py \
    $TRAINER_ARGS \
    $MODEL_CHECKPOINT_ARGS \
    $VAE_ARGS \
    $DATA_ARGS \
    "
# srun python $CMD
# python -m debugpy --listen 5678 --wait-for-client $CMD
python $CMD
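# Submit with e.g. `sbatch <this_script>.sh`; Slurm writes stdout/stderr to the log
# paths configured in the #SBATCH header above.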