Spaces:

fclong
/

summary

Runtime error

App Files Files Community

summary / fengshen /examples /hubert /pretrain_hubert_base.sh

fclong

Upload 396 files

8ebda9e about 2 years ago

raw

history blame

3.5 kB

	#!/bin/bash
	#SBATCH --job-name=pretrain_bart # create a short name for your job
	#SBATCH --nodes=1 # node count
	#SBATCH --ntasks-per-node=8 # number of tasks to run per node
	#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
	#SBATCH --gres=gpu:8 # number of gpus per node
	#SBATCH -o %x-%j.log # output and error log file name (%x = job name, %j = job id)
	#SBATCH -x dgx050

	# Model identity, rendezvous port and batch/ZeRO settings. MODEL_NAME,
	# MICRO_BATCH_SIZE and ZERO_STAGE are interpolated into the DeepSpeed config
	# written below and into the argument groups further down the script.
	MODEL_NAME=hubert-base-ls960
	config_json="./$MODEL_NAME.ds_config.json"
	export MASTER_PORT=29503
	MICRO_BATCH_SIZE=8
	ZERO_STAGE=1

	# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
	#
	# Write the DeepSpeed config file. `<<-` strips leading tabs from the body
	# and from the terminator, so the here-document ends at EOT even though the
	# script is tab-indented; a plain `<<EOT` only matches EOT at column 0 and
	# would otherwise swallow the rest of the script into the JSON file.
	# NOTE(review): the "#flops_profiler" key looks like a way to switch that
	# section off (JSON has no comment syntax) - confirm DeepSpeed ignores it.
	cat <<-EOT > "$config_json"
	{
	"zero_optimization": {
	"stage": ${ZERO_STAGE}
	},
	"fp16": {
	"enabled": true,
	"loss_scale": 0,
	"loss_scale_window": 1000,
	"initial_scale_power": 16,
	"hysteresis": 2,
	"min_loss_scale": 1
	},
	"tensorboard": {
	"enabled": true,
	"output_path": "/data/training_model/fengshen-${MODEL_NAME}/ds-tb-logs",
	"job_name": "${MODEL_NAME}"
	},
	"#flops_profiler": {
	"enabled": true,
	"profile_step": 200,
	"detailed": true,
	"output_file": null
	},
	"steps_per_print": 100,
	"gradient_clipping": 1,
	"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
	"zero_allow_untested_optimizer": false
	}
	EOT

	# Publish the DeepSpeed config location and the torch extensions directory
	# to the environment of every process started from this script.
	PL_DEEPSPEED_CONFIG_PATH=$config_json
	TORCH_EXTENSIONS_DIR=/home/gaoxinyu/torch_extendsions
	export PL_DEEPSPEED_CONFIG_PATH TORCH_EXTENSIONS_DIR

	# Directories handed to the --data and --label_dir flags below.
	LABELS_DIR=/data/common_data/librispeech_tsv/labels
	DATA_DIR=/data/common_data/librispeech_tsv/datas

	# Data-loading flags, one flag per row. They are joined with single spaces
	# into DATA_ARGS so the string can be spliced into the final command line.
	data_flags=(
		--dataloader_workers 2
		--train_batchsize "$MICRO_BATCH_SIZE"
		--val_batchsize 32
		--test_batchsize 8
		--val_datasets_field valid
		--test_datasets_field valid
		--sampler_type random
		--data "${DATA_DIR}"
		--label_dir "${LABELS_DIR}"
		--labels km
		--label_rate 100
		--max_sample_size 250000
		--min_sample_size 32000
		--pad_audio False
		--random_crop True
		--normalize False
	)
	DATA_ARGS="${data_flags[*]}"

	# Model path plus learning-rate, weight-decay, warm-up and loss-weight
	# flags, joined with single spaces into MODEL_ARGS.
	model_flags=(
		--model_path "/data/pretrained_model/$MODEL_NAME/"
		--learning_rate 1e-4
		--weight_decay 1e-2
		--warmup_ratio 0.01
		--pred_masked_weight 1.0
		--loss_weights 10
	)
	MODEL_ARGS="${model_flags[*]}"

	# Checkpointing flags, joined with single spaces into
	# MODEL_CHECKPOINT_ARGS. The --filename template is quoted so its braces
	# reach the training script untouched.
	checkpoint_flags=(
		--monitor train_loss
		--save_top_k 0
		--mode min
		--every_n_train_steps 10000
		--dirpath "/data/training_model/ckpt/fengshen-$MODEL_NAME"
		--filename 'model-{step:02d}-{train_loss:.4f}'
		--every_n_epochs 0
		--save_last
		--not_save_on_train_epoch_end
	)
	MODEL_CHECKPOINT_ARGS="${checkpoint_flags[*]}"

	# Trainer flags, joined with single spaces into TRAINER_ARGS. The strategy
	# name is derived from ZERO_STAGE, and --ckpt_path points at last.ckpt in
	# the checkpoint directory for this model name.
	# NOTE(review): --gpus 2 differs from the 8 GPUs / 8 tasks per node
	# requested in the SBATCH header - confirm which one is intended.
	trainer_flags=(
		--gradient_clip_val 1.0
		--max_epochs 10
		--gpus 2
		--num_nodes 1
		--strategy "deepspeed_stage_${ZERO_STAGE}"
		--log_every_n_steps 100
		--val_check_interval 500
		--limit_val_batches 10
		--accumulate_grad_batches 1
		--precision 16
		--ckpt_path "/data/training_model/ckpt/fengshen-${MODEL_NAME}/last.ckpt"
		--default_root_dir "/data/training_model/fengshen-$MODEL_NAME"
	)
	TRAINER_ARGS="${trainer_flags[*]}"


	# Concatenate every argument group into one whitespace-separated string.
	# It is exported, so it is also visible in the environment of the launched
	# process.
	export options=" $DATA_ARGS $MODEL_ARGS $MODEL_CHECKPOINT_ARGS $TRAINER_ARGS "

	export SCRIPT_PATH=pretrain_hubert.py

	# Debugging is opt-in. Previously the script always ran under
	# `debugpy --wait-for-client`, so an unattended batch job blocked until a
	# debugger attached. Export DEBUGPY_PORT (the port used before was 53005)
	# to get that behaviour back.
	launcher=(python3)
	if [[ -n "${DEBUGPY_PORT:-}" ]]; then
		launcher+=(-m debugpy --listen "localhost:${DEBUGPY_PORT}" --wait-for-client)
	fi

	# $options is a flat flag list, so it is split on whitespace on purpose.
	# Globbing is switched off while splitting so no flag can be expanded
	# against files in the working directory, and no eval is needed: nothing
	# in the flags requires a second round of shell parsing.
	set -f
	# shellcheck disable=SC2206
	train_args=($options)
	set +f

	"${launcher[@]}" "$SCRIPT_PATH" "${train_args[@]}"