#!/usr/bin/env bash
#
# Launch a fine-tuning run of main.py through `accelerate launch`, starting
# from the "robotics-diffusion-transformer/rdt-1b" checkpoint and using the
# DeepSpeed config ./configs/zero2.json.
#
# Usage:
#   bash <this-script> <dataset_name> <master_port>
#
# Arguments:
#   $1  Dataset name. Passed to --dataset_name and used as the checkpoint
#       sub-directory: ./checkpoints/<dataset_name>.
#   $2  Port number. Exported as MASTER_PORT and passed to
#       --main_process_port.
#
# Exported variables: NCCL_IB_HCA, NCCL_IB_DISABLE, NCCL_SOCKET_IFNAME,
#   NCCL_DEBUG, NCCL_NVLS_ENABLE, MASTER_PORT, TEXT_ENCODER_NAME,
#   VISION_ENCODER_NAME, OUTPUT_DIR, CFLAGS, LDFLAGS, CUTLASS_PATH,
#   WANDB_PROJECT.
#
# The script is written so it can be either executed or sourced: errors use
# `return ... || exit ...` instead of a bare `exit`, and no `set -e` is
# enabled, so a sourcing interactive shell is never terminated or altered.

# --- Argument validation ----------------------------------------------------
# Without both arguments the launch line below would silently shift flags
# into the wrong positions, so refuse to continue.
if [ "$#" -lt 2 ] || [ -z "$1" ]; then
  printf 'Usage: %s <dataset_name> <master_port>\n' "${BASH_SOURCE[0]:-$0}" >&2
  return 2 2>/dev/null || exit 2
fi

# The port must be a plain non-negative integer.
case "$2" in
  '' | *[!0-9]*)
    printf 'error: master_port must be a number, got: %s\n' "$2" >&2
    return 2 2>/dev/null || exit 2
    ;;
esac

# --- NCCL / networking ------------------------------------------------------
# Host-specific settings. NOTE(review): the HCA list skips mlx5_5 and mlx5_6;
# presumably that matches this machine's layout -- confirm before reuse.
export NCCL_IB_HCA=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=eth1
export NCCL_DEBUG=INFO
export NCCL_NVLS_ENABLE=0
export MASTER_PORT="$2"

# --- Model / output locations -----------------------------------------------
export TEXT_ENCODER_NAME="google/t5-v1_1-xxl"
export VISION_ENCODER_NAME="google/siglip-so400m-patch14-384"
export OUTPUT_DIR="./checkpoints/$1"

# --- Build-related paths ----------------------------------------------------
export CFLAGS="-I/usr/include"
export LDFLAGS="-L/usr/lib/x86_64-linux-gnu"
# NOTE(review): user-specific absolute path; adjust on other machines.
export CUTLASS_PATH="/home/jellyho/cutlass"

# --- Experiment tracking ----------------------------------------------------
export WANDB_PROJECT="robotics_diffusion_transformer"

# --- Checkpoint directory ---------------------------------------------------
# `mkdir -p` also creates ./checkpoints when it is missing; abort if the
# directory cannot be created rather than starting a run that cannot save.
if [ ! -d "$OUTPUT_DIR" ]; then
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    printf "error: could not create folder '%s'\n" "$OUTPUT_DIR" >&2
    return 1 2>/dev/null || exit 1
  fi
  echo "Folder '$OUTPUT_DIR' created"
else
  echo "Folder '$OUTPUT_DIR' already exists"
fi

# --- Training launch --------------------------------------------------------
# All expansions are quoted so values containing spaces or glob characters
# reach accelerate / main.py as single arguments.
accelerate launch \
  --main_process_port "$2" \
  --num_processes 2 \
  --num_machines 1 \
  --mixed_precision bf16 \
  main.py \
  --deepspeed="./configs/zero2.json" \
  --pretrained_model_name_or_path="robotics-diffusion-transformer/rdt-1b" \
  --pretrained_text_encoder_name_or_path="$TEXT_ENCODER_NAME" \
  --pretrained_vision_encoder_name_or_path="$VISION_ENCODER_NAME" \
  --output_dir="$OUTPUT_DIR" \
  --train_batch_size=8 \
  --sample_batch_size=8 \
  --max_train_steps=50000 \
  --checkpointing_period=5000 \
  --sample_period=1000 \
  --checkpoints_total_limit=10 \
  --lr_scheduler="constant" \
  --learning_rate=1e-4 \
  --mixed_precision="bf16" \
  --dataloader_num_workers=16 \
  --image_aug \
  --dataset_type="finetune" \
  --gradient_accumulation_steps 1 \
  --report_to=wandb \
  --load_from_hdf5 \
  --dataset_name "$1" \
  --precomp_lang_embed