File size: 1,374 Bytes
da2e2ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# Settings consumed by the `ngc batch run` submission below.
#
# Alternative values left here for reference (all inactive):
#   agent="vadv2_4096_pdm_c512"
#   bs=8
#   lr=0.0001
#   agent="vadv2_8192_pdm_vit_mult0.1_progress_lw2"
#   bs=8
#   lr=0.0002
#   cache="navtrain_vadv2_4f_cache"

# Number of job replicas; also passed to mpirun as -np and to the trainer as
# trainer.params.num_nodes.
replicas='8'

# Cache directory name, appended to ${NAVSIM_EXP_ROOT}/ to form cache_path.
cache='navtrain_vadv2+map_img256x1024_cache'

# Agent config name; also used to derive experiment_name and
# agent.config.ckpt_path as "<agent>_ckpt".
agent='vadv2_8192_pdm_vov_mult0.1_progress_lw2_img1024'

# Value passed as agent.lr.
lr='0.00005'

# Value passed as dataloader.params.batch_size.
bs='2'
# Submit the multi-node training job through the NGC CLI.
#
# Inputs (shell variables that must be set before this point):
#   agent     agent config name; also forms "<agent>_ckpt" for experiment_name
#             and agent.config.ckpt_path
#   bs        value for dataloader.params.batch_size
#   lr        value for agent.lr
#   cache     directory name appended to ${NAVSIM_EXP_ROOT}/ for cache_path
#   replicas  value for --replicas, mpirun -np and trainer.params.num_nodes
#
# Quoting notes for the --commandline string:
#   * $agent, $bs, $lr, $cache and $replicas are expanded by THIS shell at
#     submission time.
#   * Variables written as \${...} (NGC_JOB_ID, NGC_ARRAY_SIZE,
#     NGC_ARRAY_INDEX, NAVSIM_DEVKIT_ROOT, NAVSIM_EXP_ROOT) are escaped so they
#     are passed through literally and expanded inside the job instead.
#   * Backslash-newline pairs inside the double-quoted string are removed by
#     this shell, so the python invocation reaches the job as one line.
#   * No comments may be placed inside the string; they would become part of
#     the submitted command.
#
# The setup steps (git pull, building ops_dcnv3) are chained with ';', so a
# failure in one of them does not stop the training command from running.
# NOTE(review): the trailing `sleep 0.1h` (6 minutes with GNU sleep) runs after
# mpirun returns; presumably it keeps the job alive briefly — confirm intent.

# Abort before submitting if any interpolated setting is missing or empty.
: "${agent:?agent must be set}"
: "${bs:?bs must be set}"
: "${lr:?lr must be set}"
: "${cache:?cache must be set}"
: "${replicas:?replicas must be set}"

ngc batch run \
  -in dgx1v.32g.8.norm \
  --ace nv-us-west-2 \
  --label _wl___computer_vision \
  -n ml-model.lzx_train._wl___computer_vision \
  --result /result \
  -i nvcr.io/nvidian/swaiinf/lzx-navsim \
  --workspace q-2TlPKESo62ktTxOc8rYg:/zhenxinl_nuplan \
  --port 6007 \
  --array-type "MPI" \
  --replicas "$replicas" \
  --total-runtime "4D" \
  --commandline "
mpirun --allow-run-as-root -np $replicas -npernode 1 bash -c '
git pull; cd navsim/agents/backbones/ops_dcnv3; bash ./make.sh; cd /navsim_ours;
MASTER_PORT=29500 MASTER_ADDR=launcher-svc-\${NGC_JOB_ID} WORLD_SIZE=\${NGC_ARRAY_SIZE} NODE_RANK=\${NGC_ARRAY_INDEX} \
python \${NAVSIM_DEVKIT_ROOT}/navsim/planning/script/run_training.py \
agent=$agent \
trainer.params.num_nodes=$replicas \
dataloader.params.batch_size=$bs \
experiment_name=${agent}_ckpt \
cache_path=\${NAVSIM_EXP_ROOT}/$cache \
agent.config.ckpt_path=${agent}_ckpt \
agent.lr=$lr \
split=trainval \
scene_filter=navtrain;
'
sleep 0.1h;
"