# agent="vadv2_4096_pdm_c512"
# bs=8
# lr=0.0001
# agent="vadv2_8192_pdm_vit_mult0.1_progress_lw2"
# bs=8
# lr=0.0002
# cache="navtrain_vadv2_4f_cache"
# Submit a multi-node training run with `ngc batch run`.
#
# The settings below are expanded by THIS shell before submission. References
# written as \${...} inside the command string are escaped on purpose so that
# they are expanded later, by the shell running inside the job.
agent="vadv2_8192_pdm_vov_mult0.1_progress_lw2_img1024"  # agent=...; also prefixes experiment_name and ckpt_path
bs=2                                                     # dataloader.params.batch_size
lr=0.00005                                               # agent.lr
cache="navtrain_vadv2+map_img256x1024_cache"             # appended to ${NAVSIM_EXP_ROOT}/ to form cache_path
replicas=8                                               # --replicas, mpirun -np and trainer.params.num_nodes

# Command string handed to --commandline.
#  * mpirun starts one `bash -c` per node (-np $replicas, -npernode 1).
#  * Each of those shells runs `git pull`, builds ops_dcnv3 via make.sh,
#    changes to /navsim_ours and then starts run_training.py. The setup steps
#    are separated by `;`, so a failing step does not stop the following ones.
#  * Backslash-newline inside the double quotes is removed by this shell, so
#    the environment assignments, `python` and all of its overrides reach the
#    job as one single command line.
#  * `sleep 0.1h` runs once after mpirun returns (presumably to keep the job
#    around briefly after training ends -- confirm before removing).
# The values above must not contain single quotes: they are pasted inside the
# single-quoted `bash -c` script.
commandline="
mpirun --allow-run-as-root -np $replicas -npernode 1 bash -c '
git pull; cd navsim/agents/backbones/ops_dcnv3; bash ./make.sh; cd /navsim_ours;
MASTER_PORT=29500 MASTER_ADDR=launcher-svc-\${NGC_JOB_ID} WORLD_SIZE=\${NGC_ARRAY_SIZE} NODE_RANK=\${NGC_ARRAY_INDEX} \
python \${NAVSIM_DEVKIT_ROOT}/navsim/planning/script/run_training.py \
agent=$agent \
trainer.params.num_nodes=$replicas \
dataloader.params.batch_size=$bs \
experiment_name=${agent}_ckpt \
cache_path=\${NAVSIM_EXP_ROOT}/$cache \
agent.config.ckpt_path=${agent}_ckpt \
agent.lr=$lr \
split=trainval \
scene_filter=navtrain;
'
sleep 0.1h;
"

# Submit the job; the exit status of the script is the exit status of ngc.
ngc batch run \
  -in dgx1v.32g.8.norm \
  --ace nv-us-west-2 \
  --label _wl___computer_vision \
  -n ml-model.lzx_train._wl___computer_vision \
  --result /result \
  -i nvcr.io/nvidian/swaiinf/lzx-navsim \
  --workspace q-2TlPKESo62ktTxOc8rYg:/zhenxinl_nuplan \
  --port 6007 \
  --array-type "MPI" \
  --replicas "$replicas" \
  --total-runtime "4D" \
  --commandline "$commandline"