#!/usr/bin/env bash
#
# Submit a multi-node NAVSIM agent training run as an NGC batch job.
#
# Usage: edit the "Active configuration" values below, then run this script
# on a machine where the `ngc` CLI is available on PATH.
#
# What the submitted job does (see job_cmd below):
#   1. mpirun starts $replicas bash processes, one per node (-npernode 1).
#   2. Each process runs `git pull`, runs make.sh inside
#      navsim/agents/backbones/ops_dcnv3, changes to /navsim_ours and launches
#      run_training.py with the agent, batch size, learning rate and cache
#      selected here.
#   3. Once mpirun returns, `sleep 0.1h` runs before the command line ends.
#
# Expansion rules inside job_cmd:
#   - $name      is expanded here, at submission time.
#   - \${NAME}   is escaped, so it reaches the job verbatim and is expanded
#                by the bash started through mpirun.

# ---------------------------------------------------------------------------
# Earlier presets, kept for reference.
# ---------------------------------------------------------------------------
# agent="vadv2_4096_pdm_c512"
# bs=8
# lr=0.0001

# agent="vadv2_8192_pdm_vit_mult0.1_progress_lw2"
# bs=8
# lr=0.0002
# cache="navtrain_vadv2_4f_cache"

# ---------------------------------------------------------------------------
# Active configuration.
# ---------------------------------------------------------------------------
agent="vadv2_8192_pdm_vov_mult0.1_progress_lw2_img1024"
bs=2
lr=0.00005
cache="navtrain_vadv2+map_img256x1024_cache"
replicas=8

# Not every preset above defines every value (the first one has no cache), so
# stop before submitting a job whose arguments would silently expand to "".
: "${agent:?agent must be set}"
: "${bs:?bs must be set}"
: "${lr:?lr must be set}"
: "${cache:?cache must be set}"
: "${replicas:?replicas must be set}"

# Used both as the experiment name and as agent.config.ckpt_path.
exp_name="${agent}_ckpt"

# Command line executed by the job. The steps are separated by ';', so a
# failing step does not prevent the following ones from running.
# MASTER_PORT, MASTER_ADDR, WORLD_SIZE and NODE_RANK are set only in the
# environment of the python process (assignment-prefix form); presumably they
# are read by the trainer's distributed backend - confirm against the trainer.
job_cmd="
mpirun --allow-run-as-root -np $replicas -npernode 1 bash -c '
  git pull;
  cd navsim/agents/backbones/ops_dcnv3;
  bash ./make.sh;
  cd /navsim_ours;
  MASTER_PORT=29500 \
  MASTER_ADDR=launcher-svc-\${NGC_JOB_ID} \
  WORLD_SIZE=\${NGC_ARRAY_SIZE} \
  NODE_RANK=\${NGC_ARRAY_INDEX} \
  python \${NAVSIM_DEVKIT_ROOT}/navsim/planning/script/run_training.py \
    agent=$agent \
    trainer.params.num_nodes=$replicas \
    dataloader.params.batch_size=$bs \
    experiment_name=$exp_name \
    cache_path=\${NAVSIM_EXP_ROOT}/$cache \
    agent.config.ckpt_path=$exp_name \
    agent.lr=$lr \
    split=trainval \
    scene_filter=navtrain;
'
sleep 0.1h;
"

ngc batch run \
  -in dgx1v.32g.8.norm \
  --ace nv-us-west-2 \
  --label _wl___computer_vision \
  -n ml-model.lzx_train._wl___computer_vision \
  --result /result \
  -i nvcr.io/nvidian/swaiinf/lzx-navsim \
  --workspace q-2TlPKESo62ktTxOc8rYg:/zhenxinl_nuplan \
  --port 6007 \
  --array-type "MPI" \
  --replicas "$replicas" \
  --total-runtime "4D" \
  --commandline "$job_cmd"