File size: 1,371 Bytes
da2e2ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Submits a multi-node MPI array job through `ngc batch run` that resumes
# training with navsim's run_training.py from an existing checkpoint.
#
# Requires the `ngc` CLI on PATH. NAVSIM_DEVKIT_ROOT, NAVSIM_EXP_ROOT and the
# NGC_* variables are NOT read here: they are escaped below and resolved by
# the shell that runs inside the job.

# Abort on a misspelled/unset variable rather than submitting a job with an
# empty field baked into its command line.
set -u

# --- Experiment settings ----------------------------------------------------
agent="hydra_offset"           # passed as the agent=... override
bs=8                           # dataloader.params.batch_size
lr=0.0002                      # agent.lr
cache=null                     # cache_path (the literal word "null" is passed through)
resume="epoch09.ckpt"          # checkpoint file name, looked up under $dir
config="competition_training"  # value for --config-name
epoch=20                       # trainer.params.max_epochs
replicas=8                     # used for --replicas, mpirun -np and trainer.params.num_nodes

# Experiment directory name. It is used as experiment_name, as
# agent.config.ckpt_path and as the folder the resume checkpoint is read from,
# so the "fixedpading" spelling is kept exactly as-is to keep pointing at the
# same path.
dir="${agent}_vov_fixedpading_bs${bs}x${replicas}_ckpt"

# --- Command executed inside the job ----------------------------------------
# Quoting notes:
#   * $var    is expanded now, by this script.
#   * \${VAR} is passed through as ${VAR} and expanded where the job runs.
#   * Every line of the python invocation ends in a backslash so this shell
#     joins them into ONE line. Without that, the overrides only stay attached
#     to `python` if the launcher happens to flatten newlines; otherwise each
#     override line would be executed as a separate command.
#   * Steps are separated by ';', so a failing `git pull` or `pip install`
#     does not stop the training command from running.
commandline="
    mpirun --allow-run-as-root -np $replicas -npernode 1 bash -c '
    git pull;
    pip install --upgrade diffusers[torch];
    MASTER_PORT=29500 MASTER_ADDR=launcher-svc-\${NGC_JOB_ID} WORLD_SIZE=\${NGC_ARRAY_SIZE} NODE_RANK=\${NGC_ARRAY_INDEX} \
        python \${NAVSIM_DEVKIT_ROOT}/navsim/planning/script/run_training.py \
            --config-name $config \
            agent=$agent \
            +resume_ckpt_path=\${NAVSIM_EXP_ROOT}/$dir/$resume \
            trainer.params.num_nodes=$replicas \
            trainer.params.max_epochs=$epoch \
            ~trainer.params.strategy \
            dataloader.params.batch_size=$bs \
            experiment_name=$dir \
            cache_path=$cache \
            agent.config.ckpt_path=$dir \
            agent.lr=$lr \
            split=trainval \
            scene_filter=navtrain;
    '
    "

# --- Job submission ---------------------------------------------------------
# Kept as the last command so the script's exit status is that of `ngc`.
ngc batch run \
  -in dgx1v.32g.8.norm \
  --ace nv-us-west-2 \
  --label _wl___computer_vision \
  -n ml-model.lkl_train._wl___computer_vision \
  --result /result \
  -i nvcr.io/nvidian/swaiinf/lzx-navsim \
  --workspace q-2TlPKESo62ktTxOc8rYg:/zhenxinl_nuplan \
  --port 6007 \
  --array-type "MPI" \
  --replicas "$replicas" \
  --total-runtime "4D" \
  --commandline "$commandline"