#!/bin/bash
#
# Runs rollout_to_netcdf.py with the fuxi_6h prediction config on 8 nodes with
# 4 GPUs each: mpiexec starts one torchrun launcher per node, each launcher
# starts 4 worker processes, and all launchers rendezvous at the first host
# listed in $PBS_NODEFILE.
#
# Required environment:
#   PBS_NODEFILE  path to the file listing the hosts allocated to this job
#
# NOTE(review): no "#PBS" resource directives appear in this script, so the
# node/GPU request presumably has to be supplied at submission time — confirm.
#
# Strict mode (set -euo pipefail) is intentionally not enabled globally: the
# module/conda shell functions run third-party activation code that may not
# tolerate -e/-u. Critical steps are checked explicitly instead.

# Print a fatal error on stderr and abort the job.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Job geometry, defined once so the log lines and the launch command agree.
readonly num_nodes=8
readonly gpus_per_node=4
readonly total_gpus=$((num_nodes * gpus_per_node))

# Application entry point and its configuration file.
readonly rollout_script=/glade/u/home/ksha/miles-credit/applications/rollout_to_netcdf.py
readonly rollout_config=/glade/work/ksha/CREDIT_runs/fuxi_6h/model_predict.yml

# --- Software environment ---------------------------------------------------
module purge
module load nvhpc cuda cray-mpich conda
conda activate /glade/work/ksha/miniconda3/envs/credit \
  || die "could not activate conda environment /glade/work/ksha/miniconda3/envs/credit"

# --- Rendezvous host --------------------------------------------------------
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; run this script inside a PBS job}"
[[ -r "$PBS_NODEFILE" ]] || die "cannot read node file: $PBS_NODEFILE"

# First whitespace-delimited entry of the node file acts as the head node.
head_node=$(awk 'NF { print $1; exit }' "$PBS_NODEFILE")
[[ -n "$head_node" ]] || die "node file is empty: $PBS_NODEFILE"

# `hostname -i` may print several addresses; keep the first field of line 1.
head_node_ip=$(ssh "$head_node" hostname -i | awk 'NR == 1 { print $1 }')
[[ -n "$head_node_ip" ]] \
  || die "could not determine an IP address for head node '$head_node'"

# --- Runtime environment ----------------------------------------------------
# CUDA device indices are local to a node, so expose indices
# 0..gpus_per_node-1 rather than one index per GPU in the whole job.
device_ids=()
for ((gpu = 0; gpu < gpus_per_node; gpu++)); do
  device_ids+=("$gpu")
done
CUDA_VISIBLE_DEVICES=$(IFS=,; printf '%s' "${device_ids[*]}")
export CUDA_VISIBLE_DEVICES

export LSCRATCH=/glade/derecho/scratch/ksha/
export LOGLEVEL=INFO

# NCCL installation that ships the OFI plugin; its libraries are searched first.
export NCCL_SOCKET_IFNAME=hsn
export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
# Append the previous value only when it is non-empty: a trailing ':' would add
# an empty entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="${NCCL_HOME}/lib:${NCCL_HOME}/plugin/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Communication tuning for NCCL, MPICH and the CXI libfabric provider.
# NOTE(review): values are carried over unchanged — verify against site docs.
export NCCL_NCHANNELS_PER_NET_PEER=4
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_OFI_NIC_POLICY=GPU
export MPICH_RDMA_ENABLED_CUDA=1
export NCCL_DISABLE_IB=1
export NCCL_CROSS_NIC=1
export FI_CXI_DISABLE_HOST_REGISTER=1
export FI_CXI_OPTIMIZED_MRS=false

# --- Job summary ------------------------------------------------------------
echo "Number of nodes: ${num_nodes}"
echo "Number of GPUs per node: ${gpus_per_node}"
echo "Total number of GPUs: ${total_gpus}"

# --- Launch -----------------------------------------------------------------
# The command is built as an array so every argument stays a single word.
# The script's exit status is that of mpiexec, the last command executed.
launch_cmd=(
  mpiexec -n "$num_nodes" --ppn 1 --cpu-bind none
  torchrun
  --nnodes="$num_nodes"
  --nproc-per-node="$gpus_per_node"
  --rdzv-backend=c10d
  --rdzv-endpoint="$head_node_ip"
  "$rollout_script" -c "$rollout_config"
)
"${launch_cmd[@]}"