#!/bin/bash
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt

# SLURM batch script: activates the "cris" conda environment, launches
# train_angular_verb.py with the cris_r50 config, mirrors its output to
# debug.log, and requeues the job (up to max_restarts times) on SIGUSR1.
#
# Strict mode (set -eu) is intentionally not enabled: ~/.bashrc and the
# conda activation scripts are sourced below and are outside this file's
# control.

readonly project_dir=/home/s1/chaeyunkim/VerbCentric_CY
readonly max_restarts=3

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

# Abort rather than start training from an unexpected working directory.
cd "${project_dir}" || {
  echo "Cannot cd to ${project_dir}" >&2
  exit 1
}

#######################################
# SIGUSR1 handler: requeue this job unless it has already been restarted
# max_restarts times.
# Globals:   SLURM_JOB_ID (read), max_restarts (read)
# Arguments: none
# Outputs:   status message on stdout
# Returns:   does not return; exits 0 after requesting a requeue, 1 once
#            the restart limit has been reached
#######################################
resubmit() {
  local scontext
  local restarts=0

  scontext=$(scontrol show job "${SLURM_JOB_ID}")
  # Pull the numeric value of the "Restarts=" field; when the field is
  # missing (e.g. scontrol failed) the count stays at 0.
  if [[ "${scontext}" =~ Restarts=([0-9]+) ]]; then
    restarts=${BASH_REMATCH[1]}
  fi

  if (( restarts < max_restarts )); then
    echo "Resubmitting job (restart $restarts/$max_restarts)..."
    scontrol requeue "${SLURM_JOB_ID}"
    exit 0
  else
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
    exit 1
  fi
}

# NOTE(review): this handler only runs if something delivers SIGUSR1 to the
# batch shell. No --signal directive is present in this script; presumably
# the signal is sent externally or an option such as --signal=B:USR1@<secs>
# is passed at submission time -- confirm.
trap 'resubmit' SIGUSR1

export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so that `wait` can be interrupted by the
# SIGUSR1 trap. The pipeline lives in a subshell with pipefail enabled so
# the subshell's exit status reflects a python failure rather than tee's.
(
  set -o pipefail
  python -u train_angular_verb.py \
    --config "${project_dir}/config/cris_r50.yaml" 2>&1 \
    | tee debug.log
) &
train_pid=$!

wait "${train_pid}"
train_status=$?

# Propagate the training result so SLURM does not record a crashed run as
# a successful one.
exit "${train_status}"