#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
# Job time limit, format d-hh:mm:ss.
#SBATCH --time=0-12:00:00
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver1_OP2_Verbonly_p10_m20_t005_b64.txt
#
# Slurm batch job that launches train_angular_verb.py on 4 GPUs.
#
# Usage: sbatch <this script> <OUTPUT_DIR> <BATCH_SIZE> <EXP_NAME>
#   OUTPUT_DIR  - passed to the trainer as TRAIN.output_folder
#   BATCH_SIZE  - passed to the trainer as TRAIN.batch_size
#   EXP_NAME    - passed to the trainer as TRAIN.exp_name; the console log is
#                 written to ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log
#
# Exit status: the training pipeline's exit status, 1 on setup errors or when
# the restart limit is exceeded, 0 after a successful requeue request.

# Make a pipeline fail if any stage fails, so the trainer's exit status is not
# masked by `tee`. (`set -eu` is deliberately not enabled: ~/.bashrc and the
# conda activation scripts are sourced below and are not under our control.)
set -o pipefail

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

# The config path and the training script below are relative to this directory.
cd /home/s1/chaeyunkim/VerbCentric_CY || {
  echo "Cannot cd to /home/s1/chaeyunkim/VerbCentric_CY" >&2
  exit 1
}

if [[ "$#" -ne 3 ]]; then
  echo "Usage: $0 <OUTPUT_DIR> <BATCH_SIZE> <EXP_NAME>" >&2
  exit 1
fi

# Trap SIGUSR1 to handle job requeueing.
# NOTE(review): no `#SBATCH --signal=B:USR1@<seconds>` directive is present in
# this file; unless it is supplied on the sbatch command line, Slurm will not
# deliver SIGUSR1 to this shell and the handler never runs -- confirm.
readonly max_restarts=3

#######################################
# Requeue the current job unless it has already been restarted max_restarts
# times (restart count is read from `scontrol show job`).
# Globals:   SLURM_JOB_ID (read), max_restarts (read)
# Outputs:   status message on stdout
# Returns:   does not return; exits 0 after requesting a requeue, 1 when the
#            restart limit has been reached
#######################################
resubmit() {
  local scontext restarts
  scontext=$(scontrol show job "${SLURM_JOB_ID}")
  restarts=$(echo "${scontext}" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
  # Treat a missing/unparsable counter as "never restarted".
  restarts=${restarts:-0}
  if [[ "${restarts}" -lt "${max_restarts}" ]]; then
    echo "Resubmitting job (restart $restarts/$max_restarts)..."
    scontrol requeue "${SLURM_JOB_ID}"
    exit 0
  else
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
    exit 1
  fi
}
trap 'resubmit' SIGUSR1

# Positional arguments.
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the directory if it does not exist
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Directory $OUTPUT_DIR does not exist. Creating it..."
  mkdir -p "$OUTPUT_DIR"
fi

# `tee` opens its log file as soon as the pipeline starts, so the experiment
# sub-directory has to exist before the trainer is launched.
log_dir="${OUTPUT_DIR}/${EXP_NAME}"
mkdir -p "${log_dir}" || {
  echo "Cannot create log directory ${log_dir}" >&2
  exit 1
}

# Construct the argument list (an array, so values are passed through verbatim
# without word-splitting or globbing).
python_args=(
  --config config/cris_verbonly_b64_nopos.yaml
  --opts
  TRAIN.metric_mode hardpos_only_op2
  TRAIN.metric_loss_weight 0.1
  TRAIN.hn_prob 0.0
  TRAIN.resume latest
  TRAIN.batch_size "${BATCH_SIZE}"
  TRAIN.margin_value 20
  TRAIN.temperature 0.05
  TRAIN.exp_name "${EXP_NAME}"
  TRAIN.output_folder "${OUTPUT_DIR}"
  Distributed.dist_url tcp://localhost:8845
)

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py ${python_args[*]}"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run the Python training script in the background and `wait` on it: bash only
# runs a trap handler promptly while blocked in `wait`, not while a foreground
# command is running. The pipeline is wrapped in a group so that $! refers to
# a process whose exit status is the (pipefail) status of the whole pipeline.
{
  python -u train_angular_verb.py "${python_args[@]}" 2>&1 \
    | tee "${log_dir}/train_rev_version1.log"
} &
train_pid=$!

wait "${train_pid}"
train_status=$?

# Propagate the trainer's result so Slurm records failed runs as failed.
exit "${train_status}"