#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00 # d-hh:mm:ss, job time limit
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver1_OP2_Verbonly_p10_m20_t005_b64.txt
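# Requested resources (from the directives above): 1 node, 4 GPUs, 12 CPU cores,
# 80 GB RAM, 12 h wall time; stdout/stderr are collected in the --output file.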
source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris
cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
exit 1
fi
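# Example submission (script name, path, and values are illustrative only):
#   sbatch <this_script>.sh /path/to/output_dir 64 CRIS_AML_pos10_m20_t005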
# Trap SIGUSR1 to handle job requeueing
max_restarts=3
function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1
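# Note: SLURM delivers SIGUSR1 only if it is requested at submission time
# (e.g. sbatch --signal=B:USR1@60) or sent manually (scancel --signal=USR1).
# No --signal directive is set in the #SBATCH header above, so this trap
# assumes the signal is delivered externally.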
# Positional arguments: output directory, batch size, and experiment name
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3
# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"
# Create the output directory and the per-experiment subdirectory (needed by tee below)
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
fi
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"
# Construct the argument list
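# --opts passes key/value pairs that override entries in the YAML config at
# runtime (metric mode, loss weight, margin, temperature, batch size, output
# paths, and the distributed init URL); the keys are assumed to match the
# config schema used by train_angular_verb.py.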
python_args="--config config/cris_verbonly_b64_nopos.yaml \
    --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 \
    TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} \
    TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} \
    TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"
# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"
# Set NCCL environment variables
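# NCCL_P2P_DISABLE=1 turns off direct GPU peer-to-peer transfers,
# NCCL_DEBUG=INFO enables verbose NCCL logging, and
# NCCL_SOCKET_IFNAME=^docker0,lo excludes the docker0 and loopback interfaces
# from NCCL's socket selection. CUDA_VISIBLE_DEVICES exposes all four
# allocated GPUs to the training process.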
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Run the Python training script
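# The training job runs in the background and the script waits on it so that
# bash can still handle SIGUSR1 and invoke the resubmit trap while training is
# running (traps are not processed during a foreground command, but they do
# interrupt wait).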
python -u train_angular_verb.py $python_args 2>&1 | tee "${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log" &
wait
exit 0