File size: 2,204 Bytes
599450c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
# SLURM batch script: launches CRIS angular-verb training (train_angular_verb.py)
# on 4 GPUs with hard-positive-only metric learning.
# NOTE(review): the job-name says m20, but the training args below pass
# TRAIN.margin_value 15 and the --output filename says m15 — confirm which
# margin this run is actually meant to use.
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=logs/CRIS_ACLver2_notgt_p10_m15_t005_b64.txt

# Load the user shell environment, then activate the conda env for training.
source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris 

# Work from the repository root so relative paths (config/, logs/) resolve.
cd /home/s1/chaeyunkim/VerbCentric_CY

# Require exactly three positional arguments; abort early otherwise.
# The usage message goes to stderr so it is not mixed into the tee'd
# training log on stdout.
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Trap SIGUSR1 (sent by Slurm shortly before preemption/timeout) and requeue
# the job, up to max_restarts times.
max_restarts=3

#######################################
# Requeue the current Slurm job unless the restart limit is reached.
# Globals:   SLURM_JOB_ID (read), max_restarts (read)
# Outputs:   progress message to stdout; limit-exceeded message to stderr
# Returns:   exits 0 after requeueing, exits 1 when the limit is exceeded
#######################################
function resubmit() {
    local scontext restarts
    scontext=$(scontrol show job "${SLURM_JOB_ID}")
    # Parse the Restarts counter; default to 0 when the field is absent so
    # the numeric comparison below never sees an empty string.
    restarts=$(grep -o 'Restarts=[0-9]*' <<<"${scontext}" | cut -d= -f2)
    restarts=${restarts:-0}
    if [[ ${restarts} -lt ${max_restarts} ]]; then
        echo "Resubmitting job (restart ${restarts}/${max_restarts})..."
        scontrol requeue "${SLURM_JOB_ID}"
        exit 0
    else
        echo "Job has exceeded the maximum restart limit (${max_restarts} restarts)." >&2
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Capture the three positional parameters under descriptive names.
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Record the run configuration in the Slurm log for later debugging.
printf 'OUTPUT_DIR: %s\n' "$OUTPUT_DIR"
printf 'BATCH_SIZE: %s\n' "$BATCH_SIZE"
printf 'EXP_NAME: %s\n' "$EXP_NAME"

# Guard clause: ensure the output directory exists before training starts.
[[ -d "$OUTPUT_DIR" ]] || {
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
}

# Build the training command line as an array so each argument stays a single
# word even if OUTPUT_DIR/EXP_NAME ever contain spaces (the previous flat
# string relied on unquoted word-splitting).
python_args=(
    --config config/cris_verbonly_b64_nopos.yaml
    --opts TRAIN.metric_mode hardpos_only_fin
    TRAIN.metric_loss_weight 0.1
    TRAIN.hn_prob 0.0
    TRAIN.resume latest
    TRAIN.batch_size "${BATCH_SIZE}"
    TRAIN.margin_value 15
    TRAIN.temperature 0.05
    TRAIN.exp_name "${EXP_NAME}"
    TRAIN.output_folder "${OUTPUT_DIR}"
    Distributed.dist_url tcp://localhost:7023
)

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py ${python_args[*]}"

# NCCL settings: disable peer-to-peer GPU transfers, enable verbose logging,
# exclude docker/loopback interfaces from rendezvous, and expose the four
# requested GPUs.
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Fix: the tee target directory was never created before, so tee failed and
# the training log was lost. Create it explicitly.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# pipefail makes the backgrounded pipeline report python's failure, not tee's
# success, when we wait on it below.
set -o pipefail

# Run in the background and wait: bash delivers the SIGUSR1 trap while
# blocked in 'wait', but not while a foreground command is running.
python -u train_angular_verb.py "${python_args[@]}" 2>&1 | tee "${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log" &
train_pid=$!

wait "${train_pid}"
status=$?
# Propagate the training exit status instead of the previous unconditional
# 'exit 0', which made Slurm mark crashed runs as COMPLETED.
exit "${status}"