Upload folder using huggingface_hub
Browse files- scripts/test_verb.sh +53 -0
- scripts/train_notarget.sh +81 -0
- scripts/train_repro.sh +41 -0
- scripts/train_tmp.sh +53 -0
- scripts/train_tmp_seunghoon.sh +57 -0
- scripts/train_verb.sh +70 -0
- scripts/train_verb_vip.sh +71 -0
scripts/test_verb.sh
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=EVAL
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=50G
#SBATCH --cpus-per-task=8       # CPU cores per task (original comment was mojibake)
#SBATCH --output=./log_eval/ACLrevised1_VO_hp10_m20_tmp005_b64.txt

# Evaluate a trained CRIS experiment on RefCOCOg (umd split):
# runs oIoU (test_oiou.py) and mIoU (test.py) on both the test and val LMDBs.

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

# Guard the cd: running the eval scripts from the wrong directory would
# silently use wrong relative paths.
cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1
export NCCL_P2P_DISABLE=1

# constants
CONFIG=config/cris_r50.yaml
VAL_LMDB=datasets/lmdb/refcocog_u/val.lmdb
TEST_LMDB=datasets/lmdb/refcocog_u/test.lmdb
# variables
EXP_NAME=ACLrevised1_VO_hp10_m20_tmp005_b64
OPT_DIR=exp/refcocog_u/hardpos_loss_abl


# TEST
# test oIoU
# NOTE: continuations are written "VALUE \" (with a space) — the original
# "$CONFIG\" form concatenates with the next line when it has no indent.
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"
# test mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"

# VAL
# val oIoU
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
# val mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
scripts/train_notarget.sh
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=logs/CRIS_ACLver2_notgt_p10_m15_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (no-target variant).
# Usage: sbatch train_notarget.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml \
    --opts TRAIN.metric_mode hardpos_only_fin \
    TRAIN.metric_loss_weight 0.1 \
    TRAIN.hn_prob 0.0 \
    TRAIN.resume latest \
    TRAIN.batch_size ${BATCH_SIZE} \
    TRAIN.margin_value 15 \
    TRAIN.temperature 0.05 \
    TRAIN.exp_name ${EXP_NAME} \
    TRAIN.output_folder ${OUTPUT_DIR} \
    Distributed.dist_url tcp://localhost:7023"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0
scripts/train_repro.sh
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt

# Reproduction run: train CRIS with the baseline r50 config on 4 GPUs,
# requeueing the SLURM job on SIGUSR1 up to a fixed number of restarts.

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY


# Requeue handler: invoked via trap when SLURM sends SIGUSR1.
max_restarts=3

resubmit() {
    job_info=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${job_info} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if (( restarts < max_restarts )); then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    fi
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
    exit 1
}
trap 'resubmit' SIGUSR1

export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u train_angular_verb.py --config /home/s1/chaeyunkim/VerbCentric_CY/config/cris_r50.yaml 2>&1 | tee debug.log &

wait
exit 0
scripts/train_tmp.sh
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=20G
#SBATCH --cpus-per-task=4
#SBATCH --output=0_debug.txt

# Single-GPU debug run of the verb-only angular-metric training.
# Usage: sbatch train_tmp.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the directory if it does not exist
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0

# BUGFIX: the original always ended with `exit 0`, reporting success even when
# training crashed. pipefail makes the pipeline status reflect python (not tee),
# and the trainer's status is propagated as the script's exit code.
set -o pipefail
python -u train_angular_verb.py $python_args 2>&1 | tee curr-debug.log &
train_pid=$!

wait "$train_pid"
exit $?
scripts/train_tmp_seunghoon.sh
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash

# Two-GPU training launcher (no SLURM directives; run directly).
# Usage: ./train_tmp_seunghoon.sh <output_dir> <batch_size> <exp_name>
# Resumes automatically from <output_dir>/last_model.pth when it exists.

# Environment setup
source ${HOME}/.bashrc
eval "$(conda shell.bash hook)"
conda activate cris

# CUDA and distributed-training environment
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# Check arguments
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

echo "$OUTPUT_DIR"
echo "$BATCH_SIZE"
echo "$EXP_NAME"

# Create the output directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Check for an existing checkpoint to resume from
FINAL_MODEL="${OUTPUT_DIR}/last_model.pth"
if [[ ! -f "$FINAL_MODEL" ]]; then
    resume_arg=""
else
    resume_arg="--resume"
    model_weights="${FINAL_MODEL}"
fi

# Build the python argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 15 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:7023"

# Prepend the resume arguments when a checkpoint was found
if [[ -n "$resume_arg" ]]; then
    python_args="$resume_arg ${model_weights} $python_args"
fi

# Run training and save the log. pipefail so a trainer failure is not
# masked by tee's (successful) exit status.
set -o pipefail
echo "Starting training..."
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/training.log
scripts/train_verb.sh
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver1_OP2_Verbonly_p10_m20_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (op2 variant) on 4 GPUs.
# Usage: sbatch train_verb.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0
scripts/train_verb_vip.sh
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver2_OP2_Verbonly_p10_m20_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (rev_op2 variant) on the
# vip partition. Usage: sbatch train_verb_vip.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_rev_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8045"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0