dianecy commited on
Commit
599450c
·
verified ·
1 Parent(s): 2b1ca99

Upload folder using huggingface_hub

Browse files
scripts/test_verb.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=EVAL
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00          # d-hh:mm:ss, job time limit
#SBATCH --mem=50G
#SBATCH --cpus-per-task=8          # number of CPUs per task
#SBATCH --output=./log_eval/ACLrevised1_VO_hp10_m20_tmp005_b64.txt

# Evaluation job: runs oIoU and mIoU tests on the test and val splits of
# refcocog_u for one experiment checkpoint.
# NOTE(review): Slurm opens --output before this script runs, so ./log_eval
# must already exist at submission time — confirm it is created elsewhere.

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1
export NCCL_P2P_DISABLE=1

# constants
CONFIG=config/cris_r50.yaml
VAL_LMDB=datasets/lmdb/refcocog_u/val.lmdb
TEST_LMDB=datasets/lmdb/refcocog_u/test.lmdb
# variables
EXP_NAME=ACLrevised1_VO_hp10_m20_tmp005_b64
OPT_DIR=exp/refcocog_u/hardpos_loss_abl

# TEST split
# test oIoU
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"
# test mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"

# VAL split
# val oIoU
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
# val mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
scripts/train_notarget.sh ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=logs/CRIS_ACLver2_notgt_p10_m15_t005_b64.txt

# Training job (no-target variant, metric_mode=hardpos_only_fin) with
# SIGUSR1-driven requeueing. Usage: sbatch ... <output_dir> <batch_size> <exp_name>

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Trap SIGUSR1 to handle job requeueing (at most max_restarts times)
max_restarts=3

resubmit() {
    scontext=$(scontrol show job "${SLURM_JOB_ID}")
    restarts=$(echo "${scontext}" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue "${SLURM_JOB_ID}"
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# BUGFIX: the training log is written under ${OUTPUT_DIR}/${EXP_NAME}/, but
# only ${OUTPUT_DIR} was created, so tee failed on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list (intentionally left unquoted at the call site:
# it must word-split into separate CLI arguments).
python_args="--config config/cris_verbonly_b64_nopos.yaml \
    --opts TRAIN.metric_mode hardpos_only_fin \
    TRAIN.metric_loss_weight 0.1 \
    TRAIN.hn_prob 0.0 \
    TRAIN.resume latest \
    TRAIN.batch_size ${BATCH_SIZE} \
    TRAIN.margin_value 15 \
    TRAIN.temperature 0.05 \
    TRAIN.exp_name ${EXP_NAME} \
    TRAIN.output_folder ${OUTPUT_DIR} \
    Distributed.dist_url tcp://localhost:7023"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee "${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log" &
train_pid=$!

# BUGFIX: previously `wait; exit 0` always reported success; propagate the
# training pipeline's real exit status instead.
wait "$train_pid"
exit $?
scripts/train_repro.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt

# Reproduction training run for the baseline cris_r50 config, with
# SIGUSR1-driven requeueing (at most max_restarts times).

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

# Trap SIGUSR1 to handle job requeueing
max_restarts=3

resubmit() {
    scontext=$(scontrol show job "${SLURM_JOB_ID}")
    restarts=$(echo "${scontext}" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue "${SLURM_JOB_ID}"
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py --config /home/s1/chaeyunkim/VerbCentric_CY/config/cris_r50.yaml 2>&1 | tee debug.log &
train_pid=$!

# BUGFIX: previously `wait; exit 0` always reported success; propagate the
# training pipeline's real exit status instead.
wait "$train_pid"
exit $?
scripts/train_tmp.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00          # d-hh:mm:ss, job time limit
#SBATCH --mem=20G
#SBATCH --cpus-per-task=4
#SBATCH --output=0_debug.txt

# Single-GPU debug training run (metric_mode=hardpos_only_op2).
# Usage: sbatch ... <output_dir> <batch_size> <exp_name>

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the directory if it does not exist
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Construct the argument list (intentionally left unquoted at the call site:
# it must word-split into separate CLI arguments).
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0

# Run the Python training script in the background, then wait on it.
python -u train_angular_verb.py $python_args 2>&1 | tee curr-debug.log &
train_pid=$!

# BUGFIX: previously `wait; exit 0` always reported success; propagate the
# training pipeline's real exit status instead.
wait "$train_pid"
exit $?
scripts/train_tmp_seunghoon.sh ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Two-GPU training run (metric_mode=hardpos_only) that auto-resumes from
# ${OUTPUT_DIR}/last_model.pth if present.
# Usage: ./train_tmp_seunghoon.sh <output_dir> <batch_size> <exp_name>

# Environment setup
source "${HOME}/.bashrc"
eval "$(conda shell.bash hook)"
conda activate cris

# CUDA and distributed-training environment
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

# Argument check
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

echo "$OUTPUT_DIR"
echo "$BATCH_SIZE"
echo "$EXP_NAME"

# Create the output directory if needed
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Check for an existing checkpoint; BUGFIX: initialize model_weights on both
# branches so it is never referenced while unset.
FINAL_MODEL="${OUTPUT_DIR}/last_model.pth"
if [[ -f "$FINAL_MODEL" ]]; then
    resume_arg="--resume"
    model_weights="${FINAL_MODEL}"
else
    resume_arg=""
    model_weights=""
fi

# Build the Python argument list (intentionally left unquoted at the call
# site: it must word-split into separate CLI arguments).
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 15 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:7023"

# Prepend the resume arguments when a checkpoint exists
if [[ -n "$resume_arg" ]]; then
    python_args="$resume_arg ${model_weights} $python_args"
fi

# Run training and save the log
echo "Starting training..."
python -u train_angular_verb.py $python_args 2>&1 | tee "${OUTPUT_DIR}/training.log"
exit $?
scripts/train_verb.sh ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00          # d-hh:mm:ss, job time limit
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver1_OP2_Verbonly_p10_m20_t005_b64.txt

# Four-GPU training run (metric_mode=hardpos_only_op2) with SIGUSR1-driven
# requeueing. Usage: sbatch ... <output_dir> <batch_size> <exp_name>

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Trap SIGUSR1 to handle job requeueing (at most max_restarts times)
max_restarts=3

resubmit() {
    scontext=$(scontrol show job "${SLURM_JOB_ID}")
    restarts=$(echo "${scontext}" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue "${SLURM_JOB_ID}"
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# BUGFIX: the training log is written under ${OUTPUT_DIR}/${EXP_NAME}/, but
# only ${OUTPUT_DIR} was created, so tee failed on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list (intentionally left unquoted at the call site:
# it must word-split into separate CLI arguments).
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee "${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log" &
train_pid=$!

# BUGFIX: previously `wait; exit 0` always reported success; propagate the
# training pipeline's real exit status instead.
wait "$train_pid"
exit $?
scripts/train_verb_vip.sh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver2_OP2_Verbonly_p10_m20_t005_b64.txt

# Four-GPU training run on the vip partition (metric_mode=hardpos_only_rev_op2)
# with SIGUSR1-driven requeueing.
# Usage: sbatch ... <output_dir> <batch_size> <exp_name>

source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# BUGFIX: without pipefail, `python | tee` reports tee's (successful) status
# and a training crash is silently swallowed.
set -o pipefail

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
    exit 1
fi

# Trap SIGUSR1 to handle job requeueing (at most max_restarts times)
max_restarts=3

resubmit() {
    scontext=$(scontrol show job "${SLURM_JOB_ID}")
    restarts=$(echo "${scontext}" | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue "${SLURM_JOB_ID}"
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# BUGFIX: the training log is written under ${OUTPUT_DIR}/${EXP_NAME}/, but
# only ${OUTPUT_DIR} was created, so tee failed on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list (intentionally left unquoted at the call site:
# it must word-split into separate CLI arguments).
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_rev_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8045"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee "${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log" &
train_pid=$!

# BUGFIX: previously `wait; exit 0` always reported success; propagate the
# training pipeline's real exit status instead.
wait "$train_pid"
exit $?