Upload folder using huggingface_hub
Browse files- scripts/test_verb.sh +53 -0
- scripts/train_notarget.sh +81 -0
- scripts/train_repro.sh +41 -0
- scripts/train_tmp.sh +53 -0
- scripts/train_tmp_seunghoon.sh +57 -0
- scripts/train_verb.sh +70 -0
- scripts/train_verb_vip.sh +71 -0
scripts/test_verb.sh
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=EVAL
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=50G
#SBATCH --cpus-per-task=8       # CPU cores per task (original comment was mojibake)
#SBATCH --output=./log_eval/ACLrevised1_VO_hp10_m20_tmp005_b64.txt

# Evaluate a trained CRIS experiment on RefCOCOg (umd split):
# runs oIoU (test_oiou.py) and mIoU (test.py) on both the test and val LMDBs.

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

# Guard the cd: running the eval scripts from the wrong directory would
# silently use wrong relative paths.
cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1
export NCCL_P2P_DISABLE=1

# constants
CONFIG=config/cris_r50.yaml
VAL_LMDB=datasets/lmdb/refcocog_u/val.lmdb
TEST_LMDB=datasets/lmdb/refcocog_u/test.lmdb
# variables
EXP_NAME=ACLrevised1_VO_hp10_m20_tmp005_b64
OPT_DIR=exp/refcocog_u/hardpos_loss_abl


# TEST
# test oIoU
# NOTE: continuations are written "VALUE \" (with a space) — the original
# "$CONFIG\" form concatenates with the next line when it has no indent.
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"
# test mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split test \
           TEST.test_lmdb "$TEST_LMDB"

# VAL
# val oIoU
CUDA_VISIBLE_DEVICES=0 python -u test_oiou.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
# val mIoU
CUDA_VISIBLE_DEVICES=0 python -u test.py --config "$CONFIG" \
    --opts TRAIN.exp_name "$EXP_NAME" \
           TRAIN.output_folder "$OPT_DIR" \
           TEST.test_split val-test \
           TEST.test_lmdb "$VAL_LMDB"
scripts/train_notarget.sh
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=logs/CRIS_ACLver2_notgt_p10_m15_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (no-target variant).
# Usage: sbatch train_notarget.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml \
    --opts TRAIN.metric_mode hardpos_only_fin \
    TRAIN.metric_loss_weight 0.1 \
    TRAIN.hn_prob 0.0 \
    TRAIN.resume latest \
    TRAIN.batch_size ${BATCH_SIZE} \
    TRAIN.margin_value 15 \
    TRAIN.temperature 0.05 \
    TRAIN.exp_name ${EXP_NAME} \
    TRAIN.output_folder ${OUTPUT_DIR} \
    Distributed.dist_url tcp://localhost:7023"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0
scripts/train_repro.sh
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_repro
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00
#SBATCH --mem=60G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_REPRO.txt

# Reproduction run: train CRIS with the baseline r50 config on 4 GPUs,
# requeueing the SLURM job on SIGUSR1 up to a fixed number of restarts.

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY


# Requeue handler: invoked via trap when SLURM sends SIGUSR1.
max_restarts=3

resubmit() {
    job_info=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${job_info} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if (( restarts < max_restarts )); then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    fi
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
    exit 1
}
trap 'resubmit' SIGUSR1

export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u train_angular_verb.py --config /home/s1/chaeyunkim/VerbCentric_CY/config/cris_r50.yaml 2>&1 | tee debug.log &

wait
exit 0
scripts/train_tmp.sh
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=20G
#SBATCH --cpus-per-task=4
#SBATCH --output=0_debug.txt

# Single-GPU debug run of the verb-only angular-metric training.
# Usage: sbatch train_tmp.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the directory if it does not exist
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0

# BUGFIX: the original always ended with `exit 0`, reporting success even when
# training crashed. pipefail makes the pipeline status reflect python (not tee),
# and the trainer's status is propagated as the script's exit code.
set -o pipefail
python -u train_angular_verb.py $python_args 2>&1 | tee curr-debug.log &
train_pid=$!

wait "$train_pid"
exit $?
scripts/train_tmp_seunghoon.sh
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash

# Two-GPU training launcher (no SLURM directives; run directly).
# Usage: ./train_tmp_seunghoon.sh <output_dir> <batch_size> <exp_name>
# Resumes automatically from <output_dir>/last_model.pth when it exists.

# Environment setup
source ${HOME}/.bashrc
eval "$(conda shell.bash hook)"
conda activate cris

# CUDA and distributed-training environment
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

# Check arguments
if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

echo "$OUTPUT_DIR"
echo "$BATCH_SIZE"
echo "$EXP_NAME"

# Create the output directory
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi

# Check for an existing checkpoint to resume from
FINAL_MODEL="${OUTPUT_DIR}/last_model.pth"
if [[ ! -f "$FINAL_MODEL" ]]; then
    resume_arg=""
else
    resume_arg="--resume"
    model_weights="${FINAL_MODEL}"
fi

# Build the python argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 15 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:7023"

# Prepend the resume arguments when a checkpoint was found
if [[ -n "$resume_arg" ]]; then
    python_args="$resume_arg ${model_weights} $python_args"
fi

# Run training and save the log. pipefail so a trainer failure is not
# masked by tee's (successful) exit status.
set -o pipefail
echo "Starting training..."
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/training.log
scripts/train_verb.sh
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=0-12:00:00       # d-hh:mm:ss, job time limit
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver1_OP2_Verbonly_p10_m20_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (op2 variant) on 4 GPUs.
# Usage: sbatch train_verb.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0
scripts/train_verb_vip.sh
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --partition=vip
#SBATCH --time=unlimited
#SBATCH --mem=80G
#SBATCH --cpus-per-task=12
#SBATCH --output=CRIS_ACLver2_OP2_Verbonly_p10_m20_t005_b64.txt

# Train CRIS with the verb-only angular metric loss (rev_op2 variant) on the
# vip partition. Usage: sbatch train_verb_vip.sh <output_dir> <batch_size> <exp_name>

source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris

cd /home/s1/chaeyunkim/VerbCentric_CY || exit 1

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <output_dir> <batch_size> <exp_name>"
    exit 1
fi

# Trap SIGUSR1 (sent by SLURM before preemption / time limit) and requeue
# the job, up to max_restarts times.
max_restarts=3

function resubmit() {
    scontext=$(scontrol show job ${SLURM_JOB_ID})
    restarts=$(echo ${scontext} | grep -o 'Restarts=[0-9]*' | cut -d= -f2)
    if [[ $restarts -lt $max_restarts ]]; then
        echo "Resubmitting job (restart $restarts/$max_restarts)..."
        scontrol requeue ${SLURM_JOB_ID}
        exit 0
    else
        echo "Job has exceeded the maximum restart limit ($max_restarts restarts)."
        exit 1
    fi
}
trap 'resubmit' SIGUSR1

# Use the first argument passed to the script as OUTPUT_DIR
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directories if they do not exist.
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# BUGFIX: the training log below is written into ${OUTPUT_DIR}/${EXP_NAME}/,
# which was never created — tee would fail to open the log file on a fresh run.
mkdir -p "${OUTPUT_DIR}/${EXP_NAME}"

# Construct the argument list
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_rev_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8045"

# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"

# Set NCCL environment variables
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=^docker0,lo
export CUDA_VISIBLE_DEVICES=0,1,2,3

# Run training in the background so the SIGUSR1 trap can fire while we wait.
python -u train_angular_verb.py $python_args 2>&1 | tee ${OUTPUT_DIR}/${EXP_NAME}/train_rev_version1.log &

wait
exit 0