#!/bin/bash
#
# Launch (or resume) a train_angular_verb.py run inside the "cris" conda
# environment, mirroring all of its output to OUTPUT_DIR/training.log.
#
# Usage: script OUTPUT_DIR BATCH_SIZE EXP_NAME
#   OUTPUT_DIR  directory for checkpoints and the log; created if missing.
#               A relative path is resolved against the project directory
#               this script changes into.
#   BATCH_SIZE  value passed as TRAIN.batch_size
#   EXP_NAME    value passed as TRAIN.exp_name
#
# If OUTPUT_DIR/last_model.pth exists, it is passed via --resume.

# Make the final "python | tee" pipeline report python's failure instead of
# tee's success. -e and -u are deliberately NOT enabled: ~/.bashrc and the
# conda hook are outside this script's control and may not be strict-mode safe.
set -o pipefail

# --- Argument check ---------------------------------------------------------
# Done before any environment setup so a bad invocation fails fast.
if (( $# != 3 )); then
  printf 'Usage: %s OUTPUT_DIR BATCH_SIZE EXP_NAME\n' "$0" >&2
  exit 1
fi

# --- Variables --------------------------------------------------------------
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# --- Shell / conda environment ----------------------------------------------
source "${HOME}/.bashrc"
eval "$(conda shell.bash hook)"
conda activate cris

# --- CUDA and distributed-training environment ------------------------------
export NCCL_P2P_DISABLE=1                # no GPU peer-to-peer transport
export NCCL_DEBUG=INFO                   # verbose NCCL logging
export NCCL_IB_DISABLE=1                 # no InfiniBand transport
export NCCL_SOCKET_IFNAME='^docker0,lo'  # exclude these network interfaces
export CUDA_VISIBLE_DEVICES=0,1

cd /home/s1/chaeyunkim/VerbCentric_CY || {
  printf 'Cannot cd to project directory\n' >&2
  exit 1
}

printf '%s\n' "$OUTPUT_DIR" "$BATCH_SIZE" "$EXP_NAME"

# --- Output directory -------------------------------------------------------
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Directory $OUTPUT_DIR does not exist. Creating it..."
  mkdir -p -- "$OUTPUT_DIR" || {
    printf 'Cannot create %s\n' "$OUTPUT_DIR" >&2
    exit 1
  }
fi

# --- Python arguments -------------------------------------------------------
# Built as an array so every value stays a single argument even if it
# contains whitespace or glob characters.
python_args=()

# Resume from the last checkpoint when one is present in OUTPUT_DIR.
FINAL_MODEL="${OUTPUT_DIR}/last_model.pth"
if [[ -f "$FINAL_MODEL" ]]; then
  python_args+=(--resume "$FINAL_MODEL")
fi

python_args+=(
  --config config/cris_verbonly_b64_nopos.yaml
  --opts
  TRAIN.metric_mode hardpos_only
  TRAIN.metric_loss_weight 0.1
  TRAIN.hn_prob 0.0
  TRAIN.batch_size "$BATCH_SIZE"
  TRAIN.margin_value 15
  TRAIN.temperature 0.05
  TRAIN.exp_name "$EXP_NAME"
  TRAIN.output_folder "$OUTPUT_DIR"
  Distributed.dist_url tcp://localhost:7023
)

# --- Run training and save the log ------------------------------------------
# NOTE(review): tee truncates training.log on every launch, including resumed
# runs — confirm whether appending (tee -a) is wanted instead.
echo "Starting training..."
python -u train_angular_verb.py "${python_args[@]}" 2>&1 \
  | tee "${OUTPUT_DIR}/training.log"