#!/bin/bash
#SBATCH --job-name=CRIS_AML_pos10_m20_t005
#SBATCH --nodes=1
#SBATCH --gres=gpu:1
#SBATCH --time=0-12:00:00     # d-hh:mm:ss, job time limit
#SBATCH --mem=20G
#SBATCH --cpus-per-task=4
#SBATCH --output=0_debug.txt
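# Environment setup: load the user shell config and activate the conda environment used for training.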
source ${HOME}/.bashrc
source ${HOME}/miniconda3/bin/activate base
conda activate cris
# Run from the repository root so the relative config path below resolves.
cd /home/s1/chaeyunkim/VerbCentric_CY
if [ "$#" -ne 3 ]; then | |
echo "Usage: $0 <output_dir> <batch_size> <exp_name>" | |
exit 1 | |
fi | |
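# Example submission (hypothetical script name and argument values):
#   sbatch train_cris_hardpos.sbatch ./exp/pos10_m20_t005 64 pos10_m20_t005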
# Read the three positional arguments
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3
# Print variables for debugging
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"
# Create the output directory if it does not exist
if [[ ! -d "$OUTPUT_DIR" ]]; then
    echo "Directory $OUTPUT_DIR does not exist. Creating it..."
    mkdir -p "$OUTPUT_DIR"
fi
# Construct the argument list for train_angular_verb.py
python_args="--config config/cris_verbonly_b64_nopos.yaml --opts TRAIN.metric_mode hardpos_only_op2 TRAIN.metric_loss_weight 0.1 TRAIN.hn_prob 0.0 TRAIN.resume latest TRAIN.batch_size ${BATCH_SIZE} TRAIN.margin_value 20 TRAIN.temperature 0.05 TRAIN.exp_name ${EXP_NAME} TRAIN.output_folder ${OUTPUT_DIR} Distributed.dist_url tcp://localhost:8845"
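# The --opts overrides select the hard-positive-only metric objective (loss weight 0.1,
# margin 20, temperature 0.05, hn_prob 0.0), resume from the latest checkpoint, and set
# a local TCP rendezvous address for single-node distributed init.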
# Print the final command for debugging
echo "Final command: python -u train_angular_verb.py $python_args"
# NCCL / CUDA environment variables
export NCCL_P2P_DISABLE=1              # disable GPU peer-to-peer transfers
export NCCL_DEBUG=INFO                 # verbose NCCL logging for debugging
export NCCL_SOCKET_IFNAME=^docker0,lo  # exclude the docker0 and loopback interfaces
export CUDA_VISIBLE_DEVICES=0          # expose only GPU 0 to the process
# Run the Python training script, teeing output to curr-debug.log; pipefail makes
# the pipeline exit with python's status rather than tee's, so failures are reported.
set -o pipefail
python -u train_angular_verb.py $python_args 2>&1 | tee curr-debug.log
exit $?