#!/bin/bash
#
# Launches train_angular_verb.py inside the "cris" conda environment.
#
# Usage: <this script> <output_dir> <batch_size> <exp_name>

# Load the user's shell configuration, then the conda base environment, and
# finally switch to the project environment.
source "${HOME}/.bashrc"
source "${HOME}/miniconda3/bin/activate" base
conda activate cris

# The config file and the training script are referenced by relative path
# later on, so abort instead of continuing from the wrong directory.
cd /home/s1/chaeyunkim/VerbCentric_CY || {
  echo "Cannot cd to /home/s1/chaeyunkim/VerbCentric_CY" >&2
  exit 1
}

# Exactly three positional arguments are required; print usage on stderr
# and stop before any work is done otherwise.
if (( $# != 3 )); then
  echo "Usage: $0 <output_dir> <batch_size> <exp_name>" >&2
  exit 1
fi

# Upper bound on how many times this job may be requeued.
max_restarts=3

#######################################
# SIGUSR1 handler: requeue the current Slurm job unless it has already been
# restarted max_restarts times.
# Globals:
#   SLURM_JOB_ID (read), max_restarts (read)
# Outputs:
#   Progress message on stdout; failure messages on stderr.
# Exits:
#   0 after the requeue request was accepted,
#   1 if the restart limit is reached or the requeue request fails.
#######################################
resubmit() {
  local job_info
  local restarts=0

  job_info=$(scontrol show job "${SLURM_JOB_ID}")

  # Pull the first "Restarts=<n>" field out of the job description. If the
  # field is missing (e.g. scontrol failed) the count stays at 0, matching
  # the previous best-effort behaviour of attempting a requeue anyway.
  if [[ ${job_info} =~ Restarts=([0-9]+) ]]; then
    restarts=${BASH_REMATCH[1]}
  fi

  # 10# forces base 10 so a zero-padded count is never read as octal.
  if (( 10#${restarts} < max_restarts )); then
    echo "Resubmitting job (restart $restarts/$max_restarts)..."
    if ! scontrol requeue "${SLURM_JOB_ID}"; then
      echo "scontrol requeue failed for job ${SLURM_JOB_ID}." >&2
      exit 1
    fi
    exit 0
  else
    echo "Job has exceeded the maximum restart limit ($max_restarts restarts)." >&2
    exit 1
  fi
}

# NOTE(review): this handler only runs if Slurm delivers SIGUSR1 to the batch
# shell (e.g. via an sbatch --signal=B:USR1@<seconds> option). No such option
# is visible in this file -- confirm it is supplied at submission time.
trap resubmit SIGUSR1

# Positional arguments (their count is validated earlier in the script).
OUTPUT_DIR=$1
BATCH_SIZE=$2
EXP_NAME=$3

# Echo the run parameters so they show up at the top of the job log.
echo "OUTPUT_DIR: $OUTPUT_DIR"
echo "BATCH_SIZE: $BATCH_SIZE"
echo "EXP_NAME: $EXP_NAME"

# Create the output directory (including parents) on first use, and stop if
# that is not possible rather than training with nowhere to write results.
if [[ ! -d "$OUTPUT_DIR" ]]; then
  echo "Directory $OUTPUT_DIR does not exist. Creating it..."
  mkdir -p -- "$OUTPUT_DIR" || {
    echo "Failed to create directory $OUTPUT_DIR" >&2
    exit 1
  }
fi

# Command-line arguments for train_angular_verb.py. They are kept in an array
# so that each value is passed as exactly one argument, even if OUTPUT_DIR or
# EXP_NAME contains whitespace or glob characters.
python_args=(
  --config config/cris_verbonly_b64_nopos.yaml
  --opts
  TRAIN.metric_mode hardpos_only_fin
  TRAIN.metric_loss_weight 0.1
  TRAIN.hn_prob 0.0
  TRAIN.resume latest
  TRAIN.batch_size "${BATCH_SIZE}"
  TRAIN.margin_value 15
  TRAIN.temperature 0.05
  TRAIN.exp_name "${EXP_NAME}"
  TRAIN.output_folder "${OUTPUT_DIR}"
  Distributed.dist_url tcp://localhost:7023
)

echo "Final command: python -u train_angular_verb.py ${python_args[*]}"

# NCCL / CUDA settings inherited by the training process.
export NCCL_P2P_DISABLE=1             # turn off NCCL peer-to-peer transport
export NCCL_DEBUG=INFO                # verbose NCCL logging
export NCCL_SOCKET_IFNAME=^docker0,lo # leading ^ excludes the listed interfaces
export CUDA_VISIBLE_DEVICES=0,1,2,3

# tee opens its log file as soon as the pipeline starts, so the directory
# must exist beforehand.
# NOTE(review): presumably the trainer also writes into this directory;
# confirm that pre-creating it does not affect "TRAIN.resume latest".
log_dir="${OUTPUT_DIR}/${EXP_NAME}"
mkdir -p -- "${log_dir}" || {
  echo "Failed to create log directory ${log_dir}" >&2
  exit 1
}

# Run the training pipeline in the background and block in `wait`: bash runs
# a trap handler immediately when a signal arrives during `wait`, whereas it
# would defer the handler until a foreground command had finished.
# The group exits with python's status (not tee's) so failures are visible.
# NOTE(review): tee truncates the log on every (re)start of the job.
{
  python -u train_angular_verb.py "${python_args[@]}" 2>&1 \
    | tee "${log_dir}/train_rev_version1.log"
  exit "${PIPESTATUS[0]}"
} &
train_pid=$!

# Propagate the training result instead of always reporting success.
wait "${train_pid}"
exit $?