mindeyev2old2 / src /accel9.slurm
ckadirt's picture
Upload folder using huggingface_hub
b8ea2b2 verified
#!/bin/bash
# Slurm batch script: requests one node with a single GPU for up to 24 hours,
# then runs train2.py from /fsx/proj-fmri/ckadirt/MindEyeV2/src/.
#
# The #SBATCH lines below are read by sbatch, not by bash; they must stay ahead
# of the first executable command in this file.
#
# Keep the GPU count consistent in three places: --ntasks-per-node, --gres=gpu:N
# and the NUM_GPUS variable exported further down.
#SBATCH --account=fmri
#SBATCH --partition=g40x
#SBATCH --job-name=blip2captions
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # should = number of gpus
#SBATCH --gres=gpu:1
#SBATCH --time=24:00:00 # total run time limit (HH:MM:SS)
#SBATCH --comment=medarc
#SBATCH --requeue
# stderr and stdout go to separate files named after the job ID (%j).
# NOTE(review): these paths are relative, so presumably a "slurms/" directory
# must already exist in the directory the job is submitted from — confirm.
#SBATCH -e slurms/%j.err
#SBATCH -o slurms/%j.out
# Training sizing, exported so that child processes can read it.
# NUM_GPUS has to mirror the "--gres=gpu:N" value requested in the SBATCH header.
NUM_GPUS=1
BATCH_SIZE=128
# GLOBAL_BATCH_SIZE is BATCH_SIZE multiplied by the number of GPUs.
GLOBAL_BATCH_SIZE=$((NUM_GPUS * BATCH_SIZE))
export NUM_GPUS BATCH_SIZE GLOBAL_BATCH_SIZE

# Draw a random port from the inclusive range 11000-19000 so that two jobs are
# unlikely to end up using the same one.
port_floor=11000
port_ceiling=19000
MASTER_PORT=$((port_floor + RANDOM % (port_ceiling - port_floor + 1)))
export MASTER_PORT
# Expand the job's node list into one hostname per line with a single scontrol
# call. Assignment is kept separate from `export` so that a scontrol failure is
# visible here instead of being masked by export's own exit status.
if ! HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST"); then
  echo "warning: 'scontrol show hostnames' failed; MASTER_ADDR/COUNT_NODE may be wrong" >&2
fi
export HOSTNAMES

# Split the hostname list into an array: the first entry becomes MASTER_ADDR and
# the number of entries becomes COUNT_NODE (0 when no hostnames were returned).
node_names=()
if [[ -n "$HOSTNAMES" ]]; then
  mapfile -t node_names <<<"$HOSTNAMES"
fi
export MASTER_ADDR="${node_names[0]:-}"
export COUNT_NODE="${#node_names[@]}"

# Weights & Biases (wandb) environment settings.
export WANDB_DIR="/fsx/proj-fmri/ckadirt/MindEyeV2/src/wandb"
export WANDB_CACHE_DIR="/admin/home-ckadirt/.cache"
export WANDB_MODE="online"

# Record the connection settings in the job's stdout log.
# NOTE: the value printed as WORLD_SIZE is the node count (COUNT_NODE), not
# nodes multiplied by GPUs per node.
printf '%s\n' "MASTER_ADDR=${MASTER_ADDR}"
printf '%s\n' "MASTER_PORT=${MASTER_PORT}"
printf '%s\n' "WORLD_SIZE=${COUNT_NODE}"
# Load the submitting user's shell configuration.
# NOTE(review): presumably this sets up the Python environment that train2.py
# needs — confirm.
source /admin/home-ckadirt/.bashrc
###########
# Stop immediately if the project directory is unavailable; otherwise the
# training command below would run from whatever directory the job started in.
cd /fsx/proj-fmri/ckadirt/MindEyeV2/src/ || {
  echo "error: cannot cd to /fsx/proj-fmri/ckadirt/MindEyeV2/src/" >&2
  exit 1
}

# Disabled launcher prefix (NOTE(review): looks like the multi-process
# alternative to the plain `python` invocation used below — confirm):
# accelerate launch --num_processes=$(($NUM_GPUS * $COUNT_NODE)) --num_machines=$COUNT_NODE --main_process_ip=$MASTER_ADDR --main_process_port=$MASTER_PORT

# Arguments for train2.py, one per line so that individual settings are easy to
# review and diff. --batch_size receives GLOBAL_BATCH_SIZE, exported earlier in
# this script.
train_args=(
  --data_path=/fsx/proj-fmri/shared/mindeyev2_dataset
  --model_name=caption_clip_0.5_bz
  --subj=1
  --batch_size="${GLOBAL_BATCH_SIZE}"
  --max_lr=1e-4
  --mixup_pct=.66
  --num_epochs=50
  --use_image_aug
  --ckpt_interval=15
  --clip_mse_ratio=0.5
  # --wandb_log
)

# Kept as the final command so its exit status becomes the script's exit status.
python train2.py "${train_args[@]}"