#!/bin/bash
#SBATCH --account=fmri
#SBATCH --partition=g40x
#SBATCH --job-name=memoryrr
#SBATCH --nodes=1
# ntasks-per-node should equal the number of GPUs requested via --gres.
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:8
# Total run time limit (HH:MM:SS).
#SBATCH --time=32:00:00
#SBATCH -e slurms/%j.err
#SBATCH -o slurms/%j.out
#SBATCH --comment=fmri

# Slurm batch script that launches Train_MLPMixer-img.py through
# `accelerate launch` on the nodes allocated to this job.
#
# Required environment: SLURM_JOB_NODELIST (read below to resolve the
# allocated hostnames).
#
# NOTE: every #SBATCH directive must stay above the first executable command
# and on its own line, directly after the shebang.

# Keep NUM_GPUS equal to the value given in --gres=gpu:# above.
export NUM_GPUS=8
export BATCH_SIZE=32
# Per-GPU batch size multiplied by the GPUs per node; this is the value passed
# to the training script as --batch_size.
export GLOBAL_BATCH_SIZE=$((BATCH_SIZE * NUM_GPUS))

# Pick a random port in [11000, 19000] so that another job is unlikely to use
# the same rendezvous port.
export MASTER_PORT=$((RANDOM % (19000 - 11000 + 1) + 11000))

# Expand the node list once. Assignment is kept separate from `export` so a
# failing scontrol is detected instead of being masked by export's exit status.
HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") || {
  echo "error: could not expand SLURM_JOB_NODELIST with scontrol" >&2
  exit 1
}
export HOSTNAMES

# The first listed host serves as the main process address.
MASTER_ADDR=$(head -n 1 <<<"$HOSTNAMES")
export MASTER_ADDR

# Number of hostnames (non-empty lines) in the expanded list.
COUNT_NODE=$(grep -c . <<<"$HOSTNAMES")
export COUNT_NODE

export WANDB_DIR="/fsx/proj-fmri/ckadirt/MindEyeV2/src/wandb"
export WANDB_CACHE_DIR="/admin/home-ckadirt/.cache"
export WANDB_MODE="online"

echo MASTER_ADDR=${MASTER_ADDR}
echo MASTER_PORT=${MASTER_PORT}
# NOTE(review): the label says WORLD_SIZE but the value printed is the node
# count, not NUM_GPUS * COUNT_NODE -- confirm which one is intended.
echo WORLD_SIZE=${COUNT_NODE}

# shellcheck source=/dev/null
source /admin/home-ckadirt/.bashrc

###########

# Abort instead of launching from the wrong directory: the training script is
# referenced by a relative path below.
cd /fsx/proj-fmri/ckadirt/MindEyeV2/src/ || {
  echo "error: cannot cd to /fsx/proj-fmri/ckadirt/MindEyeV2/src/" >&2
  exit 1
}

# Options consumed by `accelerate launch` itself.
launch_args=(
  --num_processes="$((NUM_GPUS * COUNT_NODE))"
  --num_machines="$COUNT_NODE"
  --main_process_ip="$MASTER_ADDR"
  --main_process_port="$MASTER_PORT"
)

# Options forwarded to the training script.
train_args=(
  --data_path=/fsx/proj-fmri/shared/mindeyev2_dataset
  --model_name=testing-rr-1024-img-past-2
  --subj=1
  --batch_size="${GLOBAL_BATCH_SIZE}"
  --max_lr=3e-4
  --mixup_pct=.66
  --num_epochs=120
  --ckpt_interval=10
  --no-use_image_aug
  --hidden_dim=1024
  --seq_len=2
  # --wandb_log
)

accelerate launch "${launch_args[@]}" Train_MLPMixer-img.py "${train_args[@]}"