# Slurm resource request for a single-node, 2-GPU job.
# NOTE(review): sbatch expects a `#!` interpreter line as the very first line
# of the script; none is visible in this section -- confirm one precedes it.
#SBATCH --job-name=g2                 # Job name
### Logging
#SBATCH --output=%j.out               # Stdout (%j expands to jobId)
#SBATCH --error=%j.err                # Stderr (%j expands to jobId)
### Node info
#SBATCH --nodes=1                     # Single node or multi node
#SBATCH --nodelist=sota-2             # Pin the job to this specific node
#SBATCH --time=24:00:00               # Max time (hh:mm:ss)
#SBATCH --gres=gpu:2                  # GPUs per node
#SBATCH --mem=96G                     # Recommend 32G per GPU
#SBATCH --ntasks-per-node=1           # Tasks per node
#SBATCH --cpus-per-task=16            # Recommend 8 per GPU
# Environment for the job. Everything is exported so the python processes
# launched later in this script inherit it.

# Verbose NCCL logging.
export NCCL_DEBUG="INFO"

# CA bundle and HTTPS proxy; both spellings of the proxy variable are set
# because tools differ in which case they read.
export REQUESTS_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
export HTTPS_PROXY="https://192.168.0.10:443/"
export https_proxy="https://192.168.0.10:443/"

# Project switches passed through the environment as the literal strings
# "True"/"False".
# NOTE(review): their exact meaning is defined by the python code, which is
# not visible here -- confirm before changing.
export TEST_VAL_TRAIN="False"
export TEST_VAL_PRED="True"
export WANDB="True"
# Optional hold: keep the allocation idle for HOLD_SECONDS before starting
# work (e.g. to attach to the node for debugging). Defaults to 0: the former
# unconditional `sleep 86400` lasted exactly as long as the 24:00:00 time
# limit requested above, so the steps below could never be reached.
hold_seconds="${HOLD_SECONDS:-0}"
if [ "$hold_seconds" -gt 0 ]; then
  sleep "$hold_seconds"
fi

# Config, output and checkpoint paths below are relative to the project root,
# so abort instead of running from the wrong directory.
cd /u/xiuyu/work/dev4 || {
  printf 'error: cannot cd to %s\n' "/u/xiuyu/work/dev4" >&2
  exit 1
}

# Training run. The project root is prepended to PYTHONPATH for this command
# only; an existing PYTHONPATH is kept after it.
PYTHONPATH=".${PYTHONPATH:+:${PYTHONPATH}}" python3 train.py \
  --devices 2 \
  --config configs/train/train_scalable_with_state.yaml \
  --save_ckpt_path output/seed_1k_pure_seed_150_3_emb_head_3_debug \
  --pretrain_ckpt output/ours_map_pretrain/epoch=31.ckpt

# Validation run.
# NOTE(review): this loads output/seed_1k_pure_seed_150_3_emb_head_3/last.ckpt,
# which is not under the --save_ckpt_path used by the training step above
# (that one ends in "_debug") -- confirm this is intentional.
PYTHONPATH=".${PYTHONPATH:+:${PYTHONPATH}}" python val.py \
  --config configs/validation/val_scalable_with_state.yaml \
  --save_path output/seed_debug \
  --pretrain_ckpt output/seed_1k_pure_seed_150_3_emb_head_3/last.ckpt