File size: 1,552 Bytes
c1a7f73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
#
# Slurm batch script for job "g2".
# Requests one node with 2 GPUs, 16 CPUs and 96G of memory for at most
# 24 hours; stdout and stderr are written to <jobId>.out and <jobId>.err.
#
# The "#SBATCH" lines below are directives read by sbatch, not ordinary
# comments: keep them ahead of the first executable command of the script.

#SBATCH --job-name g2                   # Job name
### Logging
#SBATCH --output=%j.out                 # Stdout (%j expands to jobId)
#SBATCH --error=%j.err                  # Stderr (%j expands to jobId)
### Node info
#SBATCH --nodes=1                       # Single node or multi node
# Restrict the allocation to the host named sota-2.
#SBATCH --nodelist=sota-2
#SBATCH --time 24:00:00                 # Max time (hh:mm:ss)
#SBATCH --gres=gpu:2                    # GPUs per node
#SBATCH --mem=96G                      # Recommend 32G per GPU
#SBATCH --ntasks-per-node=1             # Tasks per node
#SBATCH --cpus-per-task=16              # Recommend 8 per GPU

# NCCL: print informational debug output.
export NCCL_DEBUG=INFO

# Outbound HTTPS: CA bundle path plus the proxy endpoint. The proxy URL is
# written once and exported under both the upper- and lower-case spelling.
export REQUESTS_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt"
export HTTPS_PROXY="https://192.168.0.10:443/"
export https_proxy="${HTTPS_PROXY}"

# Project-specific switches handed to the Python entry points through the
# environment (presumably read by train.py / val.py; WANDB presumably toggles
# Weights & Biases logging -- TODO confirm against the project code).
export TEST_VAL_TRAIN=False \
       TEST_VAL_PRED=True \
       WANDB=True

# Hold the allocation before launching anything. HOLD_SECONDS defaults to the
# previously hard-coded 86400 s (24 h); set e.g. HOLD_SECONDS=0 in the job's
# environment to skip the hold.
# NOTE(review): the default equals the 24:00:00 --time limit requested for this
# job, so with the default the commands below are unlikely to run before the
# limit expires -- confirm the hold is intentional (e.g. reserving the node).
sleep "${HOLD_SECONDS:-86400}"

# Every path below (train.py, val.py, configs/, output/) is relative to the
# project checkout, so stop here if it cannot be entered rather than running
# Python from whatever directory the job happened to start in.
project_dir=/u/xiuyu/work/dev4
cd "${project_dir}" || {
  printf 'error: cannot cd to %s\n' "${project_dir}" >&2
  exit 1
}

# Put the checkout first on the module search path. A pre-existing PYTHONPATH
# is appended only when it is non-empty, which avoids a dangling ':' entry.
py_path=".${PYTHONPATH:+:${PYTHONPATH}}"

# Training run on 2 devices, starting from the map-pretraining checkpoint.
PYTHONPATH="${py_path}" python3 train.py \
  --devices 2 \
  --config configs/train/train_scalable_with_state.yaml \
  --save_ckpt_path output/seed_1k_pure_seed_150_3_emb_head_3_debug \
  --pretrain_ckpt output/ours_map_pretrain/epoch=31.ckpt

# Validation run. As before, it is launched whatever the exit status of the
# training command was, and its exit status becomes the script's.
# Uses python3 so both steps run under the same interpreter.
# NOTE(review): this loads output/seed_1k_pure_seed_150_3_emb_head_3/last.ckpt,
# not the *_debug directory the training step writes to -- confirm that the
# checkpoint path is the intended one.
PYTHONPATH="${py_path}" python3 val.py \
  --config configs/validation/val_scalable_with_state.yaml \
  --save_path output/seed_debug \
  --pretrain_ckpt output/seed_1k_pure_seed_150_3_emb_head_3/last.ckpt