djgagne committed on Jan 24

Commit

d6d123e

1 Parent(s): f59a072

Added model files

Browse files

Files changed (28) hide show

README.md +22 -3
backup_checkpoint.pt +3 -0
backup_final/best_checkpoint.pt +3 -0
backup_final/best_model_checkpoint.pt +3 -0
backup_final/best_optimizer_checkpoint.pt +3 -0
backup_final/checkpoint.pt +3 -0
backup_final/model_checkpoint.pt +3 -0
backup_final/optimizer_checkpoint.pt +3 -0
backup_final/training_log.csv +19 -0
backup_model_checkpoint.pt +3 -0
backup_optimizer_checkpoint.pt +3 -0
best_checkpoint.pt +3 -0
best_model_checkpoint.pt +3 -0
best_optimizer_checkpoint.pt +3 -0
checkpoint.pt +3 -0
l.sh +44 -0
launch_multi.sh +46 -0
launch_predict.sh +45 -0
launch_rollout.sh +45 -0
launch_single.sh +46 -0
model.yml +0 -0
model_checkpoint.pt +3 -0
model_multi.yml +224 -0
model_predict.yml +136 -0
model_predict_cpu.yml +116 -0
model_single.yml +205 -0
model_single_cached.yml +200 -0
optimizer_checkpoint.pt +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,22 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+language:
+- en
+tags:
+- weather
+- climate
+- global
+---
+# NSF NCAR Community Research Earth Digital Intelligence Twin (CREDIT) FuXi 6-Hour Model Weights and Configuration
+This repository contains the PyTorch checkpoint weights and data/model configuration files for the CREDIT FuXi 6-hour model.
+More information about the training and verification of this model can be found in the Schreck et al. (2024) arXiv [preprint](https://arxiv.org/abs/2411.07814).
+## Data Access
+Our model is trained on ERA5 Reanalysis Data from a subset of 16 of the 137 hybrid sigma-pressure vertical levels.
+The raw data are available on the NSF NCAR [Research Data Archive](https://rda.ucar.edu/datasets/d633006/).
+Processed data can be accessed on the [CREDIT ERA5 Zarr Files](https://app.globus.org/file-manager?origin_id=2fc90d8f-10b7-44e1-a6a5-cf844112822e&origin_path=%2F) Globus collection.
+## Running the Model
+To run the model, first install the [CREDIT package](https://github.com/NCAR/miles-credit) from GitHub.
+Modify the paths in the `model_predict.yml` configuration file to point to the appropriate ERA5 and scaler directories.

backup_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f23a63c6843906694ae529948df4d4ac347b4a103eafe7bf939331257fbb921
+size 1260

backup_final/best_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce0b399e07427b2ad9bd4d44d03801866f58bc236229341cdf2bce1dfcdd2551
+size 1132

backup_final/best_model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3333a5bca6a8ca87285866c05998ffe00bd726156a70814b208dcfab0f9c86d8
+size 1044714782

backup_final/best_optimizer_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8fcbe3d191c3999d8652133c32ca2487924ff34a5a8238947b0e0ac795c1d78
+size 1683978368

backup_final/checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b25c034739e99cc1ad34d56332f3f29a00a30086874e631ffaca50294c0b663
+size 1132

backup_final/model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af8ba6f9618c1a075b21c0fa6d741e511a933caac4466f74b7c5b88c3fdc1595
+size 1044714782

backup_final/optimizer_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c304aa2c70e1b98e5c88c0a4ed76024e6b7ce0681d1fb94e5cfb40d264ddde49
+size 1683978368

backup_final/training_log.csv ADDED Viewed

	@@ -0,0 +1,19 @@

+index,epoch,train_forecast_len,valid_forecast_len,train_loss,valid_loss,train_acc,valid_acc,train_mae,valid_mae,lr
+0,0,8.0,,0.1387606938719278,0.1108131468296051,0.970562528166061,0.9694464921951294,0.1099035362854148,0.1114028289914131,1e-06
+1,1,8.0,,0.1377466195342726,0.1100945279002189,0.9708313843792448,0.9694475412368776,0.1095289197089015,0.111414648592472,9.938441702975689e-07
+2,2,8.0,,0.1385083928684629,0.1117167651653289,0.970662198676108,0.9694490909576416,0.1096920592413432,0.1113834738731384,9.755282581475769e-07
+3,3,8.0,,0.1384558184501839,0.1127898842096328,0.9706702200948644,0.969444227218628,0.1096835084190795,0.1113786697387695,9.45503262094184e-07
+4,4,8.0,,0.1384486823842145,0.1118623703718185,0.9707273427048534,0.9694671511650086,0.1096120545829551,0.1113623306155204,9.045084971874738e-07
+5,5,8.0,,0.1381992981407168,0.1129628434777259,0.9707307068726762,0.9694624781608582,0.1096099303527311,0.1113620564341545,8.535533905932738e-07
+6,6,8.0,,0.1391399535737018,0.1127516105771064,0.970510085502318,0.9694764852523804,0.1099141280661301,0.11133803576231,7.938926261462367e-07
+7,7,8.0,,0.1381134882824537,0.1126476496458053,0.970679276152876,0.9694833755493164,0.1096669749230421,0.1113172695040702,7.269952498697736e-07
+8,8,8.0,,0.1384780509630525,0.1114308580756187,0.970642456109973,0.9694918870925904,0.1097221110256607,0.1112886816263198,6.54508497187474e-07
+9,9,8.0,,0.1387791881958643,0.109854482114315,0.9706856229088524,0.9695030689239502,0.1096888491503491,0.1112985372543335,5.782172325201157e-07
+10,10,8.0,,0.1381753478910926,0.1100143820047378,0.9706785721269992,0.9695019960403444,0.1096319330461097,0.1112829759716987,5.000000000000002e-07
+11,11,8.0,,0.1397529813965318,0.1093554720282554,0.9704045921917488,0.9695140957832336,0.1100279053032633,0.1112509205937385,4.217827674798849e-07
+12,12,8.0,,0.1388648276822212,0.1110095202922821,0.970521897509478,0.969509732723236,0.1098963392895986,0.1112472981214523,3.454915028125265e-07
+13,13,8.0,,0.139924709661833,0.1111257791519165,0.9704050614899796,0.9695259213447572,0.1100835978435119,0.111248242855072,2.730047501302268e-07
+14,14,8.0,,0.1386912109745035,0.1115610241889953,0.9705360937809598,0.9695113182067872,0.1098389108229688,0.1112457856535911,2.061073738537636e-07
+15,15,8.0,,0.1394790709411359,0.1099427804350852,0.9703982228975208,0.9695213079452516,0.1101159101182764,0.1112276285886764,1.4644660940672637e-07
+16,16,8.0,,0.1392562538778043,0.1106211960315704,0.9705332976241984,0.9695265769958497,0.1098723398013548,0.1112342774868011,9.549150281252641e-08
+17,17,8.0,,0.1382005311872648,0.11191920638084411,0.9706875172214232,0.9695238590240478,0.10971980983678531,0.11123556792736053,5.449673790581614e-08

backup_model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63ce28c57eb54f0e5d4a4229cfe9dfc26327f7812a73c5ac0eb657fc617335c3
+size 1044714782

backup_optimizer_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d079d97419fa481b7bf2cda4b24a33fb43c5f3b357940b3e017f9b95910a7
+size 1683978368

best_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:521fa1b735f45041505f3d1fccfae8b7d18625e5a59f52069452a9cab31f183e
+size 1260

best_model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1211699e85d387cf89e913a0b86d92e58986e8222322ef1cb1fb7dc30c23a790
+size 1044714782

best_optimizer_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca87b2643346de107cd09e11fe130fbaa33b8669b139c0cc462b65dcc41c180e
+size 1683978368

checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:97de0e29c19c96993c73f700e695f0bf31ebdf7ba4297c3bda664e3e317a224a
+size 1260

l.sh ADDED Viewed

	@@ -0,0 +1,44 @@

+#!/bin/bash
+#PBS -A NAML0001
+#PBS -N wxformer_6h
+#PBS -l walltime=12:00:00
+#PBS -l select=8:ncpus=64:ngpus=4:mem=480GB
+#PBS -q main
+#PBS -j oe
+#PBS -k eod
+# Load modules
+module purge
+module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
+conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
+# Export environment variables
+export LSCRATCH=/glade/derecho/scratch/schreck/
+export LOGLEVEL=INFO
+export NCCL_DEBUG=INFO
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export NCCL_SOCKET_IFNAME=hsn
+export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
+export MPICH_OFI_NIC_POLICY=GPU
+export MPICH_GPU_SUPPORT_ENABLED=1
+export NCCL_IB_DISABLE=1
+export NCCL_CROSS_NIC=1
+export NCCL_NCHANNELS_PER_NET_PEER=4
+export MPICH_RDMA_ENABLED_CUDA=1
+export NCCL_NET="AWS Libfabric"
+export NCCL_NET_GDR_LEVEL=PBH
+export FI_CXI_DISABLE_HOST_REGISTER=1
+export FI_CXI_OPTIMIZED_MRS=false
+export FI_MR_CACHE_MONITOR=userfaultfd
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+# logger.info the results
+echo "Number of nodes: 8"
+echo "Number of GPUs per node: 4"
+echo "Total number of GPUs: 32"
+# Log in to WandB if needed
+# wandb login 02d2b1af00b5df901cb2bee071872de774781520
+# Launch MPIs
+nodes=( $( cat $PBS_NODEFILE ) )
+echo nodes: $nodes
+# Find headnode's IP:
+head_node=${nodes[0]}
+head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
+MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/derecho/scratch/schreck/CREDIT_runs/test_ben_env/miles-credit/applications/train.py -c model.yml --backend nccl

launch_multi.sh ADDED Viewed

	@@ -0,0 +1,46 @@

+#!/bin/bash
+#PBS -A NCIS0010
+#PBS -N fx6h_multi
+#PBS -l walltime=12:00:00
+#PBS -l select=8:ncpus=64:ngpus=4
+#PBS -q main
+#PBS -j oe
+#PBS -k eod
+#PBS -r n
+# Load modules
+module purge
+module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
+conda activate /glade/work/ksha/miniconda3/envs/credit-derecho
+# conda conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
+# Export environment variables
+export LSCRATCH=/glade/derecho/scratch/ksha/
+export LOGLEVEL=INFO
+export NCCL_DEBUG=INFO
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export NCCL_SOCKET_IFNAME=hsn
+export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
+export MPICH_OFI_NIC_POLICY=GPU
+export MPICH_GPU_SUPPORT_ENABLED=1
+export NCCL_IB_DISABLE=1
+export NCCL_CROSS_NIC=1
+export NCCL_NCHANNELS_PER_NET_PEER=4
+export MPICH_RDMA_ENABLED_CUDA=1
+export NCCL_NET="AWS Libfabric"
+export NCCL_NET_GDR_LEVEL=PBH
+export FI_CXI_DISABLE_HOST_REGISTER=1
+export FI_CXI_OPTIMIZED_MRS=false
+export FI_MR_CACHE_MONITOR=userfaultfd
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+# logger.info the results
+echo "Number of nodes: 8"
+echo "Number of GPUs per node: 4"
+echo "Total number of GPUs: 32"
+# Log in to WandB if needed
+# wandb login 02d2b1af00b5df901cb2bee071872de774781520
+# Launch MPIs
+nodes=( $( cat $PBS_NODEFILE ) )
+echo nodes: $nodes
+# Find headnode's IP:
+head_node=${nodes[0]}
+head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
+MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-credit/applications/train_multistep.py -c /glade/work/ksha/CREDIT_runs/fuxi_6h/model_multi.yml --backend nccl

launch_predict.sh ADDED Viewed

	@@ -0,0 +1,45 @@

+#!/bin/bash
+#PBS -A NCIS0010
+#PBS -N fx6h_pred
+#PBS -l walltime=12:00:00
+#PBS -l select=8:ncpus=64:ngpus=4
+#PBS -q main
+#PBS -j oe
+#PBS -k eod
+#PBS -r n
+# Load modules
+module purge
+module load nvhpc cuda cray-mpich conda
+conda activate /glade/work/ksha/miniconda3/envs/credit
+# Get a list of allocated nodes
+nodes=( $( cat $PBS_NODEFILE ) )
+head_node=${nodes[0]}
+head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+export LSCRATCH=/glade/derecho/scratch/ksha/
+export LOGLEVEL=INFO
+#export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
+export LD_LIBRARY_PATH=$NCCL_HOME/lib:$NCCL_HOME/plugin/lib:$LD_LIBRARY_PATH
+export NCCL_NCHANNELS_PER_NET_PEER=4
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_OFI_NIC_POLICY=GPU
+export MPICH_RDMA_ENABLED_CUDA=1
+export NCCL_DISABLE_IB=1
+export NCCL_CROSS_NIC=1
+export FI_CXI_DISABLE_HOST_REGISTER=1
+export FI_CXI_OPTIMIZED_MRS=false
+# Print the results
+echo "Number of nodes: 8"
+echo "Number of GPUs per node: 4"
+echo "Total number of GPUs: 32"
+# Log in to WandB if needed
+# wandb login 02d2b1af00b5df901cb2bee071872de774781520
+# Launch MPIs
+mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-credit/applications/rollout_to_netcdf.py -c /glade/work/ksha/CREDIT_runs/fuxi_6h/model_predict.yml

launch_rollout.sh ADDED Viewed

	@@ -0,0 +1,45 @@

+#!/bin/bash
+#PBS -A NAML0001
+#PBS -N fx6h_roll
+#PBS -l walltime=12:00:00
+#PBS -l select=8:ncpus=64:ngpus=4
+#PBS -q main
+#PBS -j oe
+#PBS -k eod
+#PBS -r n
+# Load modules
+module purge
+module load nvhpc cuda cray-mpich conda
+conda activate /glade/work/ksha/miniconda3/envs/credit
+# Get a list of allocated nodes
+nodes=( $( cat $PBS_NODEFILE ) )
+head_node=${nodes[0]}
+head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
+# Export environment variables
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+export LSCRATCH=/glade/derecho/scratch/ksha/
+export LOGLEVEL=INFO
+#export NCCL_DEBUG=INFO
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
+export LD_LIBRARY_PATH=$NCCL_HOME/lib:$NCCL_HOME/plugin/lib:$LD_LIBRARY_PATH
+export NCCL_NCHANNELS_PER_NET_PEER=4
+export MPICH_GPU_SUPPORT_ENABLED=1
+export MPICH_OFI_NIC_POLICY=GPU
+export MPICH_RDMA_ENABLED_CUDA=1
+export NCCL_DISABLE_IB=1
+export NCCL_CROSS_NIC=1
+export FI_CXI_DISABLE_HOST_REGISTER=1
+export FI_CXI_OPTIMIZED_MRS=false
+# Print the results
+echo "Number of nodes: 8"
+echo "Number of GPUs per node: 4"
+echo "Total number of GPUs: 32"
+# Log in to WandB if needed
+# wandb login 02d2b1af00b5df901cb2bee071872de774781520
+# Launch MPIs
+mpiexec -n 8 --ppn 1 --cpu-bind none torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip /glade/u/home/ksha/miles-credit/applications/rollout_metrics.py -c /glade/work/ksha/CREDIT_runs/fuxi_6h/model_predict.yml

launch_single.sh ADDED Viewed

	@@ -0,0 +1,46 @@

+#!/bin/bash
+#PBS -A NCIS0010
+#PBS -N fuxi_6h
+#PBS -l walltime=12:00:00
+#PBS -l select=8:ncpus=64:ngpus=4
+#PBS -q main
+#PBS -j oe
+#PBS -k eod
+#PBS -r n
+# Load modules
+module purge
+module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
+conda activate /glade/work/ksha/miniconda3/envs/credit-derecho
+# conda conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
+# Export environment variables
+export LSCRATCH=/glade/derecho/scratch/ksha/
+export LOGLEVEL=INFO
+export NCCL_DEBUG=INFO
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export NCCL_SOCKET_IFNAME=hsn
+export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
+export MPICH_OFI_NIC_POLICY=GPU
+export MPICH_GPU_SUPPORT_ENABLED=1
+export NCCL_IB_DISABLE=1
+export NCCL_CROSS_NIC=1
+export NCCL_NCHANNELS_PER_NET_PEER=4
+export MPICH_RDMA_ENABLED_CUDA=1
+export NCCL_NET="AWS Libfabric"
+export NCCL_NET_GDR_LEVEL=PBH
+export FI_CXI_DISABLE_HOST_REGISTER=1
+export FI_CXI_OPTIMIZED_MRS=false
+export FI_MR_CACHE_MONITOR=userfaultfd
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+# logger.info the results
+echo "Number of nodes: 8"
+echo "Number of GPUs per node: 4"
+echo "Total number of GPUs: 32"
+# Log in to WandB if needed
+# wandb login 02d2b1af00b5df901cb2bee071872de774781520
+# Launch MPIs
+nodes=( $( cat $PBS_NODEFILE ) )
+echo nodes: $nodes
+# Find headnode's IP:
+head_node=${nodes[0]}
+head_node_ip=$(ssh $head_node hostname -i | awk '{print $1}')
+MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-credit/applications/train.py -c /glade/work/ksha/CREDIT_runs/fuxi_6h/model_single.yml --backend nccl

model.yml ADDED Viewed

File without changes

model_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3137668291b82d20baf4b412769f6782a62db0a75cdcd213c7f03e02fba027ab
+size 1044714782

model_multi.yml ADDED Viewed

	@@ -0,0 +1,224 @@

+# --------------------------------------------------------------------------------------------------------------------- #
+# This yaml file implements 6 hourly FuXi on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
+# the FuXi architecture has been modified to reduce the overall model size
+# The model is trained on 6-hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask inputs
+# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
+#
+# Yingkai Sha
+# [email protected]
+# --------------------------------------------------------------------------------------------------------------------- #
+save_loc: '/glade/work/ksha/CREDIT_runs/fuxi_6h/'
+seed: 1000
+data:
+    # upper-air variables (same file glob as the surface variables below)
+    variables: ['U','V','T','Q']
+    save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
+    # surface variables
+    surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y_TOTAL*'
+    # dynamic forcing variables
+    dynamic_forcing_variables: ['tsi']
+    save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
+    # diagnostic variables
+    # diagnostic_variables: ['V500','U500','T500','Z500','Q500']
+    # save_loc_diagnostic: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # static variables
+    static_variables: ['Z_GDS4_SFC','LSM']
+    save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
+    # mean / std path
+    mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
+    std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
+    # train / validation split
+    train_years: [1979, 2018]
+    valid_years: [2018, 2019]
+    # data workflow
+    scaler_type: 'std_new'
+    # number of input states
+    # FuXi has 2 input states
+    history_len: 2
+    valid_history_len: 2
+    # number of forecast steps to compute loss
+    # 0 for single step training / validation
+    # larger than 0 for multi-step training / validation
+    forecast_len: 7
+    valid_forecast_len: 7
+    # one_shot: True --> compute loss on the last forecast step only
+    # one_shot: False --> compute loss on all forecast steps
+    one_shot: True
+    # 1 for hourly model (6 here, matching the 6-hourly data files above)
+    lead_time_periods: 6
+    # do not use skip_period
+    skip_periods: null
+    # compatible with the old 'std'
+    static_first: True
+trainer:
+    type: multi-step # <---------- change to your type
+    mode: fsdp
+    cpu_offload: False
+    activation_checkpoint: True
+    load_weights: True
+    load_optimizer: True
+    load_scaler: True
+    load_sheduler: True # NOTE(review): key is spelled "sheduler"; confirm the trainer reads this spelling and not "load_scheduler"
+    skip_validation: False
+    update_learning_rate: False
+    save_backup_weights: True
+    save_best_weights: True
+    learning_rate: 1.0e-06 # <-- change to your lr
+    weight_decay: 0
+    train_batch_size: 1
+    valid_batch_size: 1
+    batches_per_epoch: 759 # full epoch: 1772
+    valid_batches_per_epoch: 0
+    stopping_patience: 50
+    start_epoch: 0
+    num_epoch: 1
+    # False when switching from single-step to multi-step
+    reload_epoch: True
+    epochs: &epochs 20
+    use_scheduler: True
+    scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs,  'last_epoch': -1}
+    # Automatic Mixed Precision: False
+    amp: False
+    # rescale loss as loss = loss / grad_accum_every
+    grad_accum_every: 1
+    # gradient clipping
+    grad_max_norm: 1.0
+    # number of workers
+    thread_workers: 4
+    valid_thread_workers: 0
+model:
+    type: "fuxi"
+    frames: 2               # number of input states
+    image_height: 640       # number of latitude grids
+    image_width: 1280       # number of longitude grids
+    levels: 16              # number of upper-air variable levels
+    channels: 4             # upper-air variable channels
+    surface_channels: 7     # surface variable channels
+    input_only_channels: 3  # dynamic forcing, forcing, static channels
+    output_only_channels: 0 # diagnostic variable channels
+    # patchify layer
+    patch_height: 4         # number of latitude grids in each 3D patch
+    patch_width: 4          # number of longitude grids in each 3D patch
+    frame_patch_size: 2     # number of input states in each 3D patch
+    # hidden layers
+    dim: 1024               # dimension (default: 1536)
+    num_groups: 32          # number of groups (default: 32)
+    num_heads: 8            # number of heads (default: 8)
+    window_size: 7          # window size (default: 7)
+    depth: 16               # number of swin transformers (default: 48)
+    # use spectral norm
+    use_spectral_norm: True
+    # ============================================================== #
+    # New
+    # use interpolation to match the output size
+    interp: True
+    # map boundary padding
+    padding_conf:
+        activate: True
+        mode: earth
+        pad_lat: 80
+        pad_lon: 80
+    post_conf:
+        activate: True
+        tracer_fixer:
+            activate: True
+            denorm: True
+            tracer_name: ['Q', 'Q500']
+            tracer_thres: [1e-8, 1e-8]
+loss:
+    # the main training loss
+    training_loss: "mse"
+    # power loss (x), spectral_loss (x)
+    use_power_loss: False
+    use_spectral_loss: False
+    # use latitude weighting
+    use_latitude_weights: True
+    latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
+    # turn-off variable weighting
+    use_variable_weights: False
+    # variable_weights:
+    #     U: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     V: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     T: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     Q: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     SP: 0.1
+    #     t2m: 1.0
+    #     V500: 0.1
+    #     U500: 0.1
+    #     T500: 0.1
+    #     Z500: 0.1
+    #     Q500: 0.1
+predict:
+    forecasts:
+        type: "custom"       # keep it as "custom"
+        start_year: 2020     # year of the first initialization (where rollout will start)
+        start_month: 1       # month of the first initialization
+        start_day: 1         # day of the first initialization
+        start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
+        duration: 30         # number of days to initialize, starting from the (year, mon, day) above
+                             # duration should be divisible by the number of GPUs
+                             # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
+        days: 2              # forecast lead time as days (1 means 24-hour forecast)
+    save_forecast: '/glade/derecho/scratch/ksha/CREDIT/fuxi_6h/'
+    save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    # turn-off low-pass filter
+    use_laplace_filter: False
+    # deprecated
+    # save_format: "nc"
+pbs: #derecho
+    conda: "/glade/work/ksha/miniconda3/envs/credit"
+    project: "NAML0001"
+    job_name: "fuxi_6h"
+    walltime: "12:00:00"
+    nodes: 8
+    ncpus: 64
+    ngpus: 4
+    mem: '480GB'
+    queue: 'main'

model_predict.yml ADDED Viewed

	@@ -0,0 +1,136 @@

+# --------------------------------------------------------------------------------------------------------------------- #
+# This yaml file implements 6 hourly FuXi on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
+# the FuXi architecture has been modified to reduce the overall model size
+# The model is trained on 6-hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask inputs
+# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
+#
+# Yingkai Sha
+# [email protected]
+# --------------------------------------------------------------------------------------------------------------------- #
+save_loc: '/glade/work/ksha/CREDIT_runs/fuxi_6h/'
+seed: 1000
+data:
+    # upper-air variables
+    variables: ['U','V','T','Q']
+    save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y*'
+    # surface variables
+    surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_mlevel_arXiv/SixHourly_y*'
+    # dynamic forcing variables
+    dynamic_forcing_variables: ['tsi']
+    save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
+    # static variables
+    static_variables: ['Z_GDS4_SFC','LSM']
+    save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
+    # mean / std path
+    mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
+    std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
+    # train / validation split
+    train_years: [1979, 2018]
+    valid_years: [2018, 2019]
+    # data workflow
+    scaler_type: 'std_new'
+    # number of input states
+    # FuXi has 2 input states
+    history_len: 2
+    valid_history_len: 2
+    # number of forecast steps to compute loss
+    # 0 for single step training / validation
+    # larger than 0 for multi-step training / validation
+    forecast_len: 0
+    valid_forecast_len: 0
+    # 1 for hourly model
+    lead_time_periods: 6
+    # do not use skip_period
+    skip_periods: null
+    # compatible with the old 'std'
+    static_first: True
+trainer:
+    type: standard
+    mode: fsdp
+model:
+    type: "fuxi"
+    frames: 2               # number of input states
+    image_height: 640       # number of latitude grids
+    image_width: 1280       # number of longitude grids
+    levels: 16              # number of upper-air variable levels
+    channels: 4             # upper-air variable channels
+    surface_channels: 7     # surface variable channels
+    input_only_channels: 3  # dynamic forcing, forcing, static channels
+    output_only_channels: 0 # diagnostic variable channels
+    # patchify layer
+    patch_height: 4         # number of latitude grids in each 3D patch
+    patch_width: 4          # number of longitude grids in each 3D patch
+    frame_patch_size: 2     # number of input states in each 3D patch
+    # hidden layers
+    dim: 1024               # dimension (default: 1536)
+    num_groups: 32          # number of groups (default: 32)
+    num_heads: 8            # number of heads (default: 8)
+    window_size: 7          # window size (default: 7)
+    depth: 16               # number of swin transformers (default: 48)
+    # use spectral norm
+    use_spectral_norm: True
+    # ============================================================== #
+    # New
+    # use interpolation to match the output size
+    interp: True
+    # map boundary padding
+    padding_conf:
+        activate: True
+        mode: earth
+        pad_lat: 80
+        pad_lon: 80
+    post_conf:
+        activate: True
+        tracer_fixer:
+            activate: True
+            denorm: True
+            tracer_name: ['Q', 'Q500']
+            tracer_thres: [1e-8, 1e-8]
+loss:
+    use_latitude_weights: True
+    latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
+predict:
+    forecasts:
+        type: "custom"       # keep it as "custom"
+        start_year: 2021     # year of the first initialization (where rollout will start)
+        start_month: 12      # month of the first initialization
+        start_day: 31        # day of the first initialization
+        start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
+        duration: 384        # number of days to initialize, starting from the (year, mon, day) above
+                             # duration should be divisible by the number of GPUs
+                             # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
+        days: 10             # forecast lead time as days (1 means 24-hour forecast)
+    metadata: '/glade/u/home/ksha/miles-credit/credit/metadata/era5.yaml'
+    save_forecast: '/glade/derecho/scratch/ksha/CREDIT/RAW_OUTPUT/fuxi_6h_test/'
+    # turn-off low-pass filter
+    use_laplace_filter: False

model_predict_cpu.yml ADDED Viewed

	@@ -0,0 +1,116 @@

+# --------------------------------------------------------------------------------------------------------------------- #
+# This yaml file implements 6 hourly FuXi on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
+# the FuXi architecture has been modified to reduce the overall model size
+# The model is trained on 6-hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask inputs
+# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
+#
+# Yingkai Sha
+# [email protected]
+# --------------------------------------------------------------------------------------------------------------------- #
+save_loc: '/glade/work/ksha/CREDIT_runs/fuxi_6h/'
+seed: 1000
+data:
+    # upper-air variables
+    variables: ['U','V','T','Q']
+    save_loc: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # surface variables
+    surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    save_loc_surface: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # dynamic forcing variables
+    dynamic_forcing_variables: ['tsi']
+    save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
+    # static variables
+    static_variables: ['Z_GDS4_SFC','LSM']
+    save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
+    # mean / std path
+    mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
+    std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
+    # train / validation split
+    train_years: [1979, 2018]
+    valid_years: [2018, 2019]
+    # data workflow
+    scaler_type: 'std_new'
+    # number of input states
+    # FuXi has 2 input states
+    history_len: 2
+    valid_history_len: 2
+    # number of forecast steps to compute loss
+    # 0 for single step training / validation
+    # larger than 0 for multi-step training / validation
+    forecast_len: 0
+    valid_forecast_len: 0
+    # 1 for hourly model
+    lead_time_periods: 6
+    # do not use skip_period
+    skip_periods: null
+    # compatible with the old 'std'
+    static_first: True
+trainer:
+    type: standard
+    mode: none
+model:
+    type: "fuxi"
+    frames: 2               # number of input states
+    image_height: 640       # number of latitude grids
+    image_width: 1280       # number of longitude grids
+    levels: 16              # number of upper-air variable levels
+    channels: 4             # upper-air variable channels
+    surface_channels: 7     # surface variable channels
+    input_only_channels: 3  # dynamic forcing, forcing, static channels
+    output_only_channels: 0 # diagnostic variable channels
+    # patchify layer
+    patch_height: 4         # number of latitude grids in each 3D patch
+    patch_width: 4          # number of longitude grids in each 3D patch
+    frame_patch_size: 2     # number of input states in each 3D patch
+    # hidden layers
+    dim: 1024               # dimension (default: 1536)
+    num_groups: 32          # number of groups (default: 32)
+    num_heads: 8            # number of heads (default: 8)
+    window_size: 7          # window size (default: 7)
+    depth: 16               # number of swin transformers (default: 48)
+    # map boundary padding
+    pad_lon: 80             # number of grids to pad on 0 and 360 deg lon
+    pad_lat: 80             # number of grids to pad on -90 and 90 deg lat
+    # use spectral norm
+    use_spectral_norm: True
+loss:
+    use_latitude_weights: True
+    latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
+predict:
+    forecasts:
+        type: "custom"       # keep it as "custom"
+        start_year: 2020     # year of the first initialization (where rollout will start)
+        start_month: 1       # month of the first initialization
+        start_day: 1         # day of the first initialization
+        start_hours: [0,] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
+        duration: 1        # number of days to initialize, starting from the (year, mon, day) above
+                             # duration should be divisible by the number of GPUs
+                             # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
+        days: 1              # forecast lead time as days (1 means 24-hour forecast)
+    save_forecast: '/glade/derecho/scratch/ksha/CREDIT/RAW_OUTPUT/fuxi_6h_collins/'
+    save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    # turn off the low-pass filter
+    use_laplace_filter: False

model_single.yml ADDED Viewed

	@@ -0,0 +1,205 @@

+# --------------------------------------------------------------------------------------------------------------------- #
+# This yaml file implements 6 hourly FuXi on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
+# the FuXi architecture has been modified to reduce the overall model size
+# The model is trained on 6-hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask inputs
+# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
+#
+# Yingkai Sha
+# [email protected]
+# --------------------------------------------------------------------------------------------------------------------- #
+save_loc: '/glade/work/ksha/CREDIT_runs/fuxi_6h/'
+seed: 1000
+data:
+    # upper-air variables
+    variables: ['U','V','T','Q']
+    save_loc: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # surface variables
+    surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    save_loc_surface: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # dynamic forcing variables
+    dynamic_forcing_variables: ['tsi']
+    save_loc_dynamic_forcing: '/glade/derecho/scratch/dgagne/credit_solar_6h_0.25deg/*.nc'
+    # diagnostic variables
+    # diagnostic_variables: ['V500','U500','T500','Z500','Q500']
+    # save_loc_diagnostic: '/glade/derecho/scratch/wchapman/SixHourly_y_TOTAL*'
+    # static variables
+    static_variables: ['Z_GDS4_SFC','LSM']
+    save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
+    # mean / std path
+    mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
+    std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
+    # train / validation split
+    train_years: [1979, 2018]
+    valid_years: [2018, 2019]
+    # data workflow
+    scaler_type: 'std_new'
+    # number of input states
+    # FuXi has 2 input states
+    history_len: 2
+    valid_history_len: 2
+    # number of forecast steps to compute loss
+    # 0 for single step training / validation
+    # larger than 0 for multi-step training / validation
+    forecast_len: 0
+    valid_forecast_len: 0
+    # one_shot: True --> compute loss on the last forecast step only
+    # one_shot: False --> compute loss on all forecast steps
+    one_shot: True
+    # 1 for an hourly model; 6 for a 6-hourly model (as configured here)
+    lead_time_periods: 6
+    # skip_periods is not used; keep it null
+    skip_periods: null
+    # compatible with the old 'std'
+    static_first: True
+trainer:
+    type: standard # <---------- change to your type
+    mode: fsdp
+    cpu_offload: False
+    activation_checkpoint: True
+    load_weights: True
+    load_optimizer: True
+    load_scaler: True
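+    load_sheduler: True  # NOTE(review): key is spelled "sheduler"; confirm the trainer reads this spelling rather than "load_scheduler"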
+    skip_validation: False
+    update_learning_rate: False
+    save_backup_weights: True
+    save_best_weights: True
+    learning_rate: 1.0e-03 # <-- change to your lr
+    weight_decay: 0
+    train_batch_size: 1
+    valid_batch_size: 1
+    batches_per_epoch: 0
+    valid_batches_per_epoch: 0
+    stopping_patience: 50
+    start_epoch: 0
+    num_epoch: 2
+    reload_epoch: True
+    epochs: &epochs 70
+    use_scheduler: True
+    scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs,  'last_epoch': -1}
+    # Automatic Mixed Precision: False
+    amp: False
+    # rescale loss as loss = loss / grad_accum_every
+    grad_accum_every: 1
+    # gradient clipping
+    grad_max_norm: 1.0
+    # number of workers
+    thread_workers: 4
+    valid_thread_workers: 0
+model:
+    type: "fuxi"
+    frames: 2               # number of input states
+    image_height: 640       # number of latitude grids
+    image_width: 1280       # number of longitude grids
+    levels: 16              # number of upper-air variable levels
+    channels: 4             # upper-air variable channels
+    surface_channels: 7     # surface variable channels
+    input_only_channels: 3  # dynamic forcing, forcing, static channels
+    output_only_channels: 0 # diagnostic variable channels
+    # patchify layer
+    patch_height: 4         # number of latitude grids in each 3D patch
+    patch_width: 4          # number of longitude grids in each 3D patch
+    frame_patch_size: 2     # number of input states in each 3D patch
+    # hidden layers
+    dim: 1024               # dimension (default: 1536)
+    num_groups: 32          # number of groups (default: 32)
+    num_heads: 8            # number of heads (default: 8)
+    window_size: 7          # window size (default: 7)
+    depth: 16               # number of swin transformers (default: 48)
+    # map boundary padding
+    pad_lon: 80             # number of grids to pad on 0 and 360 deg lon
+    pad_lat: 80             # number of grids to pad on -90 and 90 deg lat
+    # use spectral norm
+    use_spectral_norm: True
+loss:
+    # the main training loss
+    training_loss: "mse"
+    # power loss and spectral loss are both disabled
+    use_power_loss: False
+    use_spectral_loss: False
+    # use latitude weighting
+    use_latitude_weights: True
+    latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
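+    # turn off variable weighting
+    use_variable_weights: False
+    # NOTE(review): each commented-out U/V/T/Q list below has 15 entries, but model.levels is 16;
+    # presumably one weight per level is expected, so reconcile the lengths before re-enabling
+    # variable_weights: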
+    #     U: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     V: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     T: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     Q: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     SP: 0.1
+    #     t2m: 1.0
+    #     V500: 0.1
+    #     U500: 0.1
+    #     T500: 0.1
+    #     Z500: 0.1
+    #     Q500: 0.1
+predict:
+    forecasts:
+        type: "custom"       # keep it as "custom"
+        start_year: 2020     # year of the first initialization (where rollout will start)
+        start_month: 1       # month of the first initialization
+        start_day: 1         # day of the first initialization
+        start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
+        duration: 30         # number of days to initialize, starting from the (year, mon, day) above
+                             # duration should be divisible by the number of GPUs
+                             # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
+        days: 2              # forecast lead time as days (1 means 24-hour forecast)
+    save_forecast: '/glade/derecho/scratch/ksha/CREDIT/fuxi_6h/'
+    save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    # turn off the low-pass filter
+    use_laplace_filter: False
+    # deprecated
+    # save_format: "nc"
+pbs: #derecho
+    conda: "/glade/work/ksha/miniconda3/envs/credit"
+    project: "NAML0001"
+    job_name: "fuxi_6h"
+    walltime: "12:00:00"
+    nodes: 8
+    ncpus: 64
+    ngpus: 4
+    mem: '480GB'
+    queue: 'main'

model_single_cached.yml ADDED Viewed

	@@ -0,0 +1,200 @@

+# --------------------------------------------------------------------------------------------------------------------- #
+# This yaml file implements 6 hourly FuXi on NSF NCAR HPCs (casper.ucar.edu and derecho.hpc.ucar.edu)
+# the FuXi architecture has been modified to reduce the overall model size
+# The model is trained on 6-hourly model-level ERA5 data with top solar irradiance, geopotential, and land-sea mask inputs
+# Output variables: model level [U, V, T, Q], single level [SP, t2m], and 500 hPa [U, V, T, Z, Q]
+#
+# Yingkai Sha
+# [email protected]
+# --------------------------------------------------------------------------------------------------------------------- #
+save_loc: '/glade/work/ksha/CREDIT_runs/fuxi_6h/'
+seed: 1000
+data:
+    # upper-air variables
+    variables: ['U','V','T','Q']
+    save_loc: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
+    # surface variables
+    surface_variables: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    save_loc_surface: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
+    # dynamic forcing variables
+    dynamic_forcing_variables: ['tsi']
+    save_loc_dynamic_forcing: '/glade/derecho/scratch/ksha/CREDIT_data/arXiv_cached/cache_arXiv_6h_*'
+    # static variables
+    static_variables: ['Z_GDS4_SFC','LSM']
+    save_loc_static: '/glade/derecho/scratch/ksha/CREDIT_data/static_norm_old.nc'
+    # mean / std path
+    mean_path: '/glade/derecho/scratch/ksha/CREDIT_data/mean_6h_1979_2018_16lev_0.25deg.nc'
+    std_path: '/glade/derecho/scratch/ksha/CREDIT_data/std_residual_6h_1979_2018_16lev_0.25deg.nc'
+    # train / validation split
+    train_years: [1979, 2018]
+    valid_years: [2018, 2019]
+    # data workflow
+    scaler_type: 'std_cached'
+    # number of input states
+    # FuXi has 2 input states
+    history_len: 2
+    valid_history_len: 2
+    # number of forecast steps to compute loss
+    # 0 for single step training / validation
+    # larger than 0 for multi-step training / validation
+    forecast_len: 0
+    valid_forecast_len: 0
+    # one_shot: True --> compute loss on the last forecast step only
+    # one_shot: False --> compute loss on all forecast steps
+    one_shot: True
+    # 1 for an hourly model; 6 for a 6-hourly model (as configured here)
+    lead_time_periods: 6
+    # skip_periods is not used; keep it null
+    skip_periods: null
+    # compatible with the old 'std'
+    static_first: True
+trainer:
+    type: standard # <---------- change to your type
+    mode: fsdp
+    cpu_offload: False
+    activation_checkpoint: True
+    load_weights: True
+    load_optimizer: True
+    load_scaler: True
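+    load_sheduler: True  # NOTE(review): key is spelled "sheduler"; confirm the trainer reads this spelling rather than "load_scheduler"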
+    skip_validation: False
+    update_learning_rate: False
+    save_backup_weights: True
+    save_best_weights: True
+    learning_rate: 1.0e-03 # <-- change to your lr
+    weight_decay: 0
+    train_batch_size: 1
+    valid_batch_size: 1
+    batches_per_epoch: 0
+    valid_batches_per_epoch: 0
+    stopping_patience: 50
+    start_epoch: 0
+    #num_epoch: 5
+    reload_epoch: True
+    epochs: &epochs 70
+    use_scheduler: True
+    scheduler: {'scheduler_type': 'cosine-annealing', 'T_max': *epochs,  'last_epoch': -1}
+    # Automatic Mixed Precision: False
+    amp: False
+    # rescale loss as loss = loss / grad_accum_every
+    grad_accum_every: 1
+    # gradient clipping
+    grad_max_norm: 1.0
+    # number of workers
+    thread_workers: 4
+    valid_thread_workers: 0
+model:
+    type: "fuxi"
+    frames: 2               # number of input states
+    image_height: 640       # number of latitude grids
+    image_width: 1280       # number of longitude grids
+    levels: 16              # number of upper-air variable levels
+    channels: 4             # upper-air variable channels
+    surface_channels: 7     # surface variable channels
+    input_only_channels: 3  # dynamic forcing, forcing, static channels
+    output_only_channels: 0 # diagnostic variable channels
+    # patchify layer
+    patch_height: 4         # number of latitude grids in each 3D patch
+    patch_width: 4          # number of longitude grids in each 3D patch
+    frame_patch_size: 2     # number of input states in each 3D patch
+    # hidden layers
+    dim: 1024               # dimension (default: 1536)
+    num_groups: 32          # number of groups (default: 32)
+    num_heads: 8            # number of heads (default: 8)
+    window_size: 7          # window size (default: 7)
+    depth: 16               # number of swin transformers (default: 48)
+    # map boundary padding
+    pad_lon: 80             # number of grids to pad on 0 and 360 deg lon
+    pad_lat: 80             # number of grids to pad on -90 and 90 deg lat
+    # use spectral norm
+    use_spectral_norm: True
+loss:
+    # the main training loss
+    training_loss: "mse"
+    # power loss and spectral loss are both disabled
+    use_power_loss: False
+    use_spectral_loss: False
+    # use latitude weighting
+    use_latitude_weights: True
+    latitude_weights: "/glade/u/home/wchapman/MLWPS/DataLoader/LSM_static_variables_ERA5_zhght.nc"
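+    # turn off variable weighting
+    use_variable_weights: False
+    # NOTE(review): each commented-out U/V/T/Q list below has 15 entries, but model.levels is 16;
+    # presumably one weight per level is expected, so reconcile the lengths before re-enabling
+    # variable_weights: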
+    #     U: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     V: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     T: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     Q: [0.132, 0.123, 0.113, 0.104, 0.095, 0.085, 0.076, 0.067, 0.057, 0.048, 0.039, 0.029, 0.02 , 0.011, 0.005]
+    #     SP: 0.1
+    #     t2m: 1.0
+    #     V500: 0.1
+    #     U500: 0.1
+    #     T500: 0.1
+    #     Z500: 0.1
+    #     Q500: 0.1
+predict:
+    forecasts:
+        type: "custom"       # keep it as "custom"
+        start_year: 2020     # year of the first initialization (where rollout will start)
+        start_month: 1       # month of the first initialization
+        start_day: 1         # day of the first initialization
+        start_hours: [0, 12] # hour-of-day for each initialization, 0 for 00Z, 12 for 12Z
+        duration: 30         # number of days to initialize, starting from the (year, mon, day) above
+                             # duration should be divisible by the number of GPUs
+                             # (e.g., duration: 384 for 365-day rollout using 32 GPUs)
+        days: 2              # forecast lead time as days (1 means 24-hour forecast)
+    save_forecast: '/glade/derecho/scratch/ksha/CREDIT/fuxi_6h/'
+    save_vars: ['SP','t2m','V500','U500','T500','Z500','Q500']
+    # turn off the low-pass filter
+    use_laplace_filter: False
+    # deprecated
+    # save_format: "nc"
+pbs: #derecho
+    conda: "/glade/work/ksha/miniconda3/envs/credit"
+    project: "NAML0001"
+    job_name: "fuxi_6h"
+    walltime: "12:00:00"
+    nodes: 8
+    ncpus: 64
+    ngpus: 4
+    mem: '480GB'
+    queue: 'main'

optimizer_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ae6c78578cc62ec39b838f86701f8e22d5238d97a3b2fd16daa2513fdfeebb7
+size 1683978368