# RWKV v5 multi-size training experiment

**Note:** This project assumes you have the rwkv-infctx conda env setup

# Basic Setup

In [1]:
# First lets setup the various directories, and init the model
!mkdir -p ../../../../model/
!mkdir -p ../../../../datapath/
!mkdir -p ../../../../checkpoint/

In [2]:
DEEPSPEED_STRAT="deepspeed_stage_2_offload"
GPU_DEVICES="auto"
ENABLE_WANDB=True

EMBED_SCALE=0.01
EMBED_SCALE_LABEL=str(EMBED_SCALE).replace(".", "_")

EMBED_SIZE=2048

WANDB_PREFIX=f"[Multi-size] v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE}"
FILENAME_PREFIX=f"v5-L6+6-D{EMBED_SIZE}-E{EMBED_SCALE_LABEL}"

print("DEEPSPEED_STRAT:", DEEPSPEED_STRAT)
print("ENABLE_WANDB:", ENABLE_WANDB)
print("GPU_DEVICES:", GPU_DEVICES)

if ENABLE_WANDB:
    WANDB_MODE="online"
else:
    WANDB_MODE="disabled"

# Computing the notebook, and various paths
import os
NOTEBOOK_DIR=os.path.dirname(os.path.abspath("__file__"))
PROJECT_DIR=os.path.abspath(os.path.join(NOTEBOOK_DIR, "../../../../"))
TRAINER_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))
INFERENCE_DIR=os.path.abspath(os.path.join(PROJECT_DIR, "./RWKV-v5/"))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("INFERENCE_DIR:", INFERENCE_DIR)
print("TRAINER_DIR:", TRAINER_DIR)
print("PROJECT_DIR:", PROJECT_DIR)

DEEPSPEED_STRAT: deepspeed_stage_2_offload
ENABLE_WANDB: True
GPU_DEVICES: auto
NOTEBOOK_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train
INFERENCE_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
TRAINER_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5
PROJECT_DIR: /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer


In [3]:
# Get the init split model, and finetune from there
!cd "{PROJECT_DIR}/model/" && wget -nc "https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth"
!cd "{PROJECT_DIR}/model/" && wget -nc "https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth"

--2023-10-11 08:02:24--  https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2a.pth
Resolving huggingface.co (huggingface.co)... 18.154.227.87, 18.154.227.7, 18.154.227.69, ...
Connecting to huggingface.co (huggingface.co)|18.154.227.87|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/2f52085cee9c3db4bb079dc44edf50b0a19c170bd92128e918e6203efef83cea?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2a.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2a.pth%22%3B&Expires=1697270544&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU0NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4Yj

200 OK
Length: 1066536657 (1017M) [binary/octet-stream]
Saving to: ‚Äòv5-L6-D2048-E0_01-split-2a.pth‚Äô

          v5-L6-D20   0%[                    ]       0  --.-KB/s               

         v5-L6-D204   1%[                    ]  15.26M  42.9MB/s               

        v5-L6-D2048   3%[                    ]  30.52M  47.7MB/s               

       v5-L6-D2048-   4%[                    ]  45.26M  51.5MB/s               

      v5-L6-D2048-E   5%[>                   ]  59.20M  52.1MB/s               

     v5-L6-D2048-E0   6%[>                   ]  65.20M  48.8MB/s               

    v5-L6-D2048-E0_   7%[>                   ]  76.29M  44.4MB/s               

   v5-L6-D2048-E0_0   8%[>                   ]  91.03M  47.2MB/s               

  v5-L6-D2048-E0_01   9%[>                   ]  91.55M  43.0MB/s               

 v5-L6-D2048-E0_01-  10%[=>                  ] 106.81M  43.2MB/s               

v5-L6-D2048-E0_01-s  11%[=>                  ] 120.25M  43.9MB/s               

5-L6-D2048-E0_01-sp  12%[=>                  ] 122.07M  41.4MB/s               

-L6-D2048-E0_01-spl  13%[=>                  ] 136.81M  42.5MB/s    eta 21s    

L6-D2048-E0_01-spli  14%[=>                  ] 152.07M  42.8MB/s    eta 21s    

6-D2048-E0_01-split  15%[==>                 ] 152.72M  40.5MB/s    eta 21s    

-D2048-E0_01-split-  16%[==>                 ] 167.85M  41.5MB/s    eta 21s    

D2048-E0_01-split-2  18%[==>                 ] 183.10M  43.0MB/s    eta 19s    

2048-E0_01-split-2a  19%[==>                 ] 198.36M  43.4MB/s    eta 19s    

048-E0_01-split-2a.  20%[===>                ] 213.11M  44.1MB/s    eta 19s    

48-E0_01-split-2a.p  22%[===>                ] 228.36M  43.3MB/s    eta 19s    

8-E0_01-split-2a.pt  22%[===>                ] 228.87M  41.1MB/s    eta 18s    

-E0_01-split-2a.pth  24%[===>                ] 244.13M  41.0MB/s    eta 18s    

E0_01-split-2a.pth   25%[====>               ] 259.40M  42.4MB/s    eta 18s    

0_01-split-2a.pth    26%[====>               ] 272.83M  40.4MB/s    eta 18s    

_01-split-2a.pth     28%[====>               ] 289.40M  41.4MB/s    eta 18s    

01-split-2a.pth      28%[====>               ] 289.92M  37.9MB/s    eta 18s    

1-split-2a.pth       29%[====>               ] 304.66M  36.1MB/s    eta 19s    

-split-2a.pth        30%[=====>              ] 305.18M  33.4MB/s    eta 19s    

split-2a.pth         31%[=====>              ] 318.60M  33.2MB/s    eta 19s    

plit-2a.pth          31%[=====>              ] 320.29M  33.3MB/s    eta 19s    

lit-2a.pth           31%[=====>              ] 320.57M  30.7MB/s    eta 19s    

it-2a.pth            32%[=====>              ] 335.18M  30.1MB/s    eta 19s    

t-2a.pth             33%[=====>              ] 345.53M  31.2MB/s    eta 19s    

-2a.pth              34%[=====>              ] 350.82M  29.7MB/s    eta 19s    






































































































































2023-10-11 08:02:52 (36.4 MB/s) - ‚Äòv5-L6-D2048-E0_01-split-2a.pth‚Äô saved [1066536657/1066536657]



--2023-10-11 08:02:53--  https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/0600b94a58219f658326b4792ef5cd020e9d1a43/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-E0_01-split-2b.pth
Resolving huggingface.co (huggingface.co)... 18.154.227.67, 18.154.227.69, 18.154.227.7, ...
Connecting to huggingface.co (huggingface.co)|18.154.227.67|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/6b64a1018631b9ddd15a746002bab3eafe956dced78a91af7abcdadaae4a7b25?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5-L6-D2048-E0_01-split-2b.pth%3B+filename%3D%22v5-L6-D2048-E0_01-split-2b.pth%22%3B&Expires=1697270573&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI3MDU3M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4Yj

HTTP request sent, awaiting response... 

200 OK
Length: 1066536657 (1017M) [binary/octet-stream]
Saving to: ‚Äòv5-L6-D2048-E0_01-split-2b.pth‚Äô

          v5-L6-D20   0%[                    ]       0  --.-KB/s               

         v5-L6-D204   1%[                    ]  14.74M  67.8MB/s               

        v5-L6-D2048   2%[                    ]  28.69M  63.1MB/s               

       v5-L6-D2048-   3%[                    ]  30.52M  42.3MB/s               

      v5-L6-D2048-E   4%[                    ]  45.26M  45.2MB/s               

     v5-L6-D2048-E0   4%[                    ]  45.78M  37.9MB/s               

    v5-L6-D2048-E0_   6%[>                   ]  61.03M  41.2MB/s               

   v5-L6-D2048-E0_0   7%[>                   ]  75.78M  45.0MB/s               

  v5-L6-D2048-E0_01   8%[>                   ]  85.94M  45.6MB/s               

 v5-L6-D2048-E0_01-   9%[>                   ]  91.55M  40.8MB/s               

v5-L6-D2048-E0_01-s  10%[=>                  ] 106.81M  40.5MB/s               

5-L6-D2048-E0_01-sp  12%[=>                  ] 122.07M  40.2MB/s    eta 22s    

-L6-D2048-E0_01-spl  13%[=>                  ] 137.33M  41.7MB/s    eta 22s    

L6-D2048-E0_01-spli  14%[=>                  ] 152.07M  42.9MB/s    eta 22s    

6-D2048-E0_01-split  16%[==>                 ] 167.33M  43.6MB/s    eta 22s    

-D2048-E0_01-split-  17%[==>                 ] 181.32M  44.9MB/s    eta 19s    

D2048-E0_01-split-2  18%[==>                 ] 183.10M  41.7MB/s    eta 19s    

2048-E0_01-split-2b  19%[==>                 ] 196.53M  41.8MB/s    eta 19s    

048-E0_01-split-2b.  19%[==>                 ] 198.36M  39.0MB/s    eta 19s    

48-E0_01-split-2b.p  20%[===>                ] 213.11M  39.0MB/s    eta 20s    

8-E0_01-split-2b.pt  21%[===>                ] 220.29M  40.8MB/s    eta 20s    

-E0_01-split-2b.pth  22%[===>                ] 228.36M  39.9MB/s    eta 20s    

E0_01-split-2b.pth   24%[===>                ] 244.13M  40.3MB/s    eta 20s    

0_01-split-2b.pth    25%[====>               ] 259.40M  40.4MB/s    eta 18s    

_01-split-2b.pth     26%[====>               ] 274.14M  42.1MB/s    eta 18s    

01-split-2b.pth      27%[====>               ] 274.66M  38.5MB/s    eta 18s    

1-split-2b.pth       28%[====>               ] 289.92M  41.6MB/s    eta 18s    

-split-2b.pth        30%[=====>              ] 305.18M  41.6MB/s    eta 17s    

split-2b.pth         31%[=====>              ] 320.43M  40.9MB/s    eta 17s    

plit-2b.pth          32%[=====>              ] 335.18M  41.2MB/s    eta 17s    

lit-2b.pth           33%[=====>              ] 335.69M  38.5MB/s    eta 17s    

it-2b.pth            34%[=====>              ] 350.95M  38.6MB/s    eta 16s    






























































































































2023-10-11 08:03:19 (38.9 MB/s) - ‚Äòv5-L6-D2048-E0_01-split-2b.pth‚Äô saved [1066536657/1066536657]



In [4]:
# Lets preload the requried datasets
!cd "{TRAINER_DIR}" && \
    python3 preload_datapath.py "{NOTEBOOK_DIR}/enwiki-4k-part3.yaml"

Saving the dataset (0/2 shards):   0%|         | 0/27200 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   7%| | 2000/27200 [00:00<00:01, 16356.85 examp

Saving the dataset (0/2 shards):  15%|‚ñè| 4000/27200 [00:00<00:01, 17283.77 examp

Saving the dataset (0/2 shards):  22%|‚ñè| 6000/27200 [00:00<00:01, 17873.97 examp

Saving the dataset (0/2 shards):  29%|‚ñé| 8000/27200 [00:00<00:01, 18442.59 examp

Saving the dataset (0/2 shards):  40%|‚ñç| 11000/27200 [00:00<00:00, 19135.78 exam

Saving the dataset (0/2 shards):  50%|‚ñå| 13600/27200 [00:00<00:00, 19543.92 examSaving the dataset (1/2 shards):  50%|‚ñå| 13600/27200 [00:00<00:00, 19543.92 exam

Saving the dataset (1/2 shards):  65%|‚ñã| 17600/27200 [00:00<00:00, 20515.42 exam

Saving the dataset (1/2 shards):  79%|‚ñä| 21600/27200 [00:01<00:00, 21426.57 exam

Saving the dataset (1/2 shards):  94%|‚ñâ| 25600/27200 [00:01<00:00, 22078.81 exam

Saving the dataset (2/2 shards): 100%|‚ñà| 27200/27200 [00:01<00:00, 22078.81 examSaving the dataset (2/2 shards): 100%|‚ñà| 27200/27200 [00:01<00:00, 20603.99 exam
Saving the dataset (0/1 shards):   0%|           | 0/109 [00:00<?, ? examples/s]Saving the dataset (1/1 shards): 100%|‚ñà| 109/109 [00:00<00:00, 8117.24 examples/Saving the dataset (1/1 shards): 100%|‚ñà| 109/109 [00:00<00:00, 7809.82 examples/


## Enwiki Stage 3 : Split-Baseline-A training

In [5]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/enwiki-4k-part3.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - layer-expansion A3 (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/" \
        --model.load_model="../model/{FILENAME_PREFIX}-split-2a.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

[2023-10-11 08:03:33,838] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


  rank_zero_warn(


  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1933922385


[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.15.12
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20231011_080337-5696uouo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/5696uouo[0m


Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 278, in <module>
    cli_main()
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 253, in cli_main
    LightningCLI(
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 350, in __init__
    self.instantiate_classes()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 499, in instantiate_classes
    self.config_init = self.parser.instantiate_classes(self.config)
  File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py", line 139, in patched_instantiate_classes
    cfg = self._unpatched_instantiate_classes(cfg, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py", line 1130, in instantiate_classes
    cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_gro

[34m[1mwandb[0m: Waiting for W&B process to finish... [31m(failed 1).[0m Press Control-C to abort syncing.


[34m[1mwandb[0m: üöÄ View run [33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion A3 (train-ctx=4k, deepspeed_stage_2_offload)[0m at: [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/5696uouo[0m
[34m[1mwandb[0m: Ô∏è‚ö° View job at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v16[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20231011_080337-5696uouo/logs[0m


In [6]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-layer-expansion-a3/last.ckpt" "../model/{FILENAME_PREFIX}-layer-expansion-a3.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-layer-expansion-a3.pth"

[2023-10-11 08:03:49,278] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 651, in <module>
    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 542, in convert_zero_checkpoint_to_fp32_state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 516, in get_fp32_state_dict_from_zero_checkpoint
    raise ValueError(f"Unable to find 'latest' file at {latest_path}")
ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-a3/last.ckpt/latest


ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth': No such file or directory


In [7]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-layer-expansion-a3.pth" "cuda fp32"

[2023-10-11 08:03:54,934] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py", line 52, in <module>
    model = SimpleRWKV(MODEL_PATH, device=DEVICE)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1420, in __init__
    self.model = RWKV(**model_config)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 566, in __init__
    raise ValueError(f"load_model file '{load_model}' does not exist")
ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-a3.pth' does not exist


## Enwiki Stage 3 : Split-Baseline-B training

In [8]:
# Start the foundation model training
!cd "{TRAINER_DIR}" && \
    export WANDB_MODE="{WANDB_MODE}" && \
    python3 lightning_trainer.py fit \
        -c "{NOTEBOOK_DIR}/enwiki-4k-part3.yaml" \
        --trainer.logger.init_args.name="{WANDB_PREFIX} - layer-expansion B3 (train-ctx=4k, {DEEPSPEED_STRAT})" \
        --trainer.strategy="{DEEPSPEED_STRAT}" \
        --trainer.devices="{GPU_DEVICES}" \
        --trainer.callbacks.init_args.dirpath="../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/" \
        --model.load_model="../model/{FILENAME_PREFIX}-split-2b.pth" \
        --model.ctx_len=4096 \
        --model.bptt_learning_range=1

[2023-10-11 08:04:01,096] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'


  rank_zero_warn(


  rank_zero_warn(f"No seed found, seed set to {seed}")
Global seed set to 1732922148


[34m[1mwandb[0m: Currently logged in as: [33mpicocreator[0m ([33mrwkv-x-dev[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Tracking run with wandb version 0.15.12
[34m[1mwandb[0m: Run data is saved locally in [35m[1m./wandb/run-20231011_080403-88lcuk7j[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/88lcuk7j[0m


Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 278, in <module>
    cli_main()
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py", line 253, in cli_main
    LightningCLI(
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 350, in __init__
    self.instantiate_classes()
  File "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py", line 499, in instantiate_classes
    self.config_init = self.parser.instantiate_classes(self.config)
  File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py", line 139, in patched_instantiate_classes
    cfg = self._unpatched_instantiate_classes(cfg, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py", line 1130, in instantiate_classes
    cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_gro

[34m[1mwandb[0m: Waiting for W&B process to finish... [31m(failed 1).[0m Press Control-C to abort syncing.


[34m[1mwandb[0m: üöÄ View run [33m[Multi-size] v5-L6+6-D2048-E0.01 - layer-expansion B3 (train-ctx=4k, deepspeed_stage_2_offload)[0m at: [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/runs/88lcuk7j[0m
[34m[1mwandb[0m: Ô∏è‚ö° View job at [34m[4mhttps://wandb.ai/rwkv-x-dev/RWKV-5X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjk0OTk4MDcy/version_details/v16[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20231011_080403-88lcuk7j/logs[0m


In [9]:
# Lets export the model from the checkpoint
!cd "{TRAINER_DIR}" && \
    python3 export_checkpoint.py "../checkpoint/{FILENAME_PREFIX}-layer-expansion-b3/last.ckpt" "../model/{FILENAME_PREFIX}-layer-expansion-b3.pth" "bf16"
!cd "{TRAINER_DIR}" && ls -alh "../model/{FILENAME_PREFIX}-layer-expansion-b3.pth"

[2023-10-11 08:04:13,869] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 651, in <module>
    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 542, in convert_zero_checkpoint_to_fp32_state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py", line 516, in get_fp32_state_dict_from_zero_checkpoint
    raise ValueError(f"Unable to find 'latest' file at {latest_path}")
ValueError: Unable to find 'latest' file at ../checkpoint/v5-L6+6-D2048-E0_01-layer-expansion-b3/last.ckpt/latest


ls: cannot access '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth': No such file or directory


In [10]:
# # Lets do a quick dragon prompt validation
!cd "{INFERENCE_DIR}" && \
    python3 dragon_test.py "../model/{FILENAME_PREFIX}-layer-expansion-b3.pth" "cuda fp32"

[2023-10-11 08:04:19,430] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'
Traceback (most recent call last):
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py", line 52, in <module>
    model = SimpleRWKV(MODEL_PATH, device=DEVICE)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 1420, in __init__
    self.model = RWKV(**model_config)
  File "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py", line 566, in __init__
    raise ValueError(f"load_model file '{load_model}' does not exist")
ValueError: load_model file '../model/v5-L6+6-D2048-E0_01-layer-expansion-b3.pth' does not exist
