diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors index 8f02a6eaccbf3b8a9cf0781d1f5950c740f3eda0..5e73116caac7642d47b121275ee9b9c45ac41f88 100644 --- a/model-00001-of-00004.safetensors +++ b/model-00001-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a98b4e3194379ba7bec05ef5af0baa479391fb72c51b261749bdd1d251fb9b81 +oid sha256:759bfaadb94eed1a885167a727b0ee10a9bd210978edf2ee6f32b6440662c6af size 4874843752 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors index e9ca17f60dc466c8cf71e6e805e0f20322a7836e..ac739042b3129d43f8810540d142c7ae5ddf1331 100644 --- a/model-00002-of-00004.safetensors +++ b/model-00002-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab9e2130ff05c8efc6ce715675f96987ccecebb141684c969aa92ece6777278a +oid sha256:603ff69dbbb82330c2c44613eee9bc388d2d9ef10c0adce062ec4f72efb5a1c6 size 4932751008 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors index de48d0bc1f9deb9779835fa77ba42e0db7633d27..9ae1cdcc8496996b723300b987892d9ec24ea687 100644 --- a/model-00003-of-00004.safetensors +++ b/model-00003-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:181225151db82d7c2a4b0d6bf1b800f2aa0b9e19ebcbc3f666e96ff341cd3a0b +oid sha256:0b066c8dba9b337ae2000ebd03f8b87c12a15ed22da59744ad7441dc885ca6cb size 4330865200 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors index 883357330e36de7f061b1e654a52763541c8bb90..e55b95f267d7ee7197c503c4f06c0a95a5ad2f46 100644 --- a/model-00004-of-00004.safetensors +++ b/model-00004-of-00004.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9fd93165bb07b6dbb7f816180d4f65ad089d8c4efe19e5171a47c0c751f84b0 +oid sha256:b0f369f7c0efb4c8190836f344c4ae2030e0d24d6479abb6159167cf94c2cabb size 1087177856 diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log index 9423637605c34cb0bbc06c85cbfdff921fc39577..d57ff56f05236bfb949ea7a4324622871469c47c 100644 --- a/wandb/debug-internal.log +++ b/wandb/debug-internal.log @@ -371,3 +371,262 @@ 2024-11-13 19:11:47,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status 2024-11-13 19:11:47,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status 2024-11-13 19:11:51,230 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:11:56,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:01,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:02,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:02,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:06,021 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:07,202 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:12,203 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:17,006 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:12:17,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:17,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:17,991 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:23,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:28,029 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:28,901 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:32,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:32,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:34,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:39,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:44,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:47,008 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:12:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:47,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:48,871 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:12:48,872 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:12:48,873 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:12:48,873 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:12:49,037 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:12:49,401 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:50,038 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:50,481 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:12:50,482 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:12:50,483 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:12:50,484 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:12:51,038 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:51,039 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:12:52,039 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:54,485 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:59,486 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:02,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:05,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:10,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:14,047 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:13:15,403 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:17,011 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:13:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:17,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:21,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:26,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:31,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:32,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:36,055 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:13:36,280 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:41,280 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:46,281 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:47,014 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:13:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:47,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:52,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:57,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:00,064 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:14:02,559 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:02,854 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:02,854 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:02,856 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:08,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:13,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:17,017 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:14:17,855 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:17,855 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:17,895 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:19,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:22,072 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:14:24,329 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:29,330 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:32,854 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:32,855 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:32,855 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:35,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:40,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:45,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:47,019 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:14:48,157 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:48,157 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:48,415 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:50,256 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:56,051 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:01,051 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:02,910 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:02,910 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:03,032 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:06,239 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:10,089 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:11,478 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:16,478 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:17,022 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:15:18,866 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:18,866 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:19,390 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:22,355 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:27,579 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:28,095 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:30,096 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:32,097 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:32,786 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:32,973 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:32,973 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:33,126 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:34,098 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:36,099 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:37,917 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:38,100 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:40,101 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:43,713 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:47,025 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:15:49,026 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:49,048 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:49,666 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:49,666 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:54,756 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:59,756 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:03,643 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:03,643 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:04,015 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:05,465 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:10,466 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:15,937 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:16,113 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:16:17,030 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:16:18,245 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:18,245 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:18,549 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:21,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:26,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:31,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:33,245 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:33,246 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:33,549 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:37,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:42,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:47,030 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:16:48,032 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:48,251 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:48,251 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:49,121 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:53,293 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:58,293 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:03,294 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:05,806 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:05,806 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:05,941 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:08,940 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:13,941 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:17,033 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:17:19,035 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:20,127 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:20,128 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:20,171 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:24,246 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:30,141 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:30,209 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:35,210 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:35,928 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:35,928 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:36,794 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:40,795 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:45,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:47,036 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:17:51,038 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:51,987 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:51,987 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:51,988 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:54,152 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:56,164 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:56,204 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:59,166 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:01,167 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:01,358 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:03,168 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:05,169 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:07,169 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:07,346 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:12,347 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:13,267 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:13,476 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:13,476 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:17,039 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:18:18,041 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:23,042 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:28,199 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:29,298 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:30,038 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:30,039 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:34,183 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:39,184 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:43,601 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:43,602 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:44,627 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:44,740 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:47,042 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:18:50,044 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:53,037 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,037 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:18:53,039 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,039 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,040 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,040 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,041 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,041 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,186 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:53,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:18:54,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:55,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:55,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:55,922 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:55,922 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:57,159 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:02,160 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:07,160 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:10,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:10,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:10,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:13,077 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:17,045 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:19:19,046 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:24,047 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:25,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:25,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:25,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:29,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:34,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:39,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:40,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:40,922 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:40,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:45,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:47,048 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:19:51,050 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:55,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:55,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:55,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:56,077 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:20:01,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:20:06,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report diff --git a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log index 35d6657db780d3f22be35148f916832cb0b1cec8..bf9c60a6eae0c05205484a235c1453b6f281117f 100644 --- a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +++ b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log @@ -1423,3 +1423,2808 @@ I1113 19:09:38.768262 140440525453120 fsdp_utils.py:195] Optimizer state saved i + + + + 83%|████████▎ | 20/24 [09:46<01:43, 25.80s/it] +{'loss': 0.7268, 'grad_norm': 2.40625, 'learning_rate': 6.698729810778064e-08, 'epoch': 1.62} + + + + + + +100%|██████████| 24/24 [11:18<00:00, 23.55s/it]I1113 19:15:08.476735 140440525453120 fsdp_utils.py:89] Saving model to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +I1113 19:15:25.608172 140440525453120 fsdp_utils.py:91] Model saved to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103535 GiB | 103529 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103494 GiB | 103489 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103535 GiB | 103529 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103494 GiB | 103489 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103528 GiB | 103523 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103488 GiB | 103482 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456849 KiB | 18699 MiB | 121034 GiB | 121034 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 120993 GiB | 120993 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3300 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462277 | 462257 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373747 | 373738 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88530 | 88519 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:26,577] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103537 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103497 GiB | 103491 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103537 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103497 GiB | 103491 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103531 GiB | 103525 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103490 GiB | 103485 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121039 GiB | 121039 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 120998 GiB | 120998 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462287 | 462267 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373748 | 373739 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88539 | 88528 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,367] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103538 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103497 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103538 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103497 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103531 GiB | 103526 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103491 GiB | 103485 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121045 GiB | 121044 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121004 GiB | 121003 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462299 | 462279 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373750 | 373741 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88549 | 88538 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,447] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103538 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103498 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103538 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103498 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103532 GiB | 103526 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103491 GiB | 103486 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121051 GiB | 121050 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121010 GiB | 121009 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462310 | 462290 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373752 | 373743 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88558 | 88547 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,620] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103539 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103498 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103539 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103498 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103532 GiB | 103527 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103492 GiB | 103486 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121056 GiB | 121056 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121015 GiB | 121015 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462321 | 462301 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373754 | 373745 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88567 | 88556 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:28,980] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103539 GiB | 103534 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103499 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103539 GiB | 103534 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103499 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103533 GiB | 103527 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103492 GiB | 103487 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121062 GiB | 121062 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121021 GiB | 121021 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462332 | 462312 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373756 | 373747 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88576 | 88565 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:29,341] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103499 GiB | 103494 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103499 GiB | 103494 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103533 GiB | 103528 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103493 GiB | 103488 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121068 GiB | 121068 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121027 GiB | 121027 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462343 | 462323 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373758 | 373749 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88585 | 88574 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:29,692] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103534 GiB | 103529 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103493 GiB | 103488 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121074 GiB | 121073 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121033 GiB | 121032 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462354 | 462334 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373760 | 373751 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88594 | 88583 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,044] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103541 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103541 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103534 GiB | 103529 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103494 GiB | 103489 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121080 GiB | 121079 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121039 GiB | 121038 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1051 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462365 | 462345 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373762 | 373753 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88603 | 88592 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,425] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103542 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103501 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103542 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103501 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103535 GiB | 103530 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103495 GiB | 103489 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121085 GiB | 121085 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121044 GiB | 121044 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462376 | 462356 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373764 | 373755 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88612 | 88601 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:30,815] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103542 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103502 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103542 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103502 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103536 GiB | 103530 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103495 GiB | 103490 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121091 GiB | 121091 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121050 GiB | 121050 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462387 | 462367 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373766 | 373757 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88621 | 88610 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:31,213] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103543 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103502 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103543 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103502 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103536 GiB | 103531 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103496 GiB | 103490 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121097 GiB | 121096 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121056 GiB | 121055 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462398 | 462378 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373768 | 373759 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88630 | 88619 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:31,603] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103543 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103503 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103543 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103503 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103537 GiB | 103531 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103496 GiB | 103491 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121103 GiB | 121102 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121062 GiB | 121061 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462409 | 462389 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373770 | 373761 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88639 | 88628 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,003] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103544 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103503 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103544 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103503 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103537 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103497 GiB | 103491 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121108 GiB | 121108 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121067 GiB | 121067 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462420 | 462400 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373772 | 373763 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88648 | 88637 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,396] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103544 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103504 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103544 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103504 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103538 GiB | 103532 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103497 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121114 GiB | 121114 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121073 GiB | 121073 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462431 | 462411 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373774 | 373765 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88657 | 88646 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:32,785] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103545 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103504 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103545 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103504 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103538 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103498 GiB | 103492 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121120 GiB | 121120 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121079 GiB | 121079 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462442 | 462422 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373776 | 373767 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88666 | 88655 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,184] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103545 GiB | 103540 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103505 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103545 GiB | 103540 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103505 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103539 GiB | 103533 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103498 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121126 GiB | 121125 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121085 GiB | 121084 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462453 | 462433 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373778 | 373769 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88675 | 88664 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,585] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103546 GiB | 103541 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103505 GiB | 103500 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103546 GiB | 103541 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103505 GiB | 103500 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103539 GiB | 103534 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103499 GiB | 103493 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121132 GiB | 121131 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121091 GiB | 121090 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462464 | 462444 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373780 | 373771 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88684 | 88673 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:33,976] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103546 GiB | 103541 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103506 GiB | 103501 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103546 GiB | 103541 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103506 GiB | 103501 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103499 GiB | 103494 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121137 GiB | 121137 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121096 GiB | 121096 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462475 | 462455 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373782 | 373773 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88693 | 88682 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:34,374] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103547 GiB | 103542 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103506 GiB | 103501 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103547 GiB | 103542 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103506 GiB | 103501 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103540 GiB | 103535 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121143 GiB | 121143 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121102 GiB | 121102 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462486 | 462466 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373784 | 373775 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88702 | 88691 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:34,773] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103548 GiB | 103542 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103507 GiB | 103502 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103548 GiB | 103542 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103507 GiB | 103502 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103541 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103500 GiB | 103495 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121149 GiB | 121148 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121108 GiB | 121107 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462497 | 462477 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373786 | 373777 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88711 | 88700 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,164] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103548 GiB | 103543 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103508 GiB | 103502 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103548 GiB | 103543 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103508 GiB | 103502 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103542 GiB | 103536 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103501 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121155 GiB | 121154 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121114 GiB | 121113 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462508 | 462488 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373788 | 373779 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88720 | 88709 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,555] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103549 GiB | 103543 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103508 GiB | 103503 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103549 GiB | 103543 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103508 GiB | 103503 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103542 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103502 GiB | 103496 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121160 GiB | 121160 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121119 GiB | 121119 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462519 | 462499 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373790 | 373781 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88729 | 88718 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:35,944] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103549 GiB | 103544 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103509 GiB | 103503 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103549 GiB | 103544 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103509 GiB | 103503 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103543 GiB | 103537 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103502 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121166 GiB | 121166 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121125 GiB | 121125 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462530 | 462510 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373792 | 373783 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88738 | 88727 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:36,348] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103550 GiB | 103544 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103509 GiB | 103504 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103550 GiB | 103544 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103509 GiB | 103504 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103543 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103503 GiB | 103497 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121172 GiB | 121172 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121131 GiB | 121131 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462541 | 462521 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373794 | 373785 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88747 | 88736 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:36,739] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103550 GiB | 103545 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103510 GiB | 103504 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103550 GiB | 103545 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103510 GiB | 103504 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103544 GiB | 103538 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103503 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121178 GiB | 121177 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121137 GiB | 121136 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462552 | 462532 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373796 | 373787 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88756 | 88745 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,138] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103551 GiB | 103545 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103510 GiB | 103505 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103551 GiB | 103545 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103510 GiB | 103505 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103544 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103504 GiB | 103498 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121184 GiB | 121183 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121143 GiB | 121142 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462563 | 462543 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373798 | 373789 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88765 | 88754 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,527] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103551 GiB | 103546 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103511 GiB | 103505 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103551 GiB | 103546 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103511 GiB | 103505 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103545 GiB | 103539 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103504 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121189 GiB | 121189 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121148 GiB | 121148 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462574 | 462554 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373800 | 373791 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88774 | 88763 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:37,916] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103552 GiB | 103546 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103511 GiB | 103506 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103552 GiB | 103546 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103511 GiB | 103506 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103545 GiB | 103540 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103505 GiB | 103499 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121195 GiB | 121195 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121154 GiB | 121154 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462585 | 462565 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373802 | 373793 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88783 | 88772 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:15:38,316] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:15:38,712] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(, {'preprocessing': 0.0342652339986671, 'preprocessing_with_comm': 0.0035613090003607795, : 0.10952290800014453, : 0.08624853599212656, : 11.892996056034463, : 12.136542246997124, 'state_converting': 12.138428608999675, : 12.178800212999704}) +I1113 19:15:38.712820 140440525453120 fsdp_utils.py:193] Saving Optimizer state to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +I1113 19:16:13.936019 140440525453120 fsdp_utils.py:195] Optimizer state saved in /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +I1113 19:17:29.207759 140440525453120 fsdp_utils.py:89] Saving model to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +I1113 19:17:51.939389 140440525453120 fsdp_utils.py:91] Model saved to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/pytorch_model_fsdp.bin +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103569 GiB | 103563 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103569 GiB | 103563 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103603 GiB | 103597 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103562 GiB | 103557 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456849 KiB | 18699 MiB | 121853 GiB | 121852 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121812 GiB | 121811 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3300 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462715 | 462695 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373920 | 373911 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88795 | 88784 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:52,938] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103612 GiB | 103606 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103571 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103612 GiB | 103606 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103571 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103605 GiB | 103600 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103565 GiB | 103559 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121857 GiB | 121857 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121816 GiB | 121816 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1051 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462725 | 462705 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373921 | 373912 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88804 | 88793 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:54,831] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103612 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103572 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103612 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103572 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103606 GiB | 103600 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103565 GiB | 103560 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121863 GiB | 121863 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121822 GiB | 121822 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462737 | 462717 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373923 | 373914 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88814 | 88803 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:54,919] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103613 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103572 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103613 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103572 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103606 GiB | 103601 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103566 GiB | 103560 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121869 GiB | 121868 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121828 GiB | 121827 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462748 | 462728 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373925 | 373916 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88823 | 88812 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,098] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103613 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103573 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103613 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103573 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103607 GiB | 103601 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103566 GiB | 103561 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121875 GiB | 121874 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121834 GiB | 121833 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 227 K | 227 K | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462760 | 462740 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373927 | 373918 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88833 | 88822 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,469] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103614 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103573 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103614 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103573 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103607 GiB | 103602 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103567 GiB | 103561 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121880 GiB | 121880 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121839 GiB | 121839 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462771 | 462751 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373929 | 373920 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88842 | 88831 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:55,835] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103614 GiB | 103609 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103574 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103614 GiB | 103609 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103574 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103608 GiB | 103602 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103567 GiB | 103562 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121886 GiB | 121886 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121845 GiB | 121845 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462783 | 462763 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373931 | 373922 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88852 | 88841 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,203] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103574 GiB | 103569 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103574 GiB | 103569 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103608 GiB | 103603 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103568 GiB | 103563 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121892 GiB | 121892 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121851 GiB | 121851 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462794 | 462774 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373933 | 373924 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88861 | 88850 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,563] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103568 GiB | 103563 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121898 GiB | 121897 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121857 GiB | 121856 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462805 | 462785 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373935 | 373926 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88870 | 88859 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:56,956] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103616 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103616 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103609 GiB | 103604 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103569 GiB | 103564 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121904 GiB | 121903 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121863 GiB | 121862 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462816 | 462796 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373937 | 373928 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88879 | 88868 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:57,353] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103617 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103576 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103617 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103576 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103610 GiB | 103605 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103570 GiB | 103564 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121909 GiB | 121909 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121868 GiB | 121868 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462827 | 462807 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373939 | 373930 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88888 | 88877 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:57,758] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103617 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103577 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103617 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103577 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103611 GiB | 103605 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103570 GiB | 103565 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121915 GiB | 121915 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121874 GiB | 121874 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462838 | 462818 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373941 | 373932 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88897 | 88886 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,154] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103618 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103577 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103618 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103577 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103611 GiB | 103606 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103571 GiB | 103565 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121921 GiB | 121920 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121880 GiB | 121879 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462849 | 462829 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373943 | 373934 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88906 | 88895 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,550] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103618 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103578 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103618 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103578 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103612 GiB | 103606 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103571 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121927 GiB | 121926 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121886 GiB | 121885 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462860 | 462840 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373945 | 373936 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88915 | 88904 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:58,957] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103619 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103578 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103619 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103578 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103612 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103572 GiB | 103566 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121932 GiB | 121932 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121891 GiB | 121891 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462871 | 462851 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373947 | 373938 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88924 | 88913 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:59,351] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103613 GiB | 103607 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103572 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121938 GiB | 121938 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121897 GiB | 121897 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462882 | 462862 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373949 | 373940 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88933 | 88922 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:17:59,756] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103613 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103573 GiB | 103567 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121944 GiB | 121944 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121903 GiB | 121903 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462893 | 462873 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373951 | 373942 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88942 | 88931 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,162] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103620 GiB | 103615 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103580 GiB | 103574 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103620 GiB | 103615 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103580 GiB | 103574 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103614 GiB | 103608 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103573 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121950 GiB | 121949 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121909 GiB | 121908 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462904 | 462884 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373953 | 373944 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88951 | 88940 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,558] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103621 GiB | 103615 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103580 GiB | 103575 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103621 GiB | 103615 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103580 GiB | 103575 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103614 GiB | 103609 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103574 GiB | 103568 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121956 GiB | 121955 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121915 GiB | 121914 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462915 | 462895 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373955 | 373946 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88960 | 88949 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:00,962] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103621 GiB | 103616 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103621 GiB | 103616 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103574 GiB | 103569 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121961 GiB | 121961 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121920 GiB | 121920 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462926 | 462906 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373957 | 373948 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88969 | 88958 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:01,357] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103622 GiB | 103617 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103622 GiB | 103617 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103581 GiB | 103576 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103615 GiB | 103610 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121967 GiB | 121967 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121926 GiB | 121926 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462937 | 462917 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373959 | 373950 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88978 | 88967 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:01,753] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103623 GiB | 103617 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103582 GiB | 103577 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103623 GiB | 103617 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103582 GiB | 103577 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103616 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103575 GiB | 103570 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121973 GiB | 121972 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121932 GiB | 121931 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462948 | 462928 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373961 | 373952 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88987 | 88976 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,168] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103623 GiB | 103618 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103583 GiB | 103577 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103623 GiB | 103618 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103583 GiB | 103577 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103617 GiB | 103611 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103576 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121979 GiB | 121978 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121938 GiB | 121937 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 227 K | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462959 | 462939 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373963 | 373954 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 88996 | 88985 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,564] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103624 GiB | 103618 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103583 GiB | 103578 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103624 GiB | 103618 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103583 GiB | 103578 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103617 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103577 GiB | 103571 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121984 GiB | 121984 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121943 GiB | 121943 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462970 | 462950 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373965 | 373956 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89005 | 88994 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:02,961] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103624 GiB | 103619 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103584 GiB | 103578 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103624 GiB | 103619 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103584 GiB | 103578 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103618 GiB | 103612 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103577 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121990 GiB | 121990 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121949 GiB | 121949 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462981 | 462961 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373967 | 373958 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89014 | 89003 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:03,356] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103625 GiB | 103619 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103584 GiB | 103579 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103625 GiB | 103619 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103584 GiB | 103579 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103618 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103578 GiB | 103572 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 121996 GiB | 121996 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121955 GiB | 121955 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 462992 | 462972 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373969 | 373960 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89023 | 89012 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:03,760] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103625 GiB | 103620 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103585 GiB | 103579 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103625 GiB | 103620 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103585 GiB | 103579 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103619 GiB | 103613 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103578 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122002 GiB | 122001 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121961 GiB | 121960 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 463004 | 462984 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373971 | 373962 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89033 | 89022 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,155] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103626 GiB | 103620 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103585 GiB | 103580 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103626 GiB | 103620 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103585 GiB | 103580 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103619 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103579 GiB | 103573 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122008 GiB | 122007 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121967 GiB | 121966 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 463015 | 462995 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373973 | 373964 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89042 | 89031 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,552] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] CUDA Memory Summary before calling to _allgather_orig_param_states |===========================================================================| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | PyTorch CUDA memory summary, device ID 0 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | CUDA OOMs: 0 | cudaMalloc retries: 2 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Allocated memory | 5523 MiB | 58937 MiB | 103626 GiB | 103621 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 58933 MiB | 103586 GiB | 103580 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Active memory | 5523 MiB | 59826 MiB | 103626 GiB | 103621 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5521 MiB | 59822 MiB | 103586 GiB | 103580 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Requested memory | 5511 MiB | 59812 MiB | 103620 GiB | 103614 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 5508 MiB | 59808 MiB | 103579 GiB | 103574 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 2 MiB | 7 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved memory | 71846 MiB | 73286 MiB | 140138 MiB | 68292 MiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 71836 MiB | 73278 MiB | 140124 MiB | 68288 MiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 10 MiB | 10 MiB | 14 MiB | 4 MiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable memory | 456811 KiB | 18699 MiB | 122013 GiB | 122013 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 453549 KiB | 18696 MiB | 121972 GiB | 121972 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 3262 KiB | 6 MiB | 40 GiB | 40 GiB | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Allocations | 482 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 225 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Active allocs | 487 | 710 | 1052 K | 1052 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 257 | 399 | 824 K | 824 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 230 | 424 | 228 K | 228 K | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | GPU reserved segments | 110 | 110 | 161 | 51 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 105 | 105 | 154 | 49 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 5 | 5 | 7 | 2 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Non-releasable allocs | 20 | 68 | 463026 | 463006 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from large pool | 9 | 55 | 373975 | 373966 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | from small pool | 11 | 17 | 89051 | 89040 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize allocations | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |---------------------------------------------------------------------------| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] | Oversize GPU segments | 0 | 0 | 0 | 0 | +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] |===========================================================================| +[rank0]:[2024-11-13 19:18:04,947] torch.distributed.fsdp._optim_utils: [WARNING] +[rank0]:[2024-11-13 19:18:05,345] torch.distributed.fsdp._debug_utils: [WARNING] FSDP _optim_state_dict() profiling: defaultdict(, {'preprocessing': 0.03401543000109086, 'preprocessing_with_comm': 0.005995718998747179, : 0.14788699399832694, : 0.08761637099632935, : 12.097734961020251, : 12.406498964997809, 'state_converting': 12.410317059999215, : 12.452914931000123}) +I1113 19:18:05.345734 140440525453120 fsdp_utils.py:193] Saving Optimizer state to /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +I1113 19:18:51.193263 140440525453120 fsdp_utils.py:195] Optimizer state saved in /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/checkpoint-24/optimizer.bin +{'train_runtime': 967.6106, 'train_samples_per_second': 13.133, 'train_steps_per_second': 0.025, 'train_loss': 0.760478196044763, 'epoch': 1.94} diff --git a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json index dff9a14a08aeec9ad7d20cb524323b28e66774de..e2a878aa7aecb57dd10d895a5fb335d54b1124d9 100644 --- a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +++ b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json @@ -1 +1 @@ -{"train/loss": 0.856, "train/grad_norm": 2.515625, "train/learning_rate": 3.086582838174551e-07, "train/epoch": 1.22, "train/global_step": 15, "_timestamp": 1731525055.2001407, "_runtime": 488.7689197063446, "_step": 6, "eval/loss": 0.7827465534210205, "eval/runtime": 1.6563, "eval/samples_per_second": 61.584, "eval/steps_per_second": 1.208} \ No newline at end of file +{"train/loss": 0.7268, "train/grad_norm": 2.40625, "train/learning_rate": 6.698729810778064e-08, "train/epoch": 1.94, "train/global_step": 24, "_timestamp": 1731525533.0362926, "_runtime": 966.6050715446472, "_step": 9, "eval/loss": 0.7821270227432251, "eval/runtime": 1.6364, "eval/samples_per_second": 62.331, "eval/steps_per_second": 1.222, "train_runtime": 967.6106, "train_samples_per_second": 13.133, "train_steps_per_second": 0.025, "total_flos": 9.400248301309133e+16, "train_loss": 0.760478196044763} \ No newline at end of file diff --git a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/logs/debug-internal.log b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/logs/debug-internal.log index 9423637605c34cb0bbc06c85cbfdff921fc39577..d57ff56f05236bfb949ea7a4324622871469c47c 100644 --- a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/logs/debug-internal.log +++ b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/logs/debug-internal.log @@ -371,3 +371,262 @@ 2024-11-13 19:11:47,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status 2024-11-13 19:11:47,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status 2024-11-13 19:11:51,230 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:11:56,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:01,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:02,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:02,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:06,021 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:07,202 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:12,203 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:17,006 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:12:17,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:17,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:17,991 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:23,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:28,029 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:28,901 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:32,847 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:32,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:34,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:39,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:44,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:47,008 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:12:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:12:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:12:47,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:12:48,871 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:12:48,872 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:12:48,873 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:12:48,873 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:12:49,037 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:12:49,401 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:50,038 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:50,481 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:12:50,482 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:12:50,483 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:12:50,484 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:12:51,038 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:51,039 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:12:52,039 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:12:54,485 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:12:59,486 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:02,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:02,848 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:05,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:10,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:14,047 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:13:15,403 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:17,011 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:13:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:17,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:17,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:21,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:26,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:31,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:32,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:32,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:36,055 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:13:36,280 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:41,280 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:46,281 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:47,014 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:13:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:13:47,848 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:13:47,849 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:13:52,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:13:57,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:00,064 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:14:02,559 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:02,854 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:02,854 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:02,856 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:08,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:13,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:17,017 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:14:17,855 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:17,855 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:17,895 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:19,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:22,072 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:14:24,329 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:29,330 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:32,854 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:32,855 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:32,855 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:35,231 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:40,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:45,232 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:47,019 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:14:48,157 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:14:48,157 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:14:48,415 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:14:50,256 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:14:56,051 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:01,051 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:02,910 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:02,910 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:03,032 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:06,239 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:10,089 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:11,478 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:16,478 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:17,022 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:15:18,866 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:18,866 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:19,390 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:22,355 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:27,579 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:28,095 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:30,096 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:32,097 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:32,786 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:32,973 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:32,973 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:33,126 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:34,098 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:36,099 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:37,917 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:38,100 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:40,101 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:15:43,713 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:47,025 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:15:49,026 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:49,048 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:15:49,666 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:15:49,666 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:15:54,756 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:15:59,756 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:03,643 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:03,643 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:04,015 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:05,465 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:10,466 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:15,937 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:16,113 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:16:17,030 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:16:18,245 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:18,245 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:18,549 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:21,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:26,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:31,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:33,245 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:33,246 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:33,549 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:37,587 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:42,588 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:47,030 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:16:48,032 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:48,251 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:16:48,251 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:16:49,121 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:16:53,293 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:16:58,293 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:03,294 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:05,806 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:05,806 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:05,941 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:08,940 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:13,941 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:17,033 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:17:19,035 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:20,127 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:20,128 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:20,171 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:24,246 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:30,141 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:30,209 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:35,210 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:35,928 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:35,928 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:36,794 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:40,795 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:45,796 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:47,036 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:17:51,038 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:51,987 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:17:51,987 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:17:51,988 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:17:54,152 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:56,164 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:17:56,204 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:17:59,166 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:01,167 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:01,358 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:03,168 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:05,169 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:07,169 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:07,346 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:12,347 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:13,267 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:13,476 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:13,476 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:17,039 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:18:18,041 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:23,042 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:28,199 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:29,298 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:30,038 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:30,039 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:34,183 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:39,184 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:43,601 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:43,602 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:44,627 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:44,740 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:47,042 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:18:50,044 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:18:53,037 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,037 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: partial_history +2024-11-13 19:18:53,039 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,039 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,040 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,040 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,040 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,041 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send():382] send: history +2024-11-13 19:18:53,041 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: summary_record +2024-11-13 19:18:53,041 INFO SenderThread:1939 [sender.py:_save_file():1403] saving file wandb-summary.json with policy end +2024-11-13 19:18:53,186 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:53,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/wandb-summary.json +2024-11-13 19:18:54,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:55,187 INFO Thread-12 :1939 [dir_watcher.py:_on_file_modified():288] file/dir modified: /opt/ml/model//sft-codecontests-qwen_ds-code-contests_model-Qwen2.5-Coder-7B-Instruct_sch-cosine_lr-1e-6_bs-64_acc-8_len-2048/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/files/output.log +2024-11-13 19:18:55,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:18:55,922 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:18:55,922 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:18:57,159 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:02,160 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:07,160 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:10,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:10,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:10,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:13,077 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:17,045 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:19:19,046 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:24,047 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:25,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:25,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:25,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:29,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:34,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:39,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:40,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:40,922 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:40,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:45,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:47,048 DEBUG SenderThread:1939 [sender.py:send():382] send: stats +2024-11-13 19:19:51,050 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:19:55,768 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: internal_messages +2024-11-13 19:19:55,923 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: stop_status +2024-11-13 19:19:55,923 DEBUG SenderThread:1939 [sender.py:send_request():409] send_request: stop_status +2024-11-13 19:19:56,077 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:20:01,078 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report +2024-11-13 19:20:06,079 DEBUG HandlerThread:1939 [handler.py:handle_request():146] handle_request: status_report diff --git a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/run-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1.wandb b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/run-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1.wandb index 47103e7be4d3a49962f6b613b4bccf8dee025bf2..afa3211dc5c3b4474d422c7d97d27508fc3f75e8 100644 Binary files a/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/run-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1.wandb and b/wandb/run-20241113_190246-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1/run-kushalarora-rvv-main-2024-11-13-17-55-42-664-g0ro8r-algo-1.wandb differ