Training in progress, step 200
Browse files
breeze-dsw-tiny-id.log
CHANGED
@@ -479,3 +479,16 @@ Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_
|
|
479 |
[2024-01-12 17:14:28,538] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
480 |
[2024-01-12 17:14:28,695] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
481 |
[2024-01-12 17:14:28,695] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
479 |
[2024-01-12 17:14:28,538] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
480 |
[2024-01-12 17:14:28,695] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
481 |
[2024-01-12 17:14:28,695] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now!
|
482 |
+
{'loss': 0.765, 'learning_rate': 7.716963756434345e-06, 'epoch': 0.12}
|
483 |
+
{'loss': 0.5849, 'learning_rate': 8.019180844200955e-06, 'epoch': 0.15}
|
484 |
+
{'loss': 0.7834, 'learning_rate': 8.27351214279797e-06, 'epoch': 1.02}
|
485 |
+
{'loss': 0.7896, 'learning_rate': 8.49307723936858e-06, 'epoch': 1.04}
|
486 |
+
{'eval_loss': 0.7578125, 'eval_wer': 48.339313644309506, 'eval_runtime': 1161.7981, 'eval_samples_per_second': 3.135, 'eval_steps_per_second': 0.196, 'epoch': 1.04}
|
487 |
+
[2024-01-12 17:55:30,694] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
|
488 |
+
[2024-01-12 17:55:30,700] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt
|
489 |
+
[2024-01-12 17:55:30,701] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt...
|
490 |
+
[2024-01-12 17:55:31,582] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt.
|
491 |
+
[2024-01-12 17:55:31,591] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
492 |
+
[2024-01-12 17:55:36,176] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
493 |
+
[2024-01-12 17:55:36,323] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
494 |
+
[2024-01-12 17:55:36,323] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 115372576
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:be61a2c1c624f25f441d6cf9fe0c79590422b574f8791961f0e1f7f5952d7878
|
3 |
size 115372576
|
runs/Jan12_16-30-10_knight/events.out.tfevents.1705073454.knight.2969.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8734c76c171e764577b5cf4dfe4d5d0334035114f1a18376a28794c2791db50
|
3 |
+
size 7308
|