Training in progress, step 300
Browse files
breeze-dsw-tiny-id.log
CHANGED
@@ -492,3 +492,16 @@ Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_
|
|
492 |
[2024-01-12 17:55:36,176] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
493 |
[2024-01-12 17:55:36,323] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
494 |
[2024-01-12 17:55:36,323] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
492 |
[2024-01-12 17:55:36,176] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
493 |
[2024-01-12 17:55:36,323] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
494 |
[2024-01-12 17:55:36,323] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
|
495 |
+
{'loss': 0.8413, 'learning_rate': 8.686247975778677e-06, 'epoch': 1.07}
|
496 |
+
{'loss': 0.68, 'learning_rate': 8.858694625217149e-06, 'epoch': 1.09}
|
497 |
+
{'loss': 0.5851, 'learning_rate': 9.014436199608479e-06, 'epoch': 1.12}
|
498 |
+
{'loss': 0.4164, 'learning_rate': 9.156425255148058e-06, 'epoch': 1.14}
|
499 |
+
{'eval_loss': 0.73876953125, 'eval_wer': 49.25936148679732, 'eval_runtime': 1191.6636, 'eval_samples_per_second': 3.056, 'eval_steps_per_second': 0.191, 'epoch': 1.14}
|
500 |
+
[2024-01-12 18:37:19,231] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step300 is about to be saved!
|
501 |
+
[2024-01-12 18:37:19,238] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/mp_rank_00_model_states.pt
|
502 |
+
[2024-01-12 18:37:19,238] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/mp_rank_00_model_states.pt...
|
503 |
+
[2024-01-12 18:37:20,104] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/mp_rank_00_model_states.pt.
|
504 |
+
[2024-01-12 18:37:20,112] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
505 |
+
[2024-01-12 18:37:24,766] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
506 |
+
[2024-01-12 18:37:24,902] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
507 |
+
[2024-01-12 18:37:24,903] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step300 is ready now!
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 115372576
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db646ecef6ab4fe31707808eb183414e4d7262a94f779603e8f6af5a948cf444
|
3 |
size 115372576
|
runs/Jan12_16-30-10_knight/events.out.tfevents.1705073454.knight.2969.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:68a264e6d478e1abf924c9278ce9fb4d52705f09ecbf440afd4ef49c50aefc57
|
3 |
+
size 8254
|