Training in progress, step 500
Browse files
breeze-dsw-tiny-id.log
CHANGED
@@ -518,3 +518,16 @@ Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_
|
|
518 |
[2024-01-12 19:18:38,426] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
519 |
[2024-01-12 19:18:38,469] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
520 |
[2024-01-12 19:18:38,470] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
[2024-01-12 19:18:38,426] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
519 |
[2024-01-12 19:18:38,469] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
520 |
[2024-01-12 19:18:38,470] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
|
521 |
+
{'loss': 0.48, 'learning_rate': 9.723272550712454e-06, 'epoch': 2.11}
|
522 |
+
{'loss': 0.3325, 'learning_rate': 9.816095971633122e-06, 'epoch': 2.14}
|
523 |
+
{'loss': 0.3389, 'learning_rate': 9.90385555539545e-06, 'epoch': 3.01}
|
524 |
+
{'loss': 0.476, 'learning_rate': 9.987075336738768e-06, 'epoch': 3.03}
|
525 |
+
{'eval_loss': 0.7109375, 'eval_wer': 45.243352654338025, 'eval_runtime': 1126.9534, 'eval_samples_per_second': 3.232, 'eval_steps_per_second': 0.202, 'epoch': 3.03}
|
526 |
+
[2024-01-12 20:00:05,991] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is about to be saved!
|
527 |
+
[2024-01-12 20:00:06,003] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/mp_rank_00_model_states.pt
|
528 |
+
[2024-01-12 20:00:06,003] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/mp_rank_00_model_states.pt...
|
529 |
+
[2024-01-12 20:00:06,891] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/mp_rank_00_model_states.pt.
|
530 |
+
[2024-01-12 20:00:06,900] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
531 |
+
[2024-01-12 20:00:11,513] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
532 |
+
[2024-01-12 20:00:11,652] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
533 |
+
[2024-01-12 20:00:11,653] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step500 is ready now!
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 115372576
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a2065f194b4cbfea71d782580339fe7e0d884733dc06c033026a42d4b30f7bde
|
3 |
size 115372576
|
runs/Jan12_16-30-10_knight/events.out.tfevents.1705073454.knight.2969.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:47a10f2b390f8fcf2d8432d9a833e6e89e5082aeb6216492ad977e96b4cbd615
|
3 |
+
size 10146
|