Training in progress, step 400
Browse files
breeze-dsw-tiny-id.log
CHANGED
@@ -505,3 +505,16 @@ Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_
|
|
505 |
[2024-01-12 18:37:24,766] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
506 |
[2024-01-12 18:37:24,902] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
507 |
[2024-01-12 18:37:24,903] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step300 is ready now!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
[2024-01-12 18:37:24,766] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
506 |
[2024-01-12 18:37:24,902] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
507 |
[2024-01-12 18:37:24,903] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step300 is ready now!
|
508 |
+
{'loss': 0.5102, 'learning_rate': 9.28689473531776e-06, 'epoch': 2.01}
|
509 |
+
{'loss': 0.602, 'learning_rate': 9.407574351377137e-06, 'epoch': 2.04}
|
510 |
+
{'loss': 0.6399, 'learning_rate': 9.519831289296397e-06, 'epoch': 2.06}
|
511 |
+
{'loss': 0.5456, 'learning_rate': 9.624764935335318e-06, 'epoch': 2.09}
|
512 |
+
{'eval_loss': 0.7177734375, 'eval_wer': 46.126598583126324, 'eval_runtime': 1169.8926, 'eval_samples_per_second': 3.113, 'eval_steps_per_second': 0.195, 'epoch': 2.09}
|
513 |
+
[2024-01-12 19:18:32,645] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
|
514 |
+
[2024-01-12 19:18:32,655] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt
|
515 |
+
[2024-01-12 19:18:32,655] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt...
|
516 |
+
[2024-01-12 19:18:33,633] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt.
|
517 |
+
[2024-01-12 19:18:33,641] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt...
|
518 |
+
[2024-01-12 19:18:38,426] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt.
|
519 |
+
[2024-01-12 19:18:38,469] [INFO] [engine.py:3431:_save_zero_checkpoint] zero checkpoint saved /cosmos/home/sp-operator/ai/training/models/huggingface/scripts/../breeze-dsw-tiny-id/tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_optim_states.pt
|
520 |
+
[2024-01-12 19:18:38,470] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 115372576
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3a4682ee8189d040969ac5bde6d3216eeefa0ead9439e1ab9a2b9191d52408a1
|
3 |
size 115372576
|
runs/Jan12_16-30-10_knight/events.out.tfevents.1705073454.knight.2969.0
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a4f169565764893879775bb7575fd7b3a840f973ba66a7f8c0c801d55a26f69
|
3 |
+
size 9200
|