SystemAdmin123 committed on
Commit
5d52abe
·
verified ·
1 Parent(s): 3c96910

Training in progress, step 1200, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2f61cccc71965321d5bfa0a82a36d3ce48485b151d51630bed6fa6f1b3e1ef0
3
  size 2066752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5c077b171ae4b737b7f301a57a9b94a0f46db1956a8b28ce23c479697cca649
3
  size 2066752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d24716c68872d0691c8737d29256cdc4c237a8b9f40809eeea9970c38b4f513f
3
  size 2162798
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b791eecb5542c5f0363f26e98b2e45debdfbfcde29b0a6e042cdf3e5e6567d8
3
  size 2162798
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c431bcafebc4c8ee346d130e382b11c81be579ca0bfd3918fae07b16e10b92f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85b7ee4e3e06f8b21d4d23e7eb8bbe5510e7f25d23cfc2ffc16d97845a1be25
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40b6b717644e21f80a22ec98694b3a2fd9d62a6467e549d64314725dba905d52
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:427276ae77d918ee2b880ea4152618640d39ea76588856ca2cd62fe2ab8b83d7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.23675643681562591,
5
  "eval_steps": 200,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -607,6 +607,302 @@
607
  "eval_samples_per_second": 89.48,
608
  "eval_steps_per_second": 22.4,
609
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  }
611
  ],
612
  "logging_steps": 10,
@@ -626,7 +922,7 @@
626
  "attributes": {}
627
  }
628
  },
629
- "total_flos": 20509072293888.0,
630
  "train_batch_size": 4,
631
  "trial_name": null,
632
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3551346552234389,
5
  "eval_steps": 200,
6
+ "global_step": 1200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
607
  "eval_samples_per_second": 89.48,
608
  "eval_steps_per_second": 22.4,
609
  "step": 800
610
+ },
611
+ {
612
+ "epoch": 0.23971589227582124,
613
+ "grad_norm": 0.47265625,
614
+ "learning_rate": 0.0001616818989495711,
615
+ "loss": 8.7235,
616
+ "step": 810
617
+ },
618
+ {
619
+ "epoch": 0.24267534773601657,
620
+ "grad_norm": 0.447265625,
621
+ "learning_rate": 0.00016063536950548826,
622
+ "loss": 8.7121,
623
+ "step": 820
624
+ },
625
+ {
626
+ "epoch": 0.2456348031962119,
627
+ "grad_norm": 0.50390625,
628
+ "learning_rate": 0.0001595782306274553,
629
+ "loss": 8.741,
630
+ "step": 830
631
+ },
632
+ {
633
+ "epoch": 0.24859425865640722,
634
+ "grad_norm": 0.58203125,
635
+ "learning_rate": 0.00015851066728416618,
636
+ "loss": 8.6978,
637
+ "step": 840
638
+ },
639
+ {
640
+ "epoch": 0.25155371411660254,
641
+ "grad_norm": 1.296875,
642
+ "learning_rate": 0.00015743286626829437,
643
+ "loss": 8.7496,
644
+ "step": 850
645
+ },
646
+ {
647
+ "epoch": 0.25451316957679787,
648
+ "grad_norm": 0.455078125,
649
+ "learning_rate": 0.00015634501616380967,
650
+ "loss": 8.6913,
651
+ "step": 860
652
+ },
653
+ {
654
+ "epoch": 0.2574726250369932,
655
+ "grad_norm": 0.5078125,
656
+ "learning_rate": 0.00015524730731298134,
657
+ "loss": 8.6728,
658
+ "step": 870
659
+ },
660
+ {
661
+ "epoch": 0.2604320804971885,
662
+ "grad_norm": 0.5,
663
+ "learning_rate": 0.0001541399317830738,
664
+ "loss": 8.6724,
665
+ "step": 880
666
+ },
667
+ {
668
+ "epoch": 0.26339153595738385,
669
+ "grad_norm": 0.8359375,
670
+ "learning_rate": 0.0001530230833327405,
671
+ "loss": 8.763,
672
+ "step": 890
673
+ },
674
+ {
675
+ "epoch": 0.2663509914175792,
676
+ "grad_norm": 0.953125,
677
+ "learning_rate": 0.00015189695737812152,
678
+ "loss": 8.6008,
679
+ "step": 900
680
+ },
681
+ {
682
+ "epoch": 0.2693104468777745,
683
+ "grad_norm": 0.455078125,
684
+ "learning_rate": 0.0001507617509586517,
685
+ "loss": 8.7395,
686
+ "step": 910
687
+ },
688
+ {
689
+ "epoch": 0.2722699023379698,
690
+ "grad_norm": 0.47265625,
691
+ "learning_rate": 0.00014961766270258422,
692
+ "loss": 8.6413,
693
+ "step": 920
694
+ },
695
+ {
696
+ "epoch": 0.27522935779816515,
697
+ "grad_norm": 0.5078125,
698
+ "learning_rate": 0.00014846489279223652,
699
+ "loss": 8.7083,
700
+ "step": 930
701
+ },
702
+ {
703
+ "epoch": 0.2781888132583605,
704
+ "grad_norm": 0.609375,
705
+ "learning_rate": 0.0001473036429289641,
706
+ "loss": 8.6829,
707
+ "step": 940
708
+ },
709
+ {
710
+ "epoch": 0.2811482687185558,
711
+ "grad_norm": 1.34375,
712
+ "learning_rate": 0.0001461341162978688,
713
+ "loss": 8.6955,
714
+ "step": 950
715
+ },
716
+ {
717
+ "epoch": 0.28410772417875113,
718
+ "grad_norm": 0.419921875,
719
+ "learning_rate": 0.00014495651753224705,
720
+ "loss": 8.6962,
721
+ "step": 960
722
+ },
723
+ {
724
+ "epoch": 0.28706717963894646,
725
+ "grad_norm": 0.478515625,
726
+ "learning_rate": 0.00014377105267778518,
727
+ "loss": 8.7166,
728
+ "step": 970
729
+ },
730
+ {
731
+ "epoch": 0.2900266350991418,
732
+ "grad_norm": 0.55859375,
733
+ "learning_rate": 0.00014257792915650728,
734
+ "loss": 8.6469,
735
+ "step": 980
736
+ },
737
+ {
738
+ "epoch": 0.2929860905593371,
739
+ "grad_norm": 0.6640625,
740
+ "learning_rate": 0.00014137735573048233,
741
+ "loss": 8.6999,
742
+ "step": 990
743
+ },
744
+ {
745
+ "epoch": 0.2959455460195324,
746
+ "grad_norm": 1.1171875,
747
+ "learning_rate": 0.00014016954246529696,
748
+ "loss": 8.5944,
749
+ "step": 1000
750
+ },
751
+ {
752
+ "epoch": 0.2959455460195324,
753
+ "eval_loss": 8.698212623596191,
754
+ "eval_runtime": 13.7844,
755
+ "eval_samples_per_second": 108.964,
756
+ "eval_steps_per_second": 27.277,
757
+ "step": 1000
758
+ },
759
+ {
760
+ "epoch": 0.2989050014797277,
761
+ "grad_norm": 0.5546875,
762
+ "learning_rate": 0.00013895470069330004,
763
+ "loss": 8.7432,
764
+ "step": 1010
765
+ },
766
+ {
767
+ "epoch": 0.30186445693992303,
768
+ "grad_norm": 0.498046875,
769
+ "learning_rate": 0.00013773304297662559,
770
+ "loss": 8.6772,
771
+ "step": 1020
772
+ },
773
+ {
774
+ "epoch": 0.30482391240011836,
775
+ "grad_norm": 0.671875,
776
+ "learning_rate": 0.00013650478307000057,
777
+ "loss": 8.73,
778
+ "step": 1030
779
+ },
780
+ {
781
+ "epoch": 0.3077833678603137,
782
+ "grad_norm": 0.77734375,
783
+ "learning_rate": 0.00013527013588334415,
784
+ "loss": 8.7362,
785
+ "step": 1040
786
+ },
787
+ {
788
+ "epoch": 0.310742823320509,
789
+ "grad_norm": 1.296875,
790
+ "learning_rate": 0.00013402931744416433,
791
+ "loss": 8.6947,
792
+ "step": 1050
793
+ },
794
+ {
795
+ "epoch": 0.31370227878070434,
796
+ "grad_norm": 0.451171875,
797
+ "learning_rate": 0.00013278254485975976,
798
+ "loss": 8.6919,
799
+ "step": 1060
800
+ },
801
+ {
802
+ "epoch": 0.31666173424089966,
803
+ "grad_norm": 0.59375,
804
+ "learning_rate": 0.00013153003627923218,
805
+ "loss": 8.7202,
806
+ "step": 1070
807
+ },
808
+ {
809
+ "epoch": 0.319621189701095,
810
+ "grad_norm": 0.58203125,
811
+ "learning_rate": 0.00013027201085531634,
812
+ "loss": 8.7236,
813
+ "step": 1080
814
+ },
815
+ {
816
+ "epoch": 0.3225806451612903,
817
+ "grad_norm": 0.640625,
818
+ "learning_rate": 0.00012900868870603503,
819
+ "loss": 8.7817,
820
+ "step": 1090
821
+ },
822
+ {
823
+ "epoch": 0.32554010062148564,
824
+ "grad_norm": 1.1015625,
825
+ "learning_rate": 0.00012774029087618446,
826
+ "loss": 8.8011,
827
+ "step": 1100
828
+ },
829
+ {
830
+ "epoch": 0.32849955608168097,
831
+ "grad_norm": 0.462890625,
832
+ "learning_rate": 0.00012646703929865817,
833
+ "loss": 8.687,
834
+ "step": 1110
835
+ },
836
+ {
837
+ "epoch": 0.3314590115418763,
838
+ "grad_norm": 0.47265625,
839
+ "learning_rate": 0.00012518915675561483,
840
+ "loss": 8.6354,
841
+ "step": 1120
842
+ },
843
+ {
844
+ "epoch": 0.3344184670020716,
845
+ "grad_norm": 0.578125,
846
+ "learning_rate": 0.00012390686683949798,
847
+ "loss": 8.6407,
848
+ "step": 1130
849
+ },
850
+ {
851
+ "epoch": 0.33737792246226694,
852
+ "grad_norm": 0.71875,
853
+ "learning_rate": 0.00012262039391391404,
854
+ "loss": 8.6823,
855
+ "step": 1140
856
+ },
857
+ {
858
+ "epoch": 0.34033737792246227,
859
+ "grad_norm": 1.6484375,
860
+ "learning_rate": 0.0001213299630743747,
861
+ "loss": 8.7369,
862
+ "step": 1150
863
+ },
864
+ {
865
+ "epoch": 0.3432968333826576,
866
+ "grad_norm": 0.48046875,
867
+ "learning_rate": 0.00012003580010891213,
868
+ "loss": 8.6849,
869
+ "step": 1160
870
+ },
871
+ {
872
+ "epoch": 0.3462562888428529,
873
+ "grad_norm": 0.55078125,
874
+ "learning_rate": 0.00011873813145857249,
875
+ "loss": 8.6571,
876
+ "step": 1170
877
+ },
878
+ {
879
+ "epoch": 0.34921574430304825,
880
+ "grad_norm": 0.51171875,
881
+ "learning_rate": 0.00011743718417779517,
882
+ "loss": 8.7425,
883
+ "step": 1180
884
+ },
885
+ {
886
+ "epoch": 0.3521751997632436,
887
+ "grad_norm": 0.9453125,
888
+ "learning_rate": 0.00011613318589468511,
889
+ "loss": 8.6455,
890
+ "step": 1190
891
+ },
892
+ {
893
+ "epoch": 0.3551346552234389,
894
+ "grad_norm": 0.8203125,
895
+ "learning_rate": 0.0001148263647711842,
896
+ "loss": 8.673,
897
+ "step": 1200
898
+ },
899
+ {
900
+ "epoch": 0.3551346552234389,
901
+ "eval_loss": 8.696282386779785,
902
+ "eval_runtime": 20.1067,
903
+ "eval_samples_per_second": 74.702,
904
+ "eval_steps_per_second": 18.7,
905
+ "step": 1200
906
  }
907
  ],
908
  "logging_steps": 10,
 
922
  "attributes": {}
923
  }
924
  },
925
+ "total_flos": 30763608440832.0,
926
  "train_batch_size": 4,
927
  "trial_name": null,
928
  "trial_params": null