dabrown committed on
Commit
d35fb2a
·
verified ·
1 Parent(s): 5bd65ce

Training in progress, step 149, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0edf81e08968f34163bc0db35d46efdcae33ce717a6a80d73d6e87dea2fbb687
3
  size 381467712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd2876e0a68bb8b7e959a04501787cffdab82fd97ca9977055aee5327255323
3
  size 381467712
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c031591cf4dd76da2d24c8202807a3b694feb1ace00f6cc763c19626c9a64ab0
3
  size 194112692
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81668ee561c075c98c7a59efcf651ad7f78341d9ed87e7bd51098eb363c51a3c
3
  size 194112692
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0fe9d358728f3e2dfc39039512d5239b54756f9c232b67c69fe2b994b4ba419
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd6ebb4d1fcba13f470668229470b99a4a72d5e05b994155e9cfcbea87d34006
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:343b14e7ebf2086e0f8dcc5acab2659f6394ab8c83ebb899f766408511f6a705
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3167249cc14ab1cd4e7d1463e098421e293c9d1ad0924be094f1a6596bad0ce
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.2716773748397827,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
- "epoch": 2.0202020202020203,
5
  "eval_steps": 50,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -723,6 +723,349 @@
723
  "eval_samples_per_second": 35.425,
724
  "eval_steps_per_second": 8.856,
725
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  }
727
  ],
728
  "logging_steps": 1,
@@ -746,12 +1089,12 @@
746
  "should_evaluate": false,
747
  "should_log": false,
748
  "should_save": true,
749
- "should_training_stop": false
750
  },
751
  "attributes": {}
752
  }
753
  },
754
- "total_flos": 5.844904293747917e+16,
755
  "train_batch_size": 8,
756
  "trial_name": null,
757
  "trial_params": null
 
1
  {
2
  "best_metric": 0.2716773748397827,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
+ "epoch": 3.01010101010101,
5
  "eval_steps": 50,
6
+ "global_step": 149,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
723
  "eval_samples_per_second": 35.425,
724
  "eval_steps_per_second": 8.856,
725
  "step": 100
726
+ },
727
+ {
728
+ "epoch": 2.04040404040404,
729
+ "grad_norm": 16.043392181396484,
730
+ "learning_rate": 2.664846361837997e-05,
731
+ "loss": 0.2829,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 2.0606060606060606,
736
+ "grad_norm": 4.770566463470459,
737
+ "learning_rate": 2.565525897695651e-05,
738
+ "loss": 0.3002,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 2.080808080808081,
743
+ "grad_norm": 15.801465034484863,
744
+ "learning_rate": 2.467448965126443e-05,
745
+ "loss": 0.2552,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 2.101010101010101,
750
+ "grad_norm": 6.288998126983643,
751
+ "learning_rate": 2.3706656619162278e-05,
752
+ "loss": 0.2836,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 2.121212121212121,
757
+ "grad_norm": 4.048598289489746,
758
+ "learning_rate": 2.2752254250638126e-05,
759
+ "loss": 0.2602,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 2.1414141414141414,
764
+ "grad_norm": 5.017980098724365,
765
+ "learning_rate": 2.1811770055284968e-05,
766
+ "loss": 0.2913,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 2.1616161616161618,
771
+ "grad_norm": 4.102363586425781,
772
+ "learning_rate": 2.0885684433280333e-05,
773
+ "loss": 0.2716,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 2.1818181818181817,
778
+ "grad_norm": 4.771272659301758,
779
+ "learning_rate": 1.9974470429997483e-05,
780
+ "loss": 0.2735,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 2.202020202020202,
785
+ "grad_norm": 4.773435592651367,
786
+ "learning_rate": 1.907859349437336e-05,
787
+ "loss": 0.2543,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 2.2222222222222223,
792
+ "grad_norm": 8.052456855773926,
793
+ "learning_rate": 1.8198511241156903e-05,
794
+ "loss": 0.3169,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 2.242424242424242,
799
+ "grad_norm": 7.02642297744751,
800
+ "learning_rate": 1.7334673217158974e-05,
801
+ "loss": 0.3384,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 2.2626262626262625,
806
+ "grad_norm": 4.065310478210449,
807
+ "learning_rate": 1.6487520671623468e-05,
808
+ "loss": 0.3324,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 2.282828282828283,
813
+ "grad_norm": 4.226308345794678,
814
+ "learning_rate": 1.5657486330836784e-05,
815
+ "loss": 0.283,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 2.303030303030303,
820
+ "grad_norm": 4.673561096191406,
821
+ "learning_rate": 1.484499417709087e-05,
822
+ "loss": 0.2792,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 2.323232323232323,
827
+ "grad_norm": 3.937140464782715,
828
+ "learning_rate": 1.405045923211265e-05,
829
+ "loss": 0.3049,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 2.3434343434343434,
834
+ "grad_norm": 6.760053634643555,
835
+ "learning_rate": 1.3274287345070562e-05,
836
+ "loss": 0.2954,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 2.3636363636363638,
841
+ "grad_norm": 4.757177352905273,
842
+ "learning_rate": 1.2516874985266508e-05,
843
+ "loss": 0.2701,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 2.3838383838383836,
848
+ "grad_norm": 4.382113933563232,
849
+ "learning_rate": 1.1778609039618805e-05,
850
+ "loss": 0.2332,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 2.404040404040404,
855
+ "grad_norm": 5.1120100021362305,
856
+ "learning_rate": 1.1059866615040204e-05,
857
+ "loss": 0.2603,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 2.4242424242424243,
862
+ "grad_norm": 5.514906406402588,
863
+ "learning_rate": 1.0361014845811168e-05,
864
+ "loss": 0.3278,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 2.4444444444444446,
869
+ "grad_norm": 4.801580429077148,
870
+ "learning_rate": 9.682410706047428e-06,
871
+ "loss": 0.2455,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 2.4646464646464645,
876
+ "grad_norm": 4.759316921234131,
877
+ "learning_rate": 9.024400827357344e-06,
878
+ "loss": 0.2886,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 2.484848484848485,
883
+ "grad_norm": 9.59177303314209,
884
+ "learning_rate": 8.387321321781976e-06,
885
+ "loss": 0.4744,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 2.505050505050505,
890
+ "grad_norm": 5.647252559661865,
891
+ "learning_rate": 7.77149761010898e-06,
892
+ "loss": 0.31,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 2.525252525252525,
897
+ "grad_norm": 4.252394676208496,
898
+ "learning_rate": 7.177244255647208e-06,
899
+ "loss": 0.2365,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 2.5454545454545454,
904
+ "grad_norm": 7.538860321044922,
905
+ "learning_rate": 6.6048648035475115e-06,
906
+ "loss": 0.2448,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 2.5656565656565657,
911
+ "grad_norm": 5.2316999435424805,
912
+ "learning_rate": 6.054651625751717e-06,
913
+ "loss": 0.3191,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 2.5858585858585856,
918
+ "grad_norm": 4.985154151916504,
919
+ "learning_rate": 5.526885771648599e-06,
920
+ "loss": 0.2462,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 2.606060606060606,
925
+ "grad_norm": 3.8869848251342773,
926
+ "learning_rate": 5.021836824513759e-06,
927
+ "loss": 0.2581,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 2.6262626262626263,
932
+ "grad_norm": 4.702490329742432,
933
+ "learning_rate": 4.53976276380616e-06,
934
+ "loss": 0.3093,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 2.6464646464646466,
939
+ "grad_norm": 4.508066177368164,
940
+ "learning_rate": 4.080909833391944e-06,
941
+ "loss": 0.301,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 2.6666666666666665,
946
+ "grad_norm": 5.0213446617126465,
947
+ "learning_rate": 3.6455124157629805e-06,
948
+ "loss": 0.3258,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 2.686868686868687,
953
+ "grad_norm": 5.15741491317749,
954
+ "learning_rate": 3.2337929123139434e-06,
955
+ "loss": 0.2747,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 2.707070707070707,
960
+ "grad_norm": 4.806821823120117,
961
+ "learning_rate": 2.8459616297395466e-06,
962
+ "loss": 0.2677,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 2.7272727272727275,
967
+ "grad_norm": 6.16556978225708,
968
+ "learning_rate": 2.4822166726096774e-06,
969
+ "loss": 0.3746,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 2.7474747474747474,
974
+ "grad_norm": 4.232897758483887,
975
+ "learning_rate": 2.142743842177386e-06,
976
+ "loss": 0.2943,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 2.7676767676767677,
981
+ "grad_norm": 4.609518051147461,
982
+ "learning_rate": 1.827716541471486e-06,
983
+ "loss": 0.2712,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 2.787878787878788,
988
+ "grad_norm": 5.150665760040283,
989
+ "learning_rate": 1.5372956867220677e-06,
990
+ "loss": 0.2823,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 2.808080808080808,
995
+ "grad_norm": 3.9823038578033447,
996
+ "learning_rate": 1.2716296251644e-06,
997
+ "loss": 0.2736,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 2.8282828282828283,
1002
+ "grad_norm": 5.216023921966553,
1003
+ "learning_rate": 1.0308540592629756e-06,
1004
+ "loss": 0.3313,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 2.8484848484848486,
1009
+ "grad_norm": 4.384965896606445,
1010
+ "learning_rate": 8.150919773946164e-07,
1011
+ "loss": 0.2674,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 2.8686868686868685,
1016
+ "grad_norm": 4.805583953857422,
1017
+ "learning_rate": 6.244535910258698e-07,
1018
+ "loss": 0.2695,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 2.888888888888889,
1023
+ "grad_norm": 4.811954021453857,
1024
+ "learning_rate": 4.590362784169022e-07,
1025
+ "loss": 0.2901,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 2.909090909090909,
1030
+ "grad_norm": 5.8568949699401855,
1031
+ "learning_rate": 3.1892453488058803e-07,
1032
+ "loss": 0.3121,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 2.929292929292929,
1037
+ "grad_norm": 4.470373153686523,
1038
+ "learning_rate": 2.0418992962224492e-07,
1039
+ "loss": 0.2924,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 2.9494949494949494,
1044
+ "grad_norm": 5.206216335296631,
1045
+ "learning_rate": 1.1489106918200487e-07,
1046
+ "loss": 0.3076,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 2.9696969696969697,
1051
+ "grad_norm": 5.941263198852539,
1052
+ "learning_rate": 5.107356749853298e-08,
1053
+ "loss": 0.3061,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 2.98989898989899,
1058
+ "grad_norm": 5.180295467376709,
1059
+ "learning_rate": 1.2770022609409626e-08,
1060
+ "loss": 0.2699,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 3.01010101010101,
1065
+ "grad_norm": 5.430907249450684,
1066
+ "learning_rate": 0.0,
1067
+ "loss": 0.3501,
1068
+ "step": 149
1069
  }
1070
  ],
1071
  "logging_steps": 1,
 
1089
  "should_evaluate": false,
1090
  "should_log": false,
1091
  "should_save": true,
1092
+ "should_training_stop": true
1093
  },
1094
  "attributes": {}
1095
  }
1096
  },
1097
+ "total_flos": 8.708870844186624e+16,
1098
  "train_batch_size": 8,
1099
  "trial_name": null,
1100
  "trial_params": null