wcyat committed
Commit 834ef71 · verified · 1 Parent(s): 4684bea

Training in progress, step 1500, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:12c3f1920c22b8c03f0e5e79cd16be00f955c741f065320582722df80a77c84d
+ oid sha256:d8531674f361d182c891832698f501d99806f4a54ed644f913a7d6cceb4fcd09
  size 1304192904
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:92a580347f950bf1fbeed881ed69b88a5d8b74d5edfd33e30fdda4e84706a337
+ oid sha256:dd3900014829cff936e288455c77cc8b24c817e5bdea4b3c8b0a9a2a3b7ae871
  size 2608620781
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:07cd9b3292d4b8757e6a0b4dda333538b576e8e0c44ce0aa1c5d00941891c85e
+ oid sha256:57ce7f83dafbabb337ab4f899de43a8d1e2ad1723ca17b896a002cebd19940f6
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8b1833c721be8b262ff9a67cb491cdc5d937a286f1d484ecf25e40cca4aa7adf
+ oid sha256:303407244c6d788bc550e7a7560bf135b9af5a2f148beb7dd1c9b2a263c088ce
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 0.3018172085285187,
- "best_model_checkpoint": "./results/checkpoint-860",
- "epoch": 3.067484662576687,
+ "best_metric": 0.2884907126426697,
+ "best_model_checkpoint": "./results/checkpoint-1440",
+ "epoch": 4.601226993865031,
  "eval_steps": 20,
- "global_step": 1000,
+ "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -807,6 +807,406 @@
  "eval_samples_per_second": 10.737,
  "eval_steps_per_second": 2.72,
  "step": 1000
+ },
+ {
+ "epoch": 3.128834355828221,
+ "grad_norm": 0.021293368190526962,
+ "learning_rate": 7.484662576687118e-06,
+ "loss": 0.1619,
+ "step": 1020
+ },
+ {
+ "epoch": 3.128834355828221,
+ "eval_accuracy": 0.9133333333333333,
+ "eval_loss": 0.4180934429168701,
+ "eval_runtime": 14.3308,
+ "eval_samples_per_second": 10.467,
+ "eval_steps_per_second": 2.652,
+ "step": 1020
+ },
+ {
+ "epoch": 3.190184049079755,
+ "grad_norm": 14.229204177856445,
+ "learning_rate": 7.239263803680983e-06,
+ "loss": 0.1849,
+ "step": 1040
+ },
+ {
+ "epoch": 3.190184049079755,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.47268736362457275,
+ "eval_runtime": 13.9315,
+ "eval_samples_per_second": 10.767,
+ "eval_steps_per_second": 2.728,
+ "step": 1040
+ },
+ {
+ "epoch": 3.2515337423312882,
+ "grad_norm": 0.017998775467276573,
+ "learning_rate": 6.993865030674847e-06,
+ "loss": 0.1949,
+ "step": 1060
+ },
+ {
+ "epoch": 3.2515337423312882,
+ "eval_accuracy": 0.8933333333333333,
+ "eval_loss": 0.3345566689968109,
+ "eval_runtime": 14.0413,
+ "eval_samples_per_second": 10.683,
+ "eval_steps_per_second": 2.706,
+ "step": 1060
+ },
+ {
+ "epoch": 3.312883435582822,
+ "grad_norm": 0.020114585757255554,
+ "learning_rate": 6.748466257668712e-06,
+ "loss": 0.1796,
+ "step": 1080
+ },
+ {
+ "epoch": 3.312883435582822,
+ "eval_accuracy": 0.9266666666666666,
+ "eval_loss": 0.3471291661262512,
+ "eval_runtime": 13.9741,
+ "eval_samples_per_second": 10.734,
+ "eval_steps_per_second": 2.719,
+ "step": 1080
+ },
+ {
+ "epoch": 3.374233128834356,
+ "grad_norm": 0.013905039988458157,
+ "learning_rate": 6.503067484662578e-06,
+ "loss": 0.086,
+ "step": 1100
+ },
+ {
+ "epoch": 3.374233128834356,
+ "eval_accuracy": 0.8866666666666667,
+ "eval_loss": 0.4089130759239197,
+ "eval_runtime": 13.9473,
+ "eval_samples_per_second": 10.755,
+ "eval_steps_per_second": 2.725,
+ "step": 1100
+ },
+ {
+ "epoch": 3.4355828220858897,
+ "grad_norm": 0.009700474329292774,
+ "learning_rate": 6.257668711656443e-06,
+ "loss": 0.0187,
+ "step": 1120
+ },
+ {
+ "epoch": 3.4355828220858897,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.3867844045162201,
+ "eval_runtime": 14.013,
+ "eval_samples_per_second": 10.704,
+ "eval_steps_per_second": 2.712,
+ "step": 1120
+ },
+ {
+ "epoch": 3.4969325153374236,
+ "grad_norm": 0.003781616687774658,
+ "learning_rate": 6.012269938650307e-06,
+ "loss": 0.0768,
+ "step": 1140
+ },
+ {
+ "epoch": 3.4969325153374236,
+ "eval_accuracy": 0.9266666666666666,
+ "eval_loss": 0.4095223546028137,
+ "eval_runtime": 13.9902,
+ "eval_samples_per_second": 10.722,
+ "eval_steps_per_second": 2.716,
+ "step": 1140
+ },
+ {
+ "epoch": 3.558282208588957,
+ "grad_norm": 0.21029236912727356,
+ "learning_rate": 5.766871165644172e-06,
+ "loss": 0.0008,
+ "step": 1160
+ },
+ {
+ "epoch": 3.558282208588957,
+ "eval_accuracy": 0.9066666666666666,
+ "eval_loss": 0.3779890835285187,
+ "eval_runtime": 13.9578,
+ "eval_samples_per_second": 10.747,
+ "eval_steps_per_second": 2.722,
+ "step": 1160
+ },
+ {
+ "epoch": 3.6196319018404908,
+ "grad_norm": 0.032502181828022,
+ "learning_rate": 5.521472392638038e-06,
+ "loss": 0.183,
+ "step": 1180
+ },
+ {
+ "epoch": 3.6196319018404908,
+ "eval_accuracy": 0.9,
+ "eval_loss": 0.3827475905418396,
+ "eval_runtime": 13.933,
+ "eval_samples_per_second": 10.766,
+ "eval_steps_per_second": 2.727,
+ "step": 1180
+ },
+ {
+ "epoch": 3.6809815950920246,
+ "grad_norm": 0.0289248526096344,
+ "learning_rate": 5.276073619631902e-06,
+ "loss": 0.204,
+ "step": 1200
+ },
+ {
+ "epoch": 3.6809815950920246,
+ "eval_accuracy": 0.9,
+ "eval_loss": 0.5132840871810913,
+ "eval_runtime": 13.9107,
+ "eval_samples_per_second": 10.783,
+ "eval_steps_per_second": 2.732,
+ "step": 1200
+ },
+ {
+ "epoch": 3.7423312883435584,
+ "grad_norm": 1.0068583488464355,
+ "learning_rate": 5.030674846625767e-06,
+ "loss": 0.0758,
+ "step": 1220
+ },
+ {
+ "epoch": 3.7423312883435584,
+ "eval_accuracy": 0.9133333333333333,
+ "eval_loss": 0.4279702305793762,
+ "eval_runtime": 13.8908,
+ "eval_samples_per_second": 10.799,
+ "eval_steps_per_second": 2.736,
+ "step": 1220
+ },
+ {
+ "epoch": 3.8036809815950923,
+ "grad_norm": 0.0101453373208642,
+ "learning_rate": 4.785276073619632e-06,
+ "loss": 0.0237,
+ "step": 1240
+ },
+ {
+ "epoch": 3.8036809815950923,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.3941916823387146,
+ "eval_runtime": 13.8887,
+ "eval_samples_per_second": 10.8,
+ "eval_steps_per_second": 2.736,
+ "step": 1240
+ },
+ {
+ "epoch": 3.8650306748466257,
+ "grad_norm": 182.59510803222656,
+ "learning_rate": 4.539877300613497e-06,
+ "loss": 0.2143,
+ "step": 1260
+ },
+ {
+ "epoch": 3.8650306748466257,
+ "eval_accuracy": 0.9066666666666666,
+ "eval_loss": 0.36801090836524963,
+ "eval_runtime": 13.8946,
+ "eval_samples_per_second": 10.796,
+ "eval_steps_per_second": 2.735,
+ "step": 1260
+ },
+ {
+ "epoch": 3.9263803680981595,
+ "grad_norm": 0.03958132117986679,
+ "learning_rate": 4.294478527607362e-06,
+ "loss": 0.0106,
+ "step": 1280
+ },
+ {
+ "epoch": 3.9263803680981595,
+ "eval_accuracy": 0.8866666666666667,
+ "eval_loss": 0.5633125901222229,
+ "eval_runtime": 13.9747,
+ "eval_samples_per_second": 10.734,
+ "eval_steps_per_second": 2.719,
+ "step": 1280
+ },
+ {
+ "epoch": 3.9877300613496933,
+ "grad_norm": 140.984375,
+ "learning_rate": 4.049079754601227e-06,
+ "loss": 0.2221,
+ "step": 1300
+ },
+ {
+ "epoch": 3.9877300613496933,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.38154712319374084,
+ "eval_runtime": 13.996,
+ "eval_samples_per_second": 10.717,
+ "eval_steps_per_second": 2.715,
+ "step": 1300
+ },
+ {
+ "epoch": 4.049079754601227,
+ "grad_norm": 0.0037327792961150408,
+ "learning_rate": 3.8036809815950928e-06,
+ "loss": 0.0212,
+ "step": 1320
+ },
+ {
+ "epoch": 4.049079754601227,
+ "eval_accuracy": 0.9266666666666666,
+ "eval_loss": 0.4598991274833679,
+ "eval_runtime": 13.9374,
+ "eval_samples_per_second": 10.762,
+ "eval_steps_per_second": 2.726,
+ "step": 1320
+ },
+ {
+ "epoch": 4.110429447852761,
+ "grad_norm": 0.010530122555792332,
+ "learning_rate": 3.5582822085889574e-06,
+ "loss": 0.1678,
+ "step": 1340
+ },
+ {
+ "epoch": 4.110429447852761,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.34579145908355713,
+ "eval_runtime": 13.9042,
+ "eval_samples_per_second": 10.788,
+ "eval_steps_per_second": 2.733,
+ "step": 1340
+ },
+ {
+ "epoch": 4.171779141104294,
+ "grad_norm": 0.015701429918408394,
+ "learning_rate": 3.312883435582822e-06,
+ "loss": 0.1153,
+ "step": 1360
+ },
+ {
+ "epoch": 4.171779141104294,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.3261447250843048,
+ "eval_runtime": 13.9236,
+ "eval_samples_per_second": 10.773,
+ "eval_steps_per_second": 2.729,
+ "step": 1360
+ },
+ {
+ "epoch": 4.233128834355828,
+ "grad_norm": 0.011384344659745693,
+ "learning_rate": 3.0674846625766875e-06,
+ "loss": 0.0006,
+ "step": 1380
+ },
+ {
+ "epoch": 4.233128834355828,
+ "eval_accuracy": 0.9133333333333333,
+ "eval_loss": 0.3404422998428345,
+ "eval_runtime": 13.9552,
+ "eval_samples_per_second": 10.749,
+ "eval_steps_per_second": 2.723,
+ "step": 1380
+ },
+ {
+ "epoch": 4.294478527607362,
+ "grad_norm": 0.08929850906133652,
+ "learning_rate": 2.822085889570552e-06,
+ "loss": 0.0193,
+ "step": 1400
+ },
+ {
+ "epoch": 4.294478527607362,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.3601679503917694,
+ "eval_runtime": 13.9574,
+ "eval_samples_per_second": 10.747,
+ "eval_steps_per_second": 2.723,
+ "step": 1400
+ },
+ {
+ "epoch": 4.355828220858895,
+ "grad_norm": 129.2814483642578,
+ "learning_rate": 2.5766871165644175e-06,
+ "loss": 0.0994,
+ "step": 1420
+ },
+ {
+ "epoch": 4.355828220858895,
+ "eval_accuracy": 0.94,
+ "eval_loss": 0.33025145530700684,
+ "eval_runtime": 13.9627,
+ "eval_samples_per_second": 10.743,
+ "eval_steps_per_second": 2.722,
+ "step": 1420
+ },
+ {
+ "epoch": 4.41717791411043,
+ "grad_norm": 0.024497592821717262,
+ "learning_rate": 2.331288343558282e-06,
+ "loss": 0.0032,
+ "step": 1440
+ },
+ {
+ "epoch": 4.41717791411043,
+ "eval_accuracy": 0.94,
+ "eval_loss": 0.2884907126426697,
+ "eval_runtime": 13.9317,
+ "eval_samples_per_second": 10.767,
+ "eval_steps_per_second": 2.728,
+ "step": 1440
+ },
+ {
+ "epoch": 4.478527607361963,
+ "grad_norm": 0.009228991344571114,
+ "learning_rate": 2.085889570552147e-06,
+ "loss": 0.0008,
+ "step": 1460
+ },
+ {
+ "epoch": 4.478527607361963,
+ "eval_accuracy": 0.92,
+ "eval_loss": 0.31121641397476196,
+ "eval_runtime": 13.9417,
+ "eval_samples_per_second": 10.759,
+ "eval_steps_per_second": 2.726,
+ "step": 1460
+ },
+ {
+ "epoch": 4.539877300613497,
+ "grad_norm": 0.22681556642055511,
+ "learning_rate": 1.8404907975460124e-06,
+ "loss": 0.0823,
+ "step": 1480
+ },
+ {
+ "epoch": 4.539877300613497,
+ "eval_accuracy": 0.9266666666666666,
+ "eval_loss": 0.3145321011543274,
+ "eval_runtime": 13.9075,
+ "eval_samples_per_second": 10.786,
+ "eval_steps_per_second": 2.732,
+ "step": 1480
+ },
+ {
+ "epoch": 4.601226993865031,
+ "grad_norm": 0.010360241867601871,
+ "learning_rate": 1.5950920245398775e-06,
+ "loss": 0.0086,
+ "step": 1500
+ },
+ {
+ "epoch": 4.601226993865031,
+ "eval_accuracy": 0.94,
+ "eval_loss": 0.2954442799091339,
+ "eval_runtime": 14.0025,
+ "eval_samples_per_second": 10.712,
+ "eval_steps_per_second": 2.714,
+ "step": 1500
  }
  ],
  "logging_steps": 20,
@@ -826,7 +1226,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3245188867093824.0,
+ "total_flos": 4859676225851160.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null