wcyat committed · verified
Commit acf2487 · 1 Parent(s): e15367e

Training in progress, step 1500, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:50ca524f54edaad2efaf52b5094690e6a44a5385b9e7f53ef2c19f513752a265
+ oid sha256:95ceecdd0b1aa2dbf1ec2f23a4ddd928595c382511d085f0d1663c9722b61ee1
  size 1304192904
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eec4148b0f8e105171cc96f1d25b9ea52f1a3ccfeb7c98ad54edc804e3b3c50c
+ oid sha256:c6a4cdb23fbf57119940af942e16735f7e6ed513337f644de9af3a2da6bc01e6
  size 2608620781
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6410f39a0e7c645ed67ec28686db1c7ce44af3a9fe7fbe74340514fa7e64b446
+ oid sha256:a0dde6cfb71c791201a1b9da1c3a5b4ebadc80456fb340adc64cdc8144e3ec77
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a73fd5a100d4ccdd00ffce46e090807f48fb5f542090f5c0a86653f3b6372be2
+ oid sha256:94619124ebef073ef434567921b91695bdc23ddab6d107310abf209634914efe
  size 1064
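
The four updates above replace only Git LFS pointers: each records the new SHA-256 object ID and byte size of a checkpoint file, not the payload itself. As a minimal sketch (not part of this repository), the following Python snippet assumes the checkpoint files have been pulled locally and re-hashes each one to confirm it matches the oid recorded in its pointer:

import hashlib
from pathlib import Path

# Expected oids copied from the updated LFS pointers in this commit.
EXPECTED = {
    "last-checkpoint/model.safetensors": "95ceecdd0b1aa2dbf1ec2f23a4ddd928595c382511d085f0d1663c9722b61ee1",
    "last-checkpoint/optimizer.pt": "c6a4cdb23fbf57119940af942e16735f7e6ed513337f644de9af3a2da6bc01e6",
    "last-checkpoint/rng_state.pth": "a0dde6cfb71c791201a1b9da1c3a5b4ebadc80456fb340adc64cdc8144e3ec77",
    "last-checkpoint/scheduler.pt": "94619124ebef073ef434567921b91695bdc23ddab6d107310abf209634914efe",
}

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    # Stream in 1 MiB chunks so the 1.3 GB model and 2.6 GB optimizer state
    # do not have to fit in memory.
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

for rel_path, expected_oid in EXPECTED.items():
    actual = sha256_of(Path(rel_path))
    status = "OK" if actual == expected_oid else "MISMATCH"
    print(f"{status}  {rel_path}")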
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": 0.2986587882041931,
  "best_model_checkpoint": "./results/checkpoint-180",
- "epoch": 3.0211480362537766,
+ "epoch": 4.531722054380665,
  "eval_steps": 20,
- "global_step": 1000,
+ "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -807,6 +807,406 @@
  "eval_samples_per_second": 9.897,
  "eval_steps_per_second": 2.523,
  "step": 1000
+ },
+ {
+ "epoch": 3.081570996978852,
+ "grad_norm": 0.047077979892492294,
+ "learning_rate": 7.673716012084593e-06,
+ "loss": 0.2093,
+ "step": 1020
+ },
+ {
+ "epoch": 3.081570996978852,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.4630826711654663,
+ "eval_runtime": 16.4745,
+ "eval_samples_per_second": 9.287,
+ "eval_steps_per_second": 2.367,
+ "step": 1020
+ },
+ {
+ "epoch": 3.1419939577039275,
+ "grad_norm": 0.07947070896625519,
+ "learning_rate": 7.432024169184291e-06,
+ "loss": 0.2436,
+ "step": 1040
+ },
+ {
+ "epoch": 3.1419939577039275,
+ "eval_accuracy": 0.869281045751634,
+ "eval_loss": 0.5888116359710693,
+ "eval_runtime": 15.2662,
+ "eval_samples_per_second": 10.022,
+ "eval_steps_per_second": 2.555,
+ "step": 1040
+ },
+ {
+ "epoch": 3.202416918429003,
+ "grad_norm": 1.2857632637023926,
+ "learning_rate": 7.190332326283988e-06,
+ "loss": 0.1375,
+ "step": 1060
+ },
+ {
+ "epoch": 3.202416918429003,
+ "eval_accuracy": 0.8888888888888888,
+ "eval_loss": 0.6457459926605225,
+ "eval_runtime": 15.6557,
+ "eval_samples_per_second": 9.773,
+ "eval_steps_per_second": 2.491,
+ "step": 1060
+ },
+ {
+ "epoch": 3.2628398791540785,
+ "grad_norm": 0.029293788596987724,
+ "learning_rate": 6.948640483383686e-06,
+ "loss": 0.0049,
+ "step": 1080
+ },
+ {
+ "epoch": 3.2628398791540785,
+ "eval_accuracy": 0.8888888888888888,
+ "eval_loss": 0.6600757837295532,
+ "eval_runtime": 15.5566,
+ "eval_samples_per_second": 9.835,
+ "eval_steps_per_second": 2.507,
+ "step": 1080
+ },
+ {
+ "epoch": 3.323262839879154,
+ "grad_norm": 0.01973886974155903,
+ "learning_rate": 6.706948640483384e-06,
+ "loss": 0.0089,
+ "step": 1100
+ },
+ {
+ "epoch": 3.323262839879154,
+ "eval_accuracy": 0.8823529411764706,
+ "eval_loss": 0.6461706161499023,
+ "eval_runtime": 15.4515,
+ "eval_samples_per_second": 9.902,
+ "eval_steps_per_second": 2.524,
+ "step": 1100
+ },
+ {
+ "epoch": 3.38368580060423,
+ "grad_norm": 0.0038618145044893026,
+ "learning_rate": 6.465256797583082e-06,
+ "loss": 0.0616,
+ "step": 1120
+ },
+ {
+ "epoch": 3.38368580060423,
+ "eval_accuracy": 0.8888888888888888,
+ "eval_loss": 0.6607339978218079,
+ "eval_runtime": 15.5198,
+ "eval_samples_per_second": 9.858,
+ "eval_steps_per_second": 2.513,
+ "step": 1120
+ },
+ {
+ "epoch": 3.4441087613293053,
+ "grad_norm": 0.009797470644116402,
+ "learning_rate": 6.22356495468278e-06,
+ "loss": 0.006,
+ "step": 1140
+ },
+ {
+ "epoch": 3.4441087613293053,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.6243405938148499,
+ "eval_runtime": 15.5129,
+ "eval_samples_per_second": 9.863,
+ "eval_steps_per_second": 2.514,
+ "step": 1140
+ },
+ {
+ "epoch": 3.504531722054381,
+ "grad_norm": 0.01924210786819458,
+ "learning_rate": 5.981873111782478e-06,
+ "loss": 0.1769,
+ "step": 1160
+ },
+ {
+ "epoch": 3.504531722054381,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.5256864428520203,
+ "eval_runtime": 15.4982,
+ "eval_samples_per_second": 9.872,
+ "eval_steps_per_second": 2.516,
+ "step": 1160
+ },
+ {
+ "epoch": 3.5649546827794563,
+ "grad_norm": 0.06478149443864822,
+ "learning_rate": 5.7401812688821754e-06,
+ "loss": 0.0044,
+ "step": 1180
+ },
+ {
+ "epoch": 3.5649546827794563,
+ "eval_accuracy": 0.9084967320261438,
+ "eval_loss": 0.5507912039756775,
+ "eval_runtime": 15.4829,
+ "eval_samples_per_second": 9.882,
+ "eval_steps_per_second": 2.519,
+ "step": 1180
+ },
+ {
+ "epoch": 3.6253776435045317,
+ "grad_norm": 0.14462168514728546,
+ "learning_rate": 5.498489425981873e-06,
+ "loss": 0.2295,
+ "step": 1200
+ },
+ {
+ "epoch": 3.6253776435045317,
+ "eval_accuracy": 0.9150326797385621,
+ "eval_loss": 0.48460787534713745,
+ "eval_runtime": 15.4689,
+ "eval_samples_per_second": 9.891,
+ "eval_steps_per_second": 2.521,
+ "step": 1200
+ },
+ {
+ "epoch": 3.685800604229607,
+ "grad_norm": 0.004326341208070517,
+ "learning_rate": 5.2567975830815706e-06,
+ "loss": 0.1175,
+ "step": 1220
+ },
+ {
+ "epoch": 3.685800604229607,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.4763535261154175,
+ "eval_runtime": 15.4639,
+ "eval_samples_per_second": 9.894,
+ "eval_steps_per_second": 2.522,
+ "step": 1220
+ },
+ {
+ "epoch": 3.7462235649546827,
+ "grad_norm": 0.016803044825792313,
+ "learning_rate": 5.01510574018127e-06,
+ "loss": 0.0746,
+ "step": 1240
+ },
+ {
+ "epoch": 3.7462235649546827,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.4760640561580658,
+ "eval_runtime": 15.4735,
+ "eval_samples_per_second": 9.888,
+ "eval_steps_per_second": 2.52,
+ "step": 1240
+ },
+ {
+ "epoch": 3.806646525679758,
+ "grad_norm": 0.024552155286073685,
+ "learning_rate": 4.773413897280967e-06,
+ "loss": 0.0222,
+ "step": 1260
+ },
+ {
+ "epoch": 3.806646525679758,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.48361214995384216,
+ "eval_runtime": 15.4614,
+ "eval_samples_per_second": 9.896,
+ "eval_steps_per_second": 2.522,
+ "step": 1260
+ },
+ {
+ "epoch": 3.8670694864048336,
+ "grad_norm": 0.04030178114771843,
+ "learning_rate": 4.531722054380665e-06,
+ "loss": 0.0012,
+ "step": 1280
+ },
+ {
+ "epoch": 3.8670694864048336,
+ "eval_accuracy": 0.9215686274509803,
+ "eval_loss": 0.4774629771709442,
+ "eval_runtime": 15.4388,
+ "eval_samples_per_second": 9.91,
+ "eval_steps_per_second": 2.526,
+ "step": 1280
+ },
+ {
+ "epoch": 3.9274924471299095,
+ "grad_norm": 0.030293526127934456,
+ "learning_rate": 4.2900302114803626e-06,
+ "loss": 0.2131,
+ "step": 1300
+ },
+ {
+ "epoch": 3.9274924471299095,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.46071678400039673,
+ "eval_runtime": 15.4357,
+ "eval_samples_per_second": 9.912,
+ "eval_steps_per_second": 2.527,
+ "step": 1300
+ },
+ {
+ "epoch": 3.987915407854985,
+ "grad_norm": 0.04355171322822571,
+ "learning_rate": 4.048338368580061e-06,
+ "loss": 0.0006,
+ "step": 1320
+ },
+ {
+ "epoch": 3.987915407854985,
+ "eval_accuracy": 0.9084967320261438,
+ "eval_loss": 0.4934905767440796,
+ "eval_runtime": 15.4993,
+ "eval_samples_per_second": 9.871,
+ "eval_steps_per_second": 2.516,
+ "step": 1320
+ },
+ {
+ "epoch": 4.04833836858006,
+ "grad_norm": 0.023308318108320236,
+ "learning_rate": 3.8066465256797586e-06,
+ "loss": 0.0758,
+ "step": 1340
+ },
+ {
+ "epoch": 4.04833836858006,
+ "eval_accuracy": 0.9019607843137255,
+ "eval_loss": 0.4591919183731079,
+ "eval_runtime": 15.4907,
+ "eval_samples_per_second": 9.877,
+ "eval_steps_per_second": 2.518,
+ "step": 1340
+ },
+ {
+ "epoch": 4.108761329305136,
+ "grad_norm": 0.007429028861224651,
+ "learning_rate": 3.564954682779456e-06,
+ "loss": 0.1466,
+ "step": 1360
+ },
+ {
+ "epoch": 4.108761329305136,
+ "eval_accuracy": 0.9084967320261438,
+ "eval_loss": 0.4464338719844818,
+ "eval_runtime": 15.4771,
+ "eval_samples_per_second": 9.886,
+ "eval_steps_per_second": 2.52,
+ "step": 1360
+ },
+ {
+ "epoch": 4.169184290030212,
+ "grad_norm": 0.019125748425722122,
+ "learning_rate": 3.3232628398791546e-06,
+ "loss": 0.0488,
+ "step": 1380
+ },
+ {
+ "epoch": 4.169184290030212,
+ "eval_accuracy": 0.9084967320261438,
+ "eval_loss": 0.4816044867038727,
+ "eval_runtime": 15.4856,
+ "eval_samples_per_second": 9.88,
+ "eval_steps_per_second": 2.518,
+ "step": 1380
+ },
+ {
+ "epoch": 4.229607250755287,
+ "grad_norm": 0.02935463935136795,
+ "learning_rate": 3.081570996978852e-06,
+ "loss": 0.0014,
+ "step": 1400
+ },
+ {
+ "epoch": 4.229607250755287,
+ "eval_accuracy": 0.9150326797385621,
+ "eval_loss": 0.4570343494415283,
+ "eval_runtime": 15.452,
+ "eval_samples_per_second": 9.902,
+ "eval_steps_per_second": 2.524,
+ "step": 1400
+ },
+ {
+ "epoch": 4.290030211480363,
+ "grad_norm": 0.05864783003926277,
+ "learning_rate": 2.83987915407855e-06,
+ "loss": 0.082,
+ "step": 1420
+ },
+ {
+ "epoch": 4.290030211480363,
+ "eval_accuracy": 0.9215686274509803,
+ "eval_loss": 0.45447495579719543,
+ "eval_runtime": 15.4821,
+ "eval_samples_per_second": 9.882,
+ "eval_steps_per_second": 2.519,
+ "step": 1420
+ },
+ {
+ "epoch": 4.350453172205438,
+ "grad_norm": 1.1889474391937256,
+ "learning_rate": 2.598187311178248e-06,
+ "loss": 0.0009,
+ "step": 1440
+ },
+ {
+ "epoch": 4.350453172205438,
+ "eval_accuracy": 0.9150326797385621,
+ "eval_loss": 0.4721324145793915,
+ "eval_runtime": 15.4932,
+ "eval_samples_per_second": 9.875,
+ "eval_steps_per_second": 2.517,
+ "step": 1440
+ },
+ {
+ "epoch": 4.410876132930514,
+ "grad_norm": 0.017764601856470108,
+ "learning_rate": 2.3564954682779457e-06,
+ "loss": 0.0008,
+ "step": 1460
+ },
+ {
+ "epoch": 4.410876132930514,
+ "eval_accuracy": 0.9215686274509803,
+ "eval_loss": 0.4873809814453125,
+ "eval_runtime": 15.477,
+ "eval_samples_per_second": 9.886,
+ "eval_steps_per_second": 2.52,
+ "step": 1460
+ },
+ {
+ "epoch": 4.471299093655589,
+ "grad_norm": 0.016238484531641006,
+ "learning_rate": 2.1148036253776437e-06,
+ "loss": 0.0014,
+ "step": 1480
+ },
+ {
+ "epoch": 4.471299093655589,
+ "eval_accuracy": 0.9150326797385621,
+ "eval_loss": 0.5002758502960205,
+ "eval_runtime": 15.4881,
+ "eval_samples_per_second": 9.879,
+ "eval_steps_per_second": 2.518,
+ "step": 1480
+ },
+ {
+ "epoch": 4.531722054380665,
+ "grad_norm": 0.037868522107601166,
+ "learning_rate": 1.8731117824773415e-06,
+ "loss": 0.1612,
+ "step": 1500
+ },
+ {
+ "epoch": 4.531722054380665,
+ "eval_accuracy": 0.9150326797385621,
+ "eval_loss": 0.5064195394515991,
+ "eval_runtime": 15.5013,
+ "eval_samples_per_second": 9.87,
+ "eval_steps_per_second": 2.516,
+ "step": 1500
  }
  ],
  "logging_steps": 20,
@@ -826,7 +1226,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3235818588464112.0,
+ "total_flos": 4844949162060756.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null