ErrorAI commited on
Commit
998a78b
·
verified ·
1 Parent(s): c542472

Training in progress, step 248, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c900d238d14e1037bad8471b2ad3ccccd90a5fec518db1c8ce72007d170398ad
3
  size 144805440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58100fe655c8ed92d4518fef45e29df4d60c7e291f66f716456f73e4ea77f392
3
  size 144805440
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c360ceccad5f60fb95639dbf4e0513e8c0f0b3ff27fb6d0739312326efc435e
3
  size 74291604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab152d90f6d47ebd8a356ceecca0993fb077dcff867c7c58ca00456e2cfcd04b
3
  size 74291604
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bac592a9ca2514bc2cfa4738b780303ed581353d6599570fd57ea5aeacd531f9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f660891b19594633ced246d59eedd400fe2556d319f4e5ca333df7fb57888180
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5f72f9735f8242853fd5033caee72818d745afc25ff8221fe23de7a6ff33743
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81771ff96e80b84ed048126e169640f8617ceb476fe2f91b8561190057e53b0d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.25012607160867373,
5
  "eval_steps": 500,
6
- "global_step": 124,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -875,6 +875,874 @@
875
  "learning_rate": 8.61933911810608e-05,
876
  "loss": 0.5258,
877
  "step": 124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878
  }
879
  ],
880
  "logging_steps": 1,
@@ -894,7 +1762,7 @@
894
  "attributes": {}
895
  }
896
  },
897
- "total_flos": 2.1893950480161178e+17,
898
  "train_batch_size": 4,
899
  "trial_name": null,
900
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5002521432173475,
5
  "eval_steps": 500,
6
+ "global_step": 248,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
875
  "learning_rate": 8.61933911810608e-05,
876
  "loss": 0.5258,
877
  "step": 124
878
+ },
879
+ {
880
+ "epoch": 0.2521432173474534,
881
+ "grad_norm": 0.4010732173919678,
882
+ "learning_rate": 8.597192817707122e-05,
883
+ "loss": 0.6078,
884
+ "step": 125
885
+ },
886
+ {
887
+ "epoch": 0.25416036308623297,
888
+ "grad_norm": 0.3758286237716675,
889
+ "learning_rate": 8.574899252492833e-05,
890
+ "loss": 0.6296,
891
+ "step": 126
892
+ },
893
+ {
894
+ "epoch": 0.2561775088250126,
895
+ "grad_norm": 0.4148157238960266,
896
+ "learning_rate": 8.552459335135381e-05,
897
+ "loss": 0.5566,
898
+ "step": 127
899
+ },
900
+ {
901
+ "epoch": 0.25819465456379226,
902
+ "grad_norm": 0.45975151658058167,
903
+ "learning_rate": 8.529873984298418e-05,
904
+ "loss": 0.7839,
905
+ "step": 128
906
+ },
907
+ {
908
+ "epoch": 0.26021180030257185,
909
+ "grad_norm": 0.3960277736186981,
910
+ "learning_rate": 8.507144124599467e-05,
911
+ "loss": 0.5523,
912
+ "step": 129
913
+ },
914
+ {
915
+ "epoch": 0.2622289460413515,
916
+ "grad_norm": 0.42408132553100586,
917
+ "learning_rate": 8.484270686572078e-05,
918
+ "loss": 0.6672,
919
+ "step": 130
920
+ },
921
+ {
922
+ "epoch": 0.2642460917801311,
923
+ "grad_norm": 0.42156749963760376,
924
+ "learning_rate": 8.46125460662772e-05,
925
+ "loss": 0.6694,
926
+ "step": 131
927
+ },
928
+ {
929
+ "epoch": 0.26626323751891073,
930
+ "grad_norm": 0.4371163547039032,
931
+ "learning_rate": 8.43809682701746e-05,
932
+ "loss": 0.6765,
933
+ "step": 132
934
+ },
935
+ {
936
+ "epoch": 0.2682803832576904,
937
+ "grad_norm": 0.4190014600753784,
938
+ "learning_rate": 8.41479829579338e-05,
939
+ "loss": 0.5559,
940
+ "step": 133
941
+ },
942
+ {
943
+ "epoch": 0.27029752899646997,
944
+ "grad_norm": 0.44778770208358765,
945
+ "learning_rate": 8.391359966769765e-05,
946
+ "loss": 0.6315,
947
+ "step": 134
948
+ },
949
+ {
950
+ "epoch": 0.2723146747352496,
951
+ "grad_norm": 0.4291159212589264,
952
+ "learning_rate": 8.367782799484057e-05,
953
+ "loss": 0.6501,
954
+ "step": 135
955
+ },
956
+ {
957
+ "epoch": 0.27433182047402926,
958
+ "grad_norm": 0.4089667797088623,
959
+ "learning_rate": 8.344067759157572e-05,
960
+ "loss": 0.6162,
961
+ "step": 136
962
+ },
963
+ {
964
+ "epoch": 0.27634896621280886,
965
+ "grad_norm": 0.4689813256263733,
966
+ "learning_rate": 8.320215816655987e-05,
967
+ "loss": 0.7768,
968
+ "step": 137
969
+ },
970
+ {
971
+ "epoch": 0.2783661119515885,
972
+ "grad_norm": 0.41810306906700134,
973
+ "learning_rate": 8.296227948449589e-05,
974
+ "loss": 0.5548,
975
+ "step": 138
976
+ },
977
+ {
978
+ "epoch": 0.28038325769036815,
979
+ "grad_norm": 0.4520469009876251,
980
+ "learning_rate": 8.272105136573303e-05,
981
+ "loss": 0.6789,
982
+ "step": 139
983
+ },
984
+ {
985
+ "epoch": 0.28240040342914774,
986
+ "grad_norm": 0.4686160087585449,
987
+ "learning_rate": 8.24784836858649e-05,
988
+ "loss": 0.6922,
989
+ "step": 140
990
+ },
991
+ {
992
+ "epoch": 0.2844175491679274,
993
+ "grad_norm": 0.5166224837303162,
994
+ "learning_rate": 8.223458637532515e-05,
995
+ "loss": 0.7206,
996
+ "step": 141
997
+ },
998
+ {
999
+ "epoch": 0.28643469490670703,
1000
+ "grad_norm": 0.486186683177948,
1001
+ "learning_rate": 8.198936941898091e-05,
1002
+ "loss": 0.599,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "epoch": 0.2884518406454866,
1007
+ "grad_norm": 0.4760117530822754,
1008
+ "learning_rate": 8.174284285572408e-05,
1009
+ "loss": 0.6445,
1010
+ "step": 143
1011
+ },
1012
+ {
1013
+ "epoch": 0.29046898638426627,
1014
+ "grad_norm": 0.49186456203460693,
1015
+ "learning_rate": 8.14950167780603e-05,
1016
+ "loss": 0.635,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "epoch": 0.2924861321230459,
1021
+ "grad_norm": 0.5166196227073669,
1022
+ "learning_rate": 8.12459013316958e-05,
1023
+ "loss": 0.6888,
1024
+ "step": 145
1025
+ },
1026
+ {
1027
+ "epoch": 0.2945032778618255,
1028
+ "grad_norm": 0.5494632720947266,
1029
+ "learning_rate": 8.099550671512202e-05,
1030
+ "loss": 0.6592,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "epoch": 0.29652042360060515,
1035
+ "grad_norm": 0.5840097069740295,
1036
+ "learning_rate": 8.074384317919812e-05,
1037
+ "loss": 0.7137,
1038
+ "step": 147
1039
+ },
1040
+ {
1041
+ "epoch": 0.29853756933938475,
1042
+ "grad_norm": 0.5433554649353027,
1043
+ "learning_rate": 8.049092102673135e-05,
1044
+ "loss": 0.6446,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "epoch": 0.3005547150781644,
1049
+ "grad_norm": 0.5587112903594971,
1050
+ "learning_rate": 8.023675061205519e-05,
1051
+ "loss": 0.6222,
1052
+ "step": 149
1053
+ },
1054
+ {
1055
+ "epoch": 0.30257186081694404,
1056
+ "grad_norm": 0.6611265540122986,
1057
+ "learning_rate": 7.998134234060551e-05,
1058
+ "loss": 0.7365,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.30458900655572363,
1063
+ "grad_norm": 0.49480316042900085,
1064
+ "learning_rate": 7.972470666849457e-05,
1065
+ "loss": 0.5276,
1066
+ "step": 151
1067
+ },
1068
+ {
1069
+ "epoch": 0.3066061522945033,
1070
+ "grad_norm": 0.5728369951248169,
1071
+ "learning_rate": 7.946685410208296e-05,
1072
+ "loss": 0.6442,
1073
+ "step": 152
1074
+ },
1075
+ {
1076
+ "epoch": 0.3086232980332829,
1077
+ "grad_norm": 0.4378463923931122,
1078
+ "learning_rate": 7.920779519754948e-05,
1079
+ "loss": 0.6289,
1080
+ "step": 153
1081
+ },
1082
+ {
1083
+ "epoch": 0.3106404437720625,
1084
+ "grad_norm": 0.42006805539131165,
1085
+ "learning_rate": 7.894754056045901e-05,
1086
+ "loss": 0.545,
1087
+ "step": 154
1088
+ },
1089
+ {
1090
+ "epoch": 0.31265758951084216,
1091
+ "grad_norm": 0.4681999683380127,
1092
+ "learning_rate": 7.868610084532828e-05,
1093
+ "loss": 0.5703,
1094
+ "step": 155
1095
+ },
1096
+ {
1097
+ "epoch": 0.3146747352496218,
1098
+ "grad_norm": 0.537784218788147,
1099
+ "learning_rate": 7.842348675518968e-05,
1100
+ "loss": 0.6163,
1101
+ "step": 156
1102
+ },
1103
+ {
1104
+ "epoch": 0.3166918809884014,
1105
+ "grad_norm": 0.4741944372653961,
1106
+ "learning_rate": 7.815970904115319e-05,
1107
+ "loss": 0.6115,
1108
+ "step": 157
1109
+ },
1110
+ {
1111
+ "epoch": 0.31870902672718104,
1112
+ "grad_norm": 0.42608484625816345,
1113
+ "learning_rate": 7.789477850196614e-05,
1114
+ "loss": 0.5959,
1115
+ "step": 158
1116
+ },
1117
+ {
1118
+ "epoch": 0.3207261724659607,
1119
+ "grad_norm": 0.44388729333877563,
1120
+ "learning_rate": 7.762870598357115e-05,
1121
+ "loss": 0.5518,
1122
+ "step": 159
1123
+ },
1124
+ {
1125
+ "epoch": 0.3227433182047403,
1126
+ "grad_norm": 0.3990045189857483,
1127
+ "learning_rate": 7.736150237866213e-05,
1128
+ "loss": 0.5256,
1129
+ "step": 160
1130
+ },
1131
+ {
1132
+ "epoch": 0.32476046394351993,
1133
+ "grad_norm": 1.0403790473937988,
1134
+ "learning_rate": 7.709317862623833e-05,
1135
+ "loss": 0.5994,
1136
+ "step": 161
1137
+ },
1138
+ {
1139
+ "epoch": 0.3267776096822995,
1140
+ "grad_norm": 0.38422009348869324,
1141
+ "learning_rate": 7.682374571115651e-05,
1142
+ "loss": 0.5976,
1143
+ "step": 162
1144
+ },
1145
+ {
1146
+ "epoch": 0.32879475542107917,
1147
+ "grad_norm": 0.3627714216709137,
1148
+ "learning_rate": 7.655321466368126e-05,
1149
+ "loss": 0.6134,
1150
+ "step": 163
1151
+ },
1152
+ {
1153
+ "epoch": 0.3308119011598588,
1154
+ "grad_norm": 0.4387800097465515,
1155
+ "learning_rate": 7.628159655903336e-05,
1156
+ "loss": 0.6016,
1157
+ "step": 164
1158
+ },
1159
+ {
1160
+ "epoch": 0.3328290468986384,
1161
+ "grad_norm": 0.38703906536102295,
1162
+ "learning_rate": 7.600890251693645e-05,
1163
+ "loss": 0.5804,
1164
+ "step": 165
1165
+ },
1166
+ {
1167
+ "epoch": 0.33484619263741805,
1168
+ "grad_norm": 0.37045204639434814,
1169
+ "learning_rate": 7.57351437011618e-05,
1170
+ "loss": 0.5176,
1171
+ "step": 166
1172
+ },
1173
+ {
1174
+ "epoch": 0.3368633383761977,
1175
+ "grad_norm": 0.37859445810317993,
1176
+ "learning_rate": 7.546033131907122e-05,
1177
+ "loss": 0.5251,
1178
+ "step": 167
1179
+ },
1180
+ {
1181
+ "epoch": 0.3388804841149773,
1182
+ "grad_norm": 0.3897157907485962,
1183
+ "learning_rate": 7.51844766211583e-05,
1184
+ "loss": 0.5355,
1185
+ "step": 168
1186
+ },
1187
+ {
1188
+ "epoch": 0.34089762985375693,
1189
+ "grad_norm": 0.384641170501709,
1190
+ "learning_rate": 7.490759090058778e-05,
1191
+ "loss": 0.5872,
1192
+ "step": 169
1193
+ },
1194
+ {
1195
+ "epoch": 0.3429147755925366,
1196
+ "grad_norm": 0.3863964378833771,
1197
+ "learning_rate": 7.462968549273326e-05,
1198
+ "loss": 0.5901,
1199
+ "step": 170
1200
+ },
1201
+ {
1202
+ "epoch": 0.34493192133131617,
1203
+ "grad_norm": 0.3983001708984375,
1204
+ "learning_rate": 7.435077177471315e-05,
1205
+ "loss": 0.5495,
1206
+ "step": 171
1207
+ },
1208
+ {
1209
+ "epoch": 0.3469490670700958,
1210
+ "grad_norm": 0.419036865234375,
1211
+ "learning_rate": 7.407086116492484e-05,
1212
+ "loss": 0.5547,
1213
+ "step": 172
1214
+ },
1215
+ {
1216
+ "epoch": 0.34896621280887546,
1217
+ "grad_norm": 0.39494702219963074,
1218
+ "learning_rate": 7.378996512257735e-05,
1219
+ "loss": 0.5727,
1220
+ "step": 173
1221
+ },
1222
+ {
1223
+ "epoch": 0.35098335854765506,
1224
+ "grad_norm": 0.3954225182533264,
1225
+ "learning_rate": 7.35080951472221e-05,
1226
+ "loss": 0.5669,
1227
+ "step": 174
1228
+ },
1229
+ {
1230
+ "epoch": 0.3530005042864347,
1231
+ "grad_norm": 0.43136945366859436,
1232
+ "learning_rate": 7.322526277828216e-05,
1233
+ "loss": 0.6174,
1234
+ "step": 175
1235
+ },
1236
+ {
1237
+ "epoch": 0.35501765002521435,
1238
+ "grad_norm": 0.37220558524131775,
1239
+ "learning_rate": 7.294147959457989e-05,
1240
+ "loss": 0.5389,
1241
+ "step": 176
1242
+ },
1243
+ {
1244
+ "epoch": 0.35703479576399394,
1245
+ "grad_norm": 0.40839049220085144,
1246
+ "learning_rate": 7.265675721386285e-05,
1247
+ "loss": 0.6032,
1248
+ "step": 177
1249
+ },
1250
+ {
1251
+ "epoch": 0.3590519415027736,
1252
+ "grad_norm": 0.37361785769462585,
1253
+ "learning_rate": 7.237110729232825e-05,
1254
+ "loss": 0.5343,
1255
+ "step": 178
1256
+ },
1257
+ {
1258
+ "epoch": 0.3610690872415532,
1259
+ "grad_norm": 0.4027640223503113,
1260
+ "learning_rate": 7.208454152414571e-05,
1261
+ "loss": 0.5687,
1262
+ "step": 179
1263
+ },
1264
+ {
1265
+ "epoch": 0.3630862329803328,
1266
+ "grad_norm": 0.4593392014503479,
1267
+ "learning_rate": 7.179707164097851e-05,
1268
+ "loss": 0.633,
1269
+ "step": 180
1270
+ },
1271
+ {
1272
+ "epoch": 0.36510337871911247,
1273
+ "grad_norm": 0.3781510889530182,
1274
+ "learning_rate": 7.150870941150336e-05,
1275
+ "loss": 0.5225,
1276
+ "step": 181
1277
+ },
1278
+ {
1279
+ "epoch": 0.36712052445789206,
1280
+ "grad_norm": 0.40543264150619507,
1281
+ "learning_rate": 7.12194666409285e-05,
1282
+ "loss": 0.5756,
1283
+ "step": 182
1284
+ },
1285
+ {
1286
+ "epoch": 0.3691376701966717,
1287
+ "grad_norm": 0.42058733105659485,
1288
+ "learning_rate": 7.092935517051058e-05,
1289
+ "loss": 0.6274,
1290
+ "step": 183
1291
+ },
1292
+ {
1293
+ "epoch": 0.37115481593545135,
1294
+ "grad_norm": 0.4416365623474121,
1295
+ "learning_rate": 7.063838687706971e-05,
1296
+ "loss": 0.6606,
1297
+ "step": 184
1298
+ },
1299
+ {
1300
+ "epoch": 0.37317196167423095,
1301
+ "grad_norm": 0.4479544460773468,
1302
+ "learning_rate": 7.034657367250337e-05,
1303
+ "loss": 0.5858,
1304
+ "step": 185
1305
+ },
1306
+ {
1307
+ "epoch": 0.3751891074130106,
1308
+ "grad_norm": 0.45392027497291565,
1309
+ "learning_rate": 7.005392750329868e-05,
1310
+ "loss": 0.6059,
1311
+ "step": 186
1312
+ },
1313
+ {
1314
+ "epoch": 0.37720625315179024,
1315
+ "grad_norm": 0.4504959285259247,
1316
+ "learning_rate": 6.976046035004335e-05,
1317
+ "loss": 0.5292,
1318
+ "step": 187
1319
+ },
1320
+ {
1321
+ "epoch": 0.37922339889056983,
1322
+ "grad_norm": 0.4504992365837097,
1323
+ "learning_rate": 6.946618422693521e-05,
1324
+ "loss": 0.5815,
1325
+ "step": 188
1326
+ },
1327
+ {
1328
+ "epoch": 0.3812405446293495,
1329
+ "grad_norm": 0.44799065589904785,
1330
+ "learning_rate": 6.917111118129035e-05,
1331
+ "loss": 0.5636,
1332
+ "step": 189
1333
+ },
1334
+ {
1335
+ "epoch": 0.3832576903681291,
1336
+ "grad_norm": 0.49823272228240967,
1337
+ "learning_rate": 6.887525329304994e-05,
1338
+ "loss": 0.7133,
1339
+ "step": 190
1340
+ },
1341
+ {
1342
+ "epoch": 0.3852748361069087,
1343
+ "grad_norm": 0.50130295753479,
1344
+ "learning_rate": 6.857862267428563e-05,
1345
+ "loss": 0.6892,
1346
+ "step": 191
1347
+ },
1348
+ {
1349
+ "epoch": 0.38729198184568836,
1350
+ "grad_norm": 0.47600454092025757,
1351
+ "learning_rate": 6.828123146870383e-05,
1352
+ "loss": 0.5957,
1353
+ "step": 192
1354
+ },
1355
+ {
1356
+ "epoch": 0.38930912758446795,
1357
+ "grad_norm": 0.44867298007011414,
1358
+ "learning_rate": 6.79830918511484e-05,
1359
+ "loss": 0.6613,
1360
+ "step": 193
1361
+ },
1362
+ {
1363
+ "epoch": 0.3913262733232476,
1364
+ "grad_norm": 0.4927201569080353,
1365
+ "learning_rate": 6.76842160271023e-05,
1366
+ "loss": 0.6486,
1367
+ "step": 194
1368
+ },
1369
+ {
1370
+ "epoch": 0.39334341906202724,
1371
+ "grad_norm": 0.43527671694755554,
1372
+ "learning_rate": 6.738461623218795e-05,
1373
+ "loss": 0.5978,
1374
+ "step": 195
1375
+ },
1376
+ {
1377
+ "epoch": 0.39536056480080684,
1378
+ "grad_norm": 0.5360156297683716,
1379
+ "learning_rate": 6.708430473166628e-05,
1380
+ "loss": 0.6932,
1381
+ "step": 196
1382
+ },
1383
+ {
1384
+ "epoch": 0.3973777105395865,
1385
+ "grad_norm": 0.5388275384902954,
1386
+ "learning_rate": 6.678329381993458e-05,
1387
+ "loss": 0.6456,
1388
+ "step": 197
1389
+ },
1390
+ {
1391
+ "epoch": 0.39939485627836613,
1392
+ "grad_norm": 0.544686496257782,
1393
+ "learning_rate": 6.648159582002322e-05,
1394
+ "loss": 0.6551,
1395
+ "step": 198
1396
+ },
1397
+ {
1398
+ "epoch": 0.4014120020171457,
1399
+ "grad_norm": 0.6613960266113281,
1400
+ "learning_rate": 6.617922308309115e-05,
1401
+ "loss": 0.6676,
1402
+ "step": 199
1403
+ },
1404
+ {
1405
+ "epoch": 0.40342914775592537,
1406
+ "grad_norm": 0.7599596381187439,
1407
+ "learning_rate": 6.587618798792022e-05,
1408
+ "loss": 0.6209,
1409
+ "step": 200
1410
+ },
1411
+ {
1412
+ "epoch": 0.405446293494705,
1413
+ "grad_norm": 0.3868520259857178,
1414
+ "learning_rate": 6.557250294040849e-05,
1415
+ "loss": 0.505,
1416
+ "step": 201
1417
+ },
1418
+ {
1419
+ "epoch": 0.4074634392334846,
1420
+ "grad_norm": 0.4449853003025055,
1421
+ "learning_rate": 6.526818037306228e-05,
1422
+ "loss": 0.5904,
1423
+ "step": 202
1424
+ },
1425
+ {
1426
+ "epoch": 0.40948058497226425,
1427
+ "grad_norm": 0.45996958017349243,
1428
+ "learning_rate": 6.496323274448721e-05,
1429
+ "loss": 0.543,
1430
+ "step": 203
1431
+ },
1432
+ {
1433
+ "epoch": 0.4114977307110439,
1434
+ "grad_norm": 0.49261391162872314,
1435
+ "learning_rate": 6.46576725388782e-05,
1436
+ "loss": 0.6735,
1437
+ "step": 204
1438
+ },
1439
+ {
1440
+ "epoch": 0.4135148764498235,
1441
+ "grad_norm": 0.43575477600097656,
1442
+ "learning_rate": 6.435151226550829e-05,
1443
+ "loss": 0.5711,
1444
+ "step": 205
1445
+ },
1446
+ {
1447
+ "epoch": 0.41553202218860313,
1448
+ "grad_norm": 0.44983670115470886,
1449
+ "learning_rate": 6.404476445821663e-05,
1450
+ "loss": 0.5746,
1451
+ "step": 206
1452
+ },
1453
+ {
1454
+ "epoch": 0.4175491679273828,
1455
+ "grad_norm": 0.3756658136844635,
1456
+ "learning_rate": 6.373744167489531e-05,
1457
+ "loss": 0.5298,
1458
+ "step": 207
1459
+ },
1460
+ {
1461
+ "epoch": 0.41956631366616237,
1462
+ "grad_norm": 0.4081321954727173,
1463
+ "learning_rate": 6.342955649697523e-05,
1464
+ "loss": 0.6043,
1465
+ "step": 208
1466
+ },
1467
+ {
1468
+ "epoch": 0.421583459404942,
1469
+ "grad_norm": 0.42804038524627686,
1470
+ "learning_rate": 6.312112152891107e-05,
1471
+ "loss": 0.5759,
1472
+ "step": 209
1473
+ },
1474
+ {
1475
+ "epoch": 0.4236006051437216,
1476
+ "grad_norm": 0.3999483287334442,
1477
+ "learning_rate": 6.28121493976653e-05,
1478
+ "loss": 0.5183,
1479
+ "step": 210
1480
+ },
1481
+ {
1482
+ "epoch": 0.42561775088250126,
1483
+ "grad_norm": 0.40822234749794006,
1484
+ "learning_rate": 6.250265275219116e-05,
1485
+ "loss": 0.6014,
1486
+ "step": 211
1487
+ },
1488
+ {
1489
+ "epoch": 0.4276348966212809,
1490
+ "grad_norm": 0.388310045003891,
1491
+ "learning_rate": 6.219264426291494e-05,
1492
+ "loss": 0.5951,
1493
+ "step": 212
1494
+ },
1495
+ {
1496
+ "epoch": 0.4296520423600605,
1497
+ "grad_norm": 0.39607641100883484,
1498
+ "learning_rate": 6.188213662121716e-05,
1499
+ "loss": 0.5743,
1500
+ "step": 213
1501
+ },
1502
+ {
1503
+ "epoch": 0.43166918809884014,
1504
+ "grad_norm": 0.37904173135757446,
1505
+ "learning_rate": 6.157114253891307e-05,
1506
+ "loss": 0.5331,
1507
+ "step": 214
1508
+ },
1509
+ {
1510
+ "epoch": 0.4336863338376198,
1511
+ "grad_norm": 0.36542603373527527,
1512
+ "learning_rate": 6.125967474773223e-05,
1513
+ "loss": 0.5145,
1514
+ "step": 215
1515
+ },
1516
+ {
1517
+ "epoch": 0.4357034795763994,
1518
+ "grad_norm": 0.37045159935951233,
1519
+ "learning_rate": 6.0947745998797266e-05,
1520
+ "loss": 0.5091,
1521
+ "step": 216
1522
+ },
1523
+ {
1524
+ "epoch": 0.437720625315179,
1525
+ "grad_norm": 0.41620463132858276,
1526
+ "learning_rate": 6.0635369062101875e-05,
1527
+ "loss": 0.6119,
1528
+ "step": 217
1529
+ },
1530
+ {
1531
+ "epoch": 0.43973777105395867,
1532
+ "grad_norm": 0.4387664794921875,
1533
+ "learning_rate": 6.032255672598803e-05,
1534
+ "loss": 0.6682,
1535
+ "step": 218
1536
+ },
1537
+ {
1538
+ "epoch": 0.44175491679273826,
1539
+ "grad_norm": 0.4237648844718933,
1540
+ "learning_rate": 6.0009321796622444e-05,
1541
+ "loss": 0.5406,
1542
+ "step": 219
1543
+ },
1544
+ {
1545
+ "epoch": 0.4437720625315179,
1546
+ "grad_norm": 0.43322858214378357,
1547
+ "learning_rate": 5.969567709747228e-05,
1548
+ "loss": 0.5316,
1549
+ "step": 220
1550
+ },
1551
+ {
1552
+ "epoch": 0.44578920827029755,
1553
+ "grad_norm": 0.4600161910057068,
1554
+ "learning_rate": 5.938163546878024e-05,
1555
+ "loss": 0.567,
1556
+ "step": 221
1557
+ },
1558
+ {
1559
+ "epoch": 0.44780635400907715,
1560
+ "grad_norm": 0.3887118399143219,
1561
+ "learning_rate": 5.906720976703877e-05,
1562
+ "loss": 0.5762,
1563
+ "step": 222
1564
+ },
1565
+ {
1566
+ "epoch": 0.4498234997478568,
1567
+ "grad_norm": 0.4044482409954071,
1568
+ "learning_rate": 5.87524128644639e-05,
1569
+ "loss": 0.6196,
1570
+ "step": 223
1571
+ },
1572
+ {
1573
+ "epoch": 0.4518406454866364,
1574
+ "grad_norm": 0.4436338543891907,
1575
+ "learning_rate": 5.843725764846812e-05,
1576
+ "loss": 0.5333,
1577
+ "step": 224
1578
+ },
1579
+ {
1580
+ "epoch": 0.45385779122541603,
1581
+ "grad_norm": 0.454662948846817,
1582
+ "learning_rate": 5.812175702113286e-05,
1583
+ "loss": 0.557,
1584
+ "step": 225
1585
+ },
1586
+ {
1587
+ "epoch": 0.4558749369641957,
1588
+ "grad_norm": 0.445640504360199,
1589
+ "learning_rate": 5.7805923898680305e-05,
1590
+ "loss": 0.5332,
1591
+ "step": 226
1592
+ },
1593
+ {
1594
+ "epoch": 0.45789208270297527,
1595
+ "grad_norm": 0.4665926992893219,
1596
+ "learning_rate": 5.7489771210944564e-05,
1597
+ "loss": 0.6367,
1598
+ "step": 227
1599
+ },
1600
+ {
1601
+ "epoch": 0.4599092284417549,
1602
+ "grad_norm": 0.45142456889152527,
1603
+ "learning_rate": 5.717331190084243e-05,
1604
+ "loss": 0.5792,
1605
+ "step": 228
1606
+ },
1607
+ {
1608
+ "epoch": 0.46192637418053456,
1609
+ "grad_norm": 0.4474796652793884,
1610
+ "learning_rate": 5.6856558923843364e-05,
1611
+ "loss": 0.575,
1612
+ "step": 229
1613
+ },
1614
+ {
1615
+ "epoch": 0.46394351991931415,
1616
+ "grad_norm": 0.4471490681171417,
1617
+ "learning_rate": 5.6539525247439274e-05,
1618
+ "loss": 0.5461,
1619
+ "step": 230
1620
+ },
1621
+ {
1622
+ "epoch": 0.4659606656580938,
1623
+ "grad_norm": 0.4579067528247833,
1624
+ "learning_rate": 5.622222385061353e-05,
1625
+ "loss": 0.6183,
1626
+ "step": 231
1627
+ },
1628
+ {
1629
+ "epoch": 0.46797781139687344,
1630
+ "grad_norm": 0.4598318636417389,
1631
+ "learning_rate": 5.590466772330968e-05,
1632
+ "loss": 0.6318,
1633
+ "step": 232
1634
+ },
1635
+ {
1636
+ "epoch": 0.46999495713565304,
1637
+ "grad_norm": 0.4309539496898651,
1638
+ "learning_rate": 5.558686986589963e-05,
1639
+ "loss": 0.6644,
1640
+ "step": 233
1641
+ },
1642
+ {
1643
+ "epoch": 0.4720121028744327,
1644
+ "grad_norm": 0.46067124605178833,
1645
+ "learning_rate": 5.526884328865142e-05,
1646
+ "loss": 0.6401,
1647
+ "step": 234
1648
+ },
1649
+ {
1650
+ "epoch": 0.47402924861321233,
1651
+ "grad_norm": 0.450652539730072,
1652
+ "learning_rate": 5.495060101119662e-05,
1653
+ "loss": 0.5475,
1654
+ "step": 235
1655
+ },
1656
+ {
1657
+ "epoch": 0.4760463943519919,
1658
+ "grad_norm": 0.44922497868537903,
1659
+ "learning_rate": 5.463215606199733e-05,
1660
+ "loss": 0.6372,
1661
+ "step": 236
1662
+ },
1663
+ {
1664
+ "epoch": 0.47806354009077157,
1665
+ "grad_norm": 0.4988616704940796,
1666
+ "learning_rate": 5.431352147781275e-05,
1667
+ "loss": 0.6847,
1668
+ "step": 237
1669
+ },
1670
+ {
1671
+ "epoch": 0.4800806858295512,
1672
+ "grad_norm": 0.45364323258399963,
1673
+ "learning_rate": 5.399471030316554e-05,
1674
+ "loss": 0.6203,
1675
+ "step": 238
1676
+ },
1677
+ {
1678
+ "epoch": 0.4820978315683308,
1679
+ "grad_norm": 0.4748307466506958,
1680
+ "learning_rate": 5.367573558980775e-05,
1681
+ "loss": 0.571,
1682
+ "step": 239
1683
+ },
1684
+ {
1685
+ "epoch": 0.48411497730711045,
1686
+ "grad_norm": 0.46398502588272095,
1687
+ "learning_rate": 5.335661039618653e-05,
1688
+ "loss": 0.6339,
1689
+ "step": 240
1690
+ },
1691
+ {
1692
+ "epoch": 0.48613212304589004,
1693
+ "grad_norm": 0.49548378586769104,
1694
+ "learning_rate": 5.3037347786909495e-05,
1695
+ "loss": 0.6616,
1696
+ "step": 241
1697
+ },
1698
+ {
1699
+ "epoch": 0.4881492687846697,
1700
+ "grad_norm": 0.4696505665779114,
1701
+ "learning_rate": 5.2717960832209914e-05,
1702
+ "loss": 0.6289,
1703
+ "step": 242
1704
+ },
1705
+ {
1706
+ "epoch": 0.49016641452344933,
1707
+ "grad_norm": 0.4907710552215576,
1708
+ "learning_rate": 5.239846260741158e-05,
1709
+ "loss": 0.5494,
1710
+ "step": 243
1711
+ },
1712
+ {
1713
+ "epoch": 0.4921835602622289,
1714
+ "grad_norm": 0.522820234298706,
1715
+ "learning_rate": 5.2078866192393574e-05,
1716
+ "loss": 0.6741,
1717
+ "step": 244
1718
+ },
1719
+ {
1720
+ "epoch": 0.49420070600100857,
1721
+ "grad_norm": 0.5473442673683167,
1722
+ "learning_rate": 5.1759184671054785e-05,
1723
+ "loss": 0.6574,
1724
+ "step": 245
1725
+ },
1726
+ {
1727
+ "epoch": 0.4962178517397882,
1728
+ "grad_norm": 0.5453885197639465,
1729
+ "learning_rate": 5.1439431130778206e-05,
1730
+ "loss": 0.6463,
1731
+ "step": 246
1732
+ },
1733
+ {
1734
+ "epoch": 0.4982349974785678,
1735
+ "grad_norm": 0.5924301743507385,
1736
+ "learning_rate": 5.111961866189524e-05,
1737
+ "loss": 0.675,
1738
+ "step": 247
1739
+ },
1740
+ {
1741
+ "epoch": 0.5002521432173475,
1742
+ "grad_norm": 0.6364018321037292,
1743
+ "learning_rate": 5.079976035714976e-05,
1744
+ "loss": 0.7198,
1745
+ "step": 248
1746
  }
1747
  ],
1748
  "logging_steps": 1,
 
1762
  "attributes": {}
1763
  }
1764
  },
1765
+ "total_flos": 4.194181341983539e+17,
1766
  "train_batch_size": 4,
1767
  "trial_name": null,
1768
  "trial_params": null