johnomeara committed (verified)
Commit 92ddf35 · 1 Parent(s): 526dd18

Upload 8 files
config.json ADDED
@@ -0,0 +1,69 @@
+ {
+ "_name_or_path": "facebook/dinov2-large",
+ "apply_layernorm": true,
+ "architectures": [
+ "Dinov2ForImageClassification"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "drop_path_rate": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 1024,
+ "id2label": {
+ "0": "correct",
+ "1": "incorrect"
+ },
+ "image_size": 518,
+ "initializer_range": 0.02,
+ "label2id": {
+ "correct": 0,
+ "incorrect": 1
+ },
+ "layer_norm_eps": 1e-06,
+ "layerscale_value": 1.0,
+ "mlp_ratio": 4,
+ "model_type": "dinov2",
+ "num_attention_heads": 16,
+ "num_channels": 3,
+ "num_hidden_layers": 24,
+ "out_features": [
+ "stage24"
+ ],
+ "out_indices": [
+ 24
+ ],
+ "patch_size": 14,
+ "problem_type": "single_label_classification",
+ "qkv_bias": true,
+ "reshape_hidden_states": true,
+ "stage_names": [
+ "stem",
+ "stage1",
+ "stage2",
+ "stage3",
+ "stage4",
+ "stage5",
+ "stage6",
+ "stage7",
+ "stage8",
+ "stage9",
+ "stage10",
+ "stage11",
+ "stage12",
+ "stage13",
+ "stage14",
+ "stage15",
+ "stage16",
+ "stage17",
+ "stage18",
+ "stage19",
+ "stage20",
+ "stage21",
+ "stage22",
+ "stage23",
+ "stage24"
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.47.1",
+ "use_swiglu_ffn": false
+ }
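
For reference, a minimal inference sketch showing how a config like this is typically consumed with transformers (the model id below is a placeholder, not part of this commit; substitute the actual Hub repo id or a local path to this checkpoint):

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

model_id = "path/or/repo-id-of-this-checkpoint"  # placeholder, not a real id
processor = AutoImageProcessor.from_pretrained(model_id)
model = AutoModelForImageClassification.from_pretrained(model_id)  # Dinov2ForImageClassification head

image = Image.open("example.jpg")  # hypothetical input image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # "correct" or "incorrect"
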
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4db45e1375a111330f445f36615ec59a496219282a40ba73885298d28e14472
+ size 1217542512
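
The binary files in this commit are stored as Git LFS pointers, so the diff records only the spec version, sha256 oid, and byte size rather than the weights themselves. A sketch of fetching the actual file with huggingface_hub, assuming a hypothetical repo id:

from huggingface_hub import hf_hub_download

# repo_id is a placeholder; use the repository this commit belongs to
weights_path = hf_hub_download(repo_id="user/repo", filename="model.safetensors")
print(weights_path)  # local cache path of the ~1.2 GB safetensors file
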
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d60761ac8c8b5a161ea9b8f512bb1924d67f201c99d9adca3a338bf88aac23f
+ size 2435341946
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "crop_size": {
+ "height": 224,
+ "width": 224
+ },
+ "do_center_crop": true,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.485,
+ 0.456,
+ 0.406
+ ],
+ "image_processor_type": "BitImageProcessor",
+ "image_std": [
+ 0.229,
+ 0.224,
+ 0.225
+ ],
+ "resample": 3,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "shortest_edge": 256
+ }
+ }
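
The settings above describe the standard ImageNet-style pipeline: resize the shortest edge to 256 with bicubic resampling (resample 3), center-crop to 224x224, rescale by 1/255 (0.00392156862745098), and normalize with the ImageNet mean/std. An equivalent torchvision sketch, purely for illustration (the repo itself uses transformers' BitImageProcessor):

from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),  # "shortest_edge": 256, resample 3
    transforms.CenterCrop(224),                        # crop_size 224x224
    transforms.ToTensor(),                             # rescale_factor = 1/255
    transforms.Normalize(mean=[0.485, 0.456, 0.406],   # image_mean
                         std=[0.229, 0.224, 0.225]),   # image_std
])
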
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:660ccec1688742a08ac50147863109a12b04472562e401ba83f158155084b971
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fc364201fbceac17ee28f0c25f6cd2003904f59d3a8ddb2469ba7bfdd346578
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,875 @@
+ {
+ "best_metric": 0.9417692129092176,
+ "best_model_checkpoint": "Crosswalk/dinov2/checkpoint-924",
+ "epoch": 22.0,
+ "eval_steps": 500,
+ "global_step": 924,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.24242424242424243,
+ "grad_norm": 1809.6781005859375,
+ "learning_rate": 9.70873786407767e-07,
+ "loss": 4.7087,
+ "step": 10
+ },
+ {
+ "epoch": 0.48484848484848486,
+ "grad_norm": 190.5601806640625,
+ "learning_rate": 1.941747572815534e-06,
+ "loss": 3.034,
+ "step": 20
+ },
+ {
+ "epoch": 0.7272727272727273,
+ "grad_norm": 76.29146575927734,
+ "learning_rate": 2.912621359223301e-06,
+ "loss": 2.0024,
+ "step": 30
+ },
+ {
+ "epoch": 0.9696969696969697,
+ "grad_norm": 75.1009750366211,
+ "learning_rate": 3.883495145631068e-06,
+ "loss": 1.4019,
+ "step": 40
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.2978227138519287,
+ "eval_macro_f1": 0.8815037150933147,
+ "eval_runtime": 7.5247,
+ "eval_samples_per_second": 43.856,
+ "eval_steps_per_second": 5.582,
+ "step": 42
+ },
+ {
+ "epoch": 1.1939393939393939,
+ "grad_norm": 86.90376281738281,
+ "learning_rate": 4.854368932038836e-06,
+ "loss": 0.7179,
+ "step": 50
+ },
+ {
+ "epoch": 1.4363636363636363,
+ "grad_norm": 118.144287109375,
+ "learning_rate": 5.825242718446602e-06,
+ "loss": 0.9737,
+ "step": 60
+ },
+ {
+ "epoch": 1.6787878787878787,
+ "grad_norm": 174.987548828125,
+ "learning_rate": 6.79611650485437e-06,
+ "loss": 1.3976,
+ "step": 70
+ },
+ {
+ "epoch": 1.9212121212121214,
+ "grad_norm": 35.86643981933594,
+ "learning_rate": 7.766990291262136e-06,
+ "loss": 0.967,
+ "step": 80
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.22084859013557434,
+ "eval_macro_f1": 0.9229339286881953,
+ "eval_runtime": 5.8751,
+ "eval_samples_per_second": 56.169,
+ "eval_steps_per_second": 7.149,
+ "step": 84
+ },
+ {
+ "epoch": 2.1454545454545455,
+ "grad_norm": 232.57371520996094,
+ "learning_rate": 8.737864077669904e-06,
+ "loss": 1.323,
+ "step": 90
+ },
+ {
+ "epoch": 2.3878787878787877,
+ "grad_norm": 72.19599914550781,
+ "learning_rate": 9.708737864077671e-06,
+ "loss": 1.005,
+ "step": 100
+ },
+ {
+ "epoch": 2.6303030303030304,
+ "grad_norm": 55.63515853881836,
+ "learning_rate": 9.924078091106291e-06,
+ "loss": 0.6989,
+ "step": 110
+ },
+ {
+ "epoch": 2.8727272727272726,
+ "grad_norm": 56.631591796875,
+ "learning_rate": 9.815618221258135e-06,
+ "loss": 0.7527,
+ "step": 120
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.3025018870830536,
+ "eval_macro_f1": 0.9009343690194753,
+ "eval_runtime": 6.0811,
+ "eval_samples_per_second": 54.266,
+ "eval_steps_per_second": 6.907,
+ "step": 126
+ },
+ {
+ "epoch": 3.096969696969697,
+ "grad_norm": 40.3350715637207,
+ "learning_rate": 9.70715835140998e-06,
+ "loss": 0.6221,
+ "step": 130
+ },
+ {
+ "epoch": 3.3393939393939394,
+ "grad_norm": 95.85454559326172,
+ "learning_rate": 9.598698481561823e-06,
+ "loss": 0.9071,
+ "step": 140
+ },
+ {
+ "epoch": 3.581818181818182,
+ "grad_norm": 94.18701934814453,
+ "learning_rate": 9.490238611713667e-06,
+ "loss": 0.7042,
+ "step": 150
+ },
+ {
+ "epoch": 3.824242424242424,
+ "grad_norm": 477.8069763183594,
+ "learning_rate": 9.38177874186551e-06,
+ "loss": 0.635,
+ "step": 160
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.22365985810756683,
+ "eval_macro_f1": 0.9049918736939866,
+ "eval_runtime": 5.9294,
+ "eval_samples_per_second": 55.655,
+ "eval_steps_per_second": 7.083,
+ "step": 168
+ },
+ {
+ "epoch": 4.048484848484849,
+ "grad_norm": 110.27919006347656,
+ "learning_rate": 9.273318872017354e-06,
+ "loss": 1.1381,
+ "step": 170
+ },
+ {
+ "epoch": 4.290909090909091,
+ "grad_norm": 58.735958099365234,
+ "learning_rate": 9.1648590021692e-06,
+ "loss": 0.7225,
+ "step": 180
+ },
+ {
+ "epoch": 4.533333333333333,
+ "grad_norm": 75.20926666259766,
+ "learning_rate": 9.056399132321042e-06,
+ "loss": 0.4634,
+ "step": 190
+ },
+ {
+ "epoch": 4.775757575757575,
+ "grad_norm": 18.876136779785156,
+ "learning_rate": 8.947939262472886e-06,
+ "loss": 0.6293,
+ "step": 200
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 8.281109809875488,
+ "learning_rate": 8.83947939262473e-06,
+ "loss": 0.6632,
+ "step": 210
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.2299780696630478,
+ "eval_macro_f1": 0.9176304185040354,
+ "eval_runtime": 6.0281,
+ "eval_samples_per_second": 54.744,
+ "eval_steps_per_second": 6.967,
+ "step": 210
+ },
+ {
+ "epoch": 5.242424242424242,
+ "grad_norm": 13.756321907043457,
+ "learning_rate": 8.731019522776574e-06,
+ "loss": 0.4708,
+ "step": 220
+ },
+ {
+ "epoch": 5.484848484848484,
+ "grad_norm": 59.22605895996094,
+ "learning_rate": 8.622559652928418e-06,
+ "loss": 0.7127,
+ "step": 230
+ },
+ {
+ "epoch": 5.7272727272727275,
+ "grad_norm": 32.43043899536133,
+ "learning_rate": 8.514099783080262e-06,
+ "loss": 0.5682,
+ "step": 240
+ },
+ {
+ "epoch": 5.96969696969697,
+ "grad_norm": 54.722599029541016,
+ "learning_rate": 8.405639913232104e-06,
+ "loss": 0.8667,
+ "step": 250
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 0.2767850160598755,
+ "eval_macro_f1": 0.9210700618192522,
+ "eval_runtime": 6.2004,
+ "eval_samples_per_second": 53.223,
+ "eval_steps_per_second": 6.774,
+ "step": 252
+ },
+ {
+ "epoch": 6.193939393939394,
+ "grad_norm": 18.175823211669922,
+ "learning_rate": 8.29718004338395e-06,
+ "loss": 0.6752,
+ "step": 260
+ },
+ {
+ "epoch": 6.4363636363636365,
+ "grad_norm": 56.679988861083984,
+ "learning_rate": 8.188720173535792e-06,
+ "loss": 0.3727,
+ "step": 270
+ },
+ {
+ "epoch": 6.678787878787879,
+ "grad_norm": 57.3917236328125,
+ "learning_rate": 8.080260303687636e-06,
+ "loss": 1.0167,
+ "step": 280
+ },
+ {
+ "epoch": 6.921212121212121,
+ "grad_norm": 42.186038970947266,
+ "learning_rate": 7.97180043383948e-06,
+ "loss": 0.9377,
+ "step": 290
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 0.29274508357048035,
+ "eval_macro_f1": 0.9138863000931967,
+ "eval_runtime": 6.1213,
+ "eval_samples_per_second": 53.91,
+ "eval_steps_per_second": 6.861,
+ "step": 294
+ },
+ {
+ "epoch": 7.1454545454545455,
+ "grad_norm": 41.31782531738281,
+ "learning_rate": 7.863340563991324e-06,
+ "loss": 0.3818,
+ "step": 300
+ },
+ {
+ "epoch": 7.387878787878788,
+ "grad_norm": 4.223178863525391,
+ "learning_rate": 7.754880694143168e-06,
+ "loss": 0.4503,
+ "step": 310
+ },
+ {
+ "epoch": 7.63030303030303,
+ "grad_norm": 35.64258575439453,
+ "learning_rate": 7.646420824295012e-06,
+ "loss": 0.6038,
+ "step": 320
+ },
+ {
+ "epoch": 7.872727272727273,
+ "grad_norm": 37.91206359863281,
+ "learning_rate": 7.537960954446856e-06,
+ "loss": 0.5407,
+ "step": 330
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 0.20143219828605652,
+ "eval_macro_f1": 0.9357970705676355,
+ "eval_runtime": 5.9715,
+ "eval_samples_per_second": 55.263,
+ "eval_steps_per_second": 7.033,
+ "step": 336
+ },
+ {
+ "epoch": 8.096969696969698,
+ "grad_norm": 9.571391105651855,
+ "learning_rate": 7.429501084598699e-06,
+ "loss": 0.3311,
+ "step": 340
+ },
+ {
+ "epoch": 8.33939393939394,
+ "grad_norm": 30.14655876159668,
+ "learning_rate": 7.321041214750543e-06,
+ "loss": 0.5367,
+ "step": 350
+ },
+ {
+ "epoch": 8.581818181818182,
+ "grad_norm": 125.38350677490234,
+ "learning_rate": 7.212581344902386e-06,
+ "loss": 0.4511,
+ "step": 360
+ },
+ {
+ "epoch": 8.824242424242424,
+ "grad_norm": 283.20819091796875,
+ "learning_rate": 7.104121475054231e-06,
+ "loss": 0.5474,
+ "step": 370
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 0.329227477312088,
+ "eval_macro_f1": 0.8817302125547928,
+ "eval_runtime": 5.984,
+ "eval_samples_per_second": 55.147,
+ "eval_steps_per_second": 7.019,
+ "step": 378
+ },
+ {
+ "epoch": 9.048484848484849,
+ "grad_norm": 31.927379608154297,
+ "learning_rate": 6.995661605206075e-06,
+ "loss": 0.3963,
+ "step": 380
+ },
+ {
+ "epoch": 9.290909090909091,
+ "grad_norm": 10.10098934173584,
+ "learning_rate": 6.887201735357918e-06,
+ "loss": 0.445,
+ "step": 390
+ },
+ {
+ "epoch": 9.533333333333333,
+ "grad_norm": 1.947770118713379,
+ "learning_rate": 6.778741865509761e-06,
+ "loss": 0.5947,
+ "step": 400
+ },
+ {
+ "epoch": 9.775757575757575,
+ "grad_norm": 54.11802673339844,
+ "learning_rate": 6.670281995661606e-06,
+ "loss": 0.6001,
+ "step": 410
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.004518165718764067,
+ "learning_rate": 6.56182212581345e-06,
+ "loss": 0.412,
+ "step": 420
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 0.3594599962234497,
+ "eval_macro_f1": 0.907735321528425,
+ "eval_runtime": 6.0125,
+ "eval_samples_per_second": 54.885,
+ "eval_steps_per_second": 6.985,
+ "step": 420
+ },
+ {
+ "epoch": 10.242424242424242,
+ "grad_norm": 0.22337216138839722,
+ "learning_rate": 6.453362255965293e-06,
+ "loss": 0.2798,
+ "step": 430
+ },
+ {
+ "epoch": 10.484848484848484,
+ "grad_norm": 39.639984130859375,
+ "learning_rate": 6.344902386117138e-06,
+ "loss": 0.4377,
+ "step": 440
+ },
+ {
+ "epoch": 10.727272727272727,
+ "grad_norm": 53.85198211669922,
+ "learning_rate": 6.236442516268981e-06,
+ "loss": 0.2063,
+ "step": 450
+ },
+ {
+ "epoch": 10.969696969696969,
+ "grad_norm": 69.08942413330078,
+ "learning_rate": 6.127982646420825e-06,
+ "loss": 0.2884,
+ "step": 460
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 0.2930862307548523,
+ "eval_macro_f1": 0.9380839806371721,
+ "eval_runtime": 6.0147,
+ "eval_samples_per_second": 54.866,
+ "eval_steps_per_second": 6.983,
+ "step": 462
+ },
+ {
+ "epoch": 11.193939393939393,
+ "grad_norm": 63.143218994140625,
+ "learning_rate": 6.019522776572668e-06,
+ "loss": 0.6075,
+ "step": 470
+ },
+ {
+ "epoch": 11.436363636363636,
+ "grad_norm": 7.950187683105469,
+ "learning_rate": 5.911062906724513e-06,
+ "loss": 0.2654,
+ "step": 480
+ },
+ {
+ "epoch": 11.67878787878788,
+ "grad_norm": 82.30758666992188,
+ "learning_rate": 5.802603036876356e-06,
+ "loss": 0.2474,
+ "step": 490
+ },
+ {
+ "epoch": 11.921212121212122,
+ "grad_norm": 7.340689182281494,
+ "learning_rate": 5.6941431670282e-06,
+ "loss": 0.2405,
+ "step": 500
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 0.3316686451435089,
+ "eval_macro_f1": 0.9209216589861751,
+ "eval_runtime": 5.8916,
+ "eval_samples_per_second": 56.012,
+ "eval_steps_per_second": 7.129,
+ "step": 504
+ },
+ {
+ "epoch": 12.145454545454545,
+ "grad_norm": 37.141632080078125,
+ "learning_rate": 5.585683297180043e-06,
+ "loss": 0.3349,
+ "step": 510
+ },
+ {
+ "epoch": 12.387878787878789,
+ "grad_norm": 34.024383544921875,
+ "learning_rate": 5.477223427331888e-06,
+ "loss": 0.1742,
+ "step": 520
+ },
+ {
+ "epoch": 12.63030303030303,
+ "grad_norm": 46.10781478881836,
+ "learning_rate": 5.368763557483731e-06,
+ "loss": 0.2115,
+ "step": 530
+ },
+ {
+ "epoch": 12.872727272727273,
+ "grad_norm": 126.67015838623047,
+ "learning_rate": 5.260303687635575e-06,
+ "loss": 0.8788,
+ "step": 540
+ },
+ {
+ "epoch": 13.0,
+ "eval_loss": 0.37741926312446594,
+ "eval_macro_f1": 0.9058106453305834,
+ "eval_runtime": 6.6329,
+ "eval_samples_per_second": 49.752,
+ "eval_steps_per_second": 6.332,
+ "step": 546
+ },
+ {
+ "epoch": 13.096969696969698,
+ "grad_norm": 35.44662094116211,
+ "learning_rate": 5.151843817787418e-06,
+ "loss": 0.5591,
+ "step": 550
+ },
+ {
+ "epoch": 13.33939393939394,
+ "grad_norm": 57.34544372558594,
+ "learning_rate": 5.043383947939263e-06,
+ "loss": 0.213,
+ "step": 560
+ },
+ {
+ "epoch": 13.581818181818182,
+ "grad_norm": 15.285223960876465,
+ "learning_rate": 4.934924078091107e-06,
+ "loss": 0.203,
+ "step": 570
+ },
+ {
+ "epoch": 13.824242424242424,
+ "grad_norm": 28.99003028869629,
+ "learning_rate": 4.82646420824295e-06,
+ "loss": 0.4163,
+ "step": 580
+ },
+ {
+ "epoch": 14.0,
+ "eval_loss": 0.39865490794181824,
+ "eval_macro_f1": 0.9196508840275697,
+ "eval_runtime": 5.8701,
+ "eval_samples_per_second": 56.217,
+ "eval_steps_per_second": 7.155,
+ "step": 588
+ },
+ {
+ "epoch": 14.048484848484849,
+ "grad_norm": 346.8255310058594,
+ "learning_rate": 4.718004338394794e-06,
+ "loss": 0.26,
+ "step": 590
+ },
+ {
+ "epoch": 14.290909090909091,
+ "grad_norm": 38.04654312133789,
+ "learning_rate": 4.609544468546638e-06,
+ "loss": 0.3813,
+ "step": 600
+ },
+ {
+ "epoch": 14.533333333333333,
+ "grad_norm": 34.71643829345703,
+ "learning_rate": 4.501084598698482e-06,
+ "loss": 0.0974,
+ "step": 610
+ },
+ {
+ "epoch": 14.775757575757575,
+ "grad_norm": 34.031890869140625,
+ "learning_rate": 4.392624728850326e-06,
+ "loss": 0.4881,
+ "step": 620
+ },
+ {
+ "epoch": 15.0,
+ "grad_norm": 0.0002771662548184395,
+ "learning_rate": 4.284164859002169e-06,
+ "loss": 0.4126,
+ "step": 630
+ },
+ {
+ "epoch": 15.0,
+ "eval_loss": 0.35451531410217285,
+ "eval_macro_f1": 0.9235679411519468,
+ "eval_runtime": 6.0428,
+ "eval_samples_per_second": 54.611,
+ "eval_steps_per_second": 6.95,
+ "step": 630
+ },
+ {
+ "epoch": 15.242424242424242,
+ "grad_norm": 82.48748779296875,
+ "learning_rate": 4.175704989154013e-06,
+ "loss": 0.4444,
+ "step": 640
+ },
+ {
+ "epoch": 15.484848484848484,
+ "grad_norm": 0.2618753910064697,
+ "learning_rate": 4.067245119305857e-06,
+ "loss": 0.2083,
+ "step": 650
+ },
+ {
+ "epoch": 15.727272727272727,
+ "grad_norm": 98.34405517578125,
+ "learning_rate": 3.958785249457701e-06,
+ "loss": 0.4785,
+ "step": 660
+ },
+ {
+ "epoch": 15.969696969696969,
+ "grad_norm": 0.37142229080200195,
+ "learning_rate": 3.8503253796095445e-06,
+ "loss": 0.1583,
+ "step": 670
+ },
+ {
+ "epoch": 16.0,
+ "eval_loss": 0.38117873668670654,
+ "eval_macro_f1": 0.9268860086407444,
+ "eval_runtime": 6.9311,
+ "eval_samples_per_second": 47.612,
+ "eval_steps_per_second": 6.06,
+ "step": 672
+ },
+ {
+ "epoch": 16.193939393939395,
+ "grad_norm": 40.54330825805664,
+ "learning_rate": 3.741865509761389e-06,
+ "loss": 0.0774,
+ "step": 680
+ },
+ {
+ "epoch": 16.436363636363637,
+ "grad_norm": 0.23675695061683655,
+ "learning_rate": 3.6334056399132324e-06,
+ "loss": 0.1639,
+ "step": 690
+ },
+ {
+ "epoch": 16.67878787878788,
+ "grad_norm": 47.12529373168945,
+ "learning_rate": 3.5249457700650764e-06,
+ "loss": 0.306,
+ "step": 700
+ },
+ {
+ "epoch": 16.921212121212122,
+ "grad_norm": 0.3993530571460724,
+ "learning_rate": 3.41648590021692e-06,
+ "loss": 0.2376,
+ "step": 710
+ },
+ {
+ "epoch": 17.0,
+ "eval_loss": 0.4087267816066742,
+ "eval_macro_f1": 0.9295990205081115,
+ "eval_runtime": 6.1306,
+ "eval_samples_per_second": 53.828,
+ "eval_steps_per_second": 6.851,
+ "step": 714
+ },
+ {
+ "epoch": 17.145454545454545,
+ "grad_norm": 0.34205177426338196,
+ "learning_rate": 3.308026030368764e-06,
+ "loss": 0.0332,
+ "step": 720
+ },
+ {
+ "epoch": 17.387878787878787,
+ "grad_norm": 0.5112647414207458,
+ "learning_rate": 3.1995661605206075e-06,
+ "loss": 0.1332,
+ "step": 730
+ },
+ {
+ "epoch": 17.63030303030303,
+ "grad_norm": 120.2950439453125,
+ "learning_rate": 3.0911062906724515e-06,
+ "loss": 0.2503,
+ "step": 740
+ },
+ {
+ "epoch": 17.87272727272727,
+ "grad_norm": 223.04759216308594,
+ "learning_rate": 2.982646420824295e-06,
+ "loss": 0.2703,
+ "step": 750
+ },
+ {
+ "epoch": 18.0,
+ "eval_loss": 0.43362897634506226,
+ "eval_macro_f1": 0.9264924264924266,
+ "eval_runtime": 5.861,
+ "eval_samples_per_second": 56.305,
+ "eval_steps_per_second": 7.166,
+ "step": 756
+ },
+ {
+ "epoch": 18.096969696969698,
+ "grad_norm": 54.66193771362305,
+ "learning_rate": 2.874186550976139e-06,
+ "loss": 0.1274,
+ "step": 760
+ },
+ {
+ "epoch": 18.33939393939394,
+ "grad_norm": 54.846466064453125,
+ "learning_rate": 2.765726681127983e-06,
+ "loss": 0.2751,
+ "step": 770
+ },
+ {
+ "epoch": 18.581818181818182,
+ "grad_norm": 53.97863006591797,
+ "learning_rate": 2.6572668112798266e-06,
+ "loss": 0.359,
+ "step": 780
+ },
+ {
+ "epoch": 18.824242424242424,
+ "grad_norm": 81.63549041748047,
+ "learning_rate": 2.5488069414316706e-06,
+ "loss": 0.1819,
+ "step": 790
+ },
+ {
+ "epoch": 19.0,
+ "eval_loss": 0.3480012118816376,
+ "eval_macro_f1": 0.9236528192931639,
+ "eval_runtime": 7.0471,
+ "eval_samples_per_second": 46.828,
+ "eval_steps_per_second": 5.96,
+ "step": 798
+ },
+ {
+ "epoch": 19.048484848484847,
+ "grad_norm": 41.54087448120117,
+ "learning_rate": 2.440347071583514e-06,
+ "loss": 0.6373,
+ "step": 800
+ },
+ {
+ "epoch": 19.29090909090909,
+ "grad_norm": 9.001028060913086,
+ "learning_rate": 2.331887201735358e-06,
+ "loss": 0.2971,
+ "step": 810
+ },
+ {
+ "epoch": 19.533333333333335,
+ "grad_norm": 114.6279525756836,
+ "learning_rate": 2.2234273318872017e-06,
+ "loss": 0.1943,
+ "step": 820
+ },
+ {
+ "epoch": 19.775757575757577,
+ "grad_norm": 18.022676467895508,
+ "learning_rate": 2.1149674620390457e-06,
+ "loss": 0.1207,
+ "step": 830
+ },
+ {
+ "epoch": 20.0,
+ "grad_norm": 0.0001882202341221273,
+ "learning_rate": 2.0065075921908892e-06,
+ "loss": 0.1324,
+ "step": 840
+ },
+ {
+ "epoch": 20.0,
+ "eval_loss": 0.4493299424648285,
+ "eval_macro_f1": 0.9384902143522833,
+ "eval_runtime": 6.0147,
+ "eval_samples_per_second": 54.865,
+ "eval_steps_per_second": 6.983,
+ "step": 840
+ },
+ {
+ "epoch": 20.242424242424242,
+ "grad_norm": 13.740226745605469,
+ "learning_rate": 1.8980477223427332e-06,
+ "loss": 0.294,
+ "step": 850
+ },
+ {
+ "epoch": 20.484848484848484,
+ "grad_norm": 6.870513916015625,
+ "learning_rate": 1.7895878524945772e-06,
+ "loss": 0.1323,
+ "step": 860
+ },
+ {
+ "epoch": 20.727272727272727,
+ "grad_norm": 2.74729585647583,
+ "learning_rate": 1.681127982646421e-06,
+ "loss": 0.019,
+ "step": 870
+ },
+ {
+ "epoch": 20.96969696969697,
+ "grad_norm": 17.18338966369629,
+ "learning_rate": 1.572668112798265e-06,
+ "loss": 0.1312,
+ "step": 880
+ },
+ {
+ "epoch": 21.0,
+ "eval_loss": 0.40448498725891113,
+ "eval_macro_f1": 0.9384902143522833,
+ "eval_runtime": 6.133,
+ "eval_samples_per_second": 53.808,
+ "eval_steps_per_second": 6.848,
+ "step": 882
+ },
+ {
+ "epoch": 21.193939393939395,
+ "grad_norm": 108.9833755493164,
+ "learning_rate": 1.4642082429501087e-06,
+ "loss": 0.2499,
+ "step": 890
+ },
+ {
+ "epoch": 21.436363636363637,
+ "grad_norm": 0.291847825050354,
+ "learning_rate": 1.3557483731019525e-06,
+ "loss": 0.1708,
+ "step": 900
+ },
+ {
+ "epoch": 21.67878787878788,
+ "grad_norm": 35.08168029785156,
+ "learning_rate": 1.2472885032537963e-06,
+ "loss": 0.0802,
+ "step": 910
+ },
+ {
+ "epoch": 21.921212121212122,
+ "grad_norm": 0.05401836335659027,
+ "learning_rate": 1.13882863340564e-06,
+ "loss": 0.1662,
+ "step": 920
+ },
+ {
+ "epoch": 22.0,
+ "eval_loss": 0.3166828453540802,
+ "eval_macro_f1": 0.9417692129092176,
+ "eval_runtime": 6.0442,
+ "eval_samples_per_second": 54.598,
+ "eval_steps_per_second": 6.949,
+ "step": 924
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1025,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 25,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.0371596050603966e+19,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
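
trainer_state.json records the full training log: the best macro-F1 of roughly 0.9418 was reached at step 924 (epoch 22 of a planned 25), at the checkpoint saved as checkpoint-924. A small sketch for inspecting the file offline (the local path is an assumption; point it at the downloaded file):

import json

with open("trainer_state.json") as f:
    state = json.load(f)

print(state["best_metric"], state["best_model_checkpoint"])
# eval entries carry "eval_macro_f1"; training entries carry "loss"
for entry in state["log_history"]:
    if "eval_macro_f1" in entry:
        print(f'epoch {entry["epoch"]:>4.0f}  macro-F1 {entry["eval_macro_f1"]:.4f}')
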
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0b1d45c6c028d437456c5af083b3618770c1bf92ea996d351710771438e3073
+ size 5304