bmedeiros commited on
Commit
678fb07
·
verified ·
1 Parent(s): ef1d5bb

End of training

Browse files
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.7617021276595745
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [facebook/vit-msn-base](https://huggingface.co/facebook/vit-msn-base) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.5346
37
- - Accuracy: 0.7617
38
 
39
  ## Model description
40
 
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.9234042553191489
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [facebook/vit-msn-base](https://huggingface.co/facebook/vit-msn-base) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.2414
37
+ - Accuracy: 0.9234
38
 
39
  ## Model description
40
 
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 76.8,
3
+ "eval_accuracy": 0.9234042553191489,
4
+ "eval_loss": 0.24138842523097992,
5
+ "eval_runtime": 4.1974,
6
+ "eval_samples_per_second": 111.975,
7
+ "eval_steps_per_second": 3.574,
8
+ "total_flos": 4.5903154968099717e+18,
9
+ "train_loss": 0.19423907659947873,
10
+ "train_runtime": 1687.359,
11
+ "train_samples_per_second": 36.554,
12
+ "train_steps_per_second": 0.284
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 76.8,
3
+ "eval_accuracy": 0.9234042553191489,
4
+ "eval_loss": 0.24138842523097992,
5
+ "eval_runtime": 4.1974,
6
+ "eval_samples_per_second": 111.975,
7
+ "eval_steps_per_second": 3.574
8
+ }
runs/Dec11_13-06-33_c3e1296b588a/events.out.tfevents.1733924630.c3e1296b588a.292864.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:374fd6200b40c7636a17200d0aa24cba92819919e5243b7045e0b084f8c73f82
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 76.8,
3
+ "total_flos": 4.5903154968099717e+18,
4
+ "train_loss": 0.19423907659947873,
5
+ "train_runtime": 1687.359,
6
+ "train_samples_per_second": 36.554,
7
+ "train_steps_per_second": 0.284
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1071 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9234042553191489,
3
+ "best_model_checkpoint": "vit-msn-base-finetuned-lf-invalidation/checkpoint-62",
4
+ "epoch": 76.8,
5
+ "eval_steps": 500,
6
+ "global_step": 480,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.96,
13
+ "eval_accuracy": 0.6957446808510638,
14
+ "eval_loss": 0.6511951088905334,
15
+ "eval_runtime": 4.1128,
16
+ "eval_samples_per_second": 114.277,
17
+ "eval_steps_per_second": 3.647,
18
+ "step": 6
19
+ },
20
+ {
21
+ "epoch": 1.6,
22
+ "grad_norm": 100.99421691894531,
23
+ "learning_rate": 1.0416666666666668e-05,
24
+ "loss": 0.7053,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 1.92,
29
+ "eval_accuracy": 0.6808510638297872,
30
+ "eval_loss": 0.6310930848121643,
31
+ "eval_runtime": 4.1662,
32
+ "eval_samples_per_second": 112.813,
33
+ "eval_steps_per_second": 3.6,
34
+ "step": 12
35
+ },
36
+ {
37
+ "epoch": 2.88,
38
+ "eval_accuracy": 0.7276595744680852,
39
+ "eval_loss": 0.5360996127128601,
40
+ "eval_runtime": 4.0581,
41
+ "eval_samples_per_second": 115.819,
42
+ "eval_steps_per_second": 3.696,
43
+ "step": 18
44
+ },
45
+ {
46
+ "epoch": 3.2,
47
+ "grad_norm": 14.486075401306152,
48
+ "learning_rate": 2.0833333333333336e-05,
49
+ "loss": 0.5163,
50
+ "step": 20
51
+ },
52
+ {
53
+ "epoch": 4.0,
54
+ "eval_accuracy": 0.8680851063829788,
55
+ "eval_loss": 0.3341110050678253,
56
+ "eval_runtime": 3.919,
57
+ "eval_samples_per_second": 119.928,
58
+ "eval_steps_per_second": 3.827,
59
+ "step": 25
60
+ },
61
+ {
62
+ "epoch": 4.8,
63
+ "grad_norm": 19.806690216064453,
64
+ "learning_rate": 3.125e-05,
65
+ "loss": 0.3242,
66
+ "step": 30
67
+ },
68
+ {
69
+ "epoch": 4.96,
70
+ "eval_accuracy": 0.8808510638297873,
71
+ "eval_loss": 0.3167176842689514,
72
+ "eval_runtime": 3.8854,
73
+ "eval_samples_per_second": 120.965,
74
+ "eval_steps_per_second": 3.861,
75
+ "step": 31
76
+ },
77
+ {
78
+ "epoch": 5.92,
79
+ "eval_accuracy": 0.8191489361702128,
80
+ "eval_loss": 0.39598795771598816,
81
+ "eval_runtime": 4.17,
82
+ "eval_samples_per_second": 112.71,
83
+ "eval_steps_per_second": 3.597,
84
+ "step": 37
85
+ },
86
+ {
87
+ "epoch": 6.4,
88
+ "grad_norm": 18.975563049316406,
89
+ "learning_rate": 4.166666666666667e-05,
90
+ "loss": 0.2779,
91
+ "step": 40
92
+ },
93
+ {
94
+ "epoch": 6.88,
95
+ "eval_accuracy": 0.825531914893617,
96
+ "eval_loss": 0.3817645013332367,
97
+ "eval_runtime": 3.9864,
98
+ "eval_samples_per_second": 117.9,
99
+ "eval_steps_per_second": 3.763,
100
+ "step": 43
101
+ },
102
+ {
103
+ "epoch": 8.0,
104
+ "grad_norm": 50.12718963623047,
105
+ "learning_rate": 4.976851851851852e-05,
106
+ "loss": 0.2348,
107
+ "step": 50
108
+ },
109
+ {
110
+ "epoch": 8.0,
111
+ "eval_accuracy": 0.7361702127659574,
112
+ "eval_loss": 0.5018748641014099,
113
+ "eval_runtime": 4.0244,
114
+ "eval_samples_per_second": 116.789,
115
+ "eval_steps_per_second": 3.727,
116
+ "step": 50
117
+ },
118
+ {
119
+ "epoch": 8.96,
120
+ "eval_accuracy": 0.8851063829787233,
121
+ "eval_loss": 0.29437732696533203,
122
+ "eval_runtime": 3.863,
123
+ "eval_samples_per_second": 121.668,
124
+ "eval_steps_per_second": 3.883,
125
+ "step": 56
126
+ },
127
+ {
128
+ "epoch": 9.6,
129
+ "grad_norm": 87.47682189941406,
130
+ "learning_rate": 4.8611111111111115e-05,
131
+ "loss": 0.26,
132
+ "step": 60
133
+ },
134
+ {
135
+ "epoch": 9.92,
136
+ "eval_accuracy": 0.9234042553191489,
137
+ "eval_loss": 0.24138842523097992,
138
+ "eval_runtime": 3.9931,
139
+ "eval_samples_per_second": 117.702,
140
+ "eval_steps_per_second": 3.756,
141
+ "step": 62
142
+ },
143
+ {
144
+ "epoch": 10.88,
145
+ "eval_accuracy": 0.8297872340425532,
146
+ "eval_loss": 0.36644989252090454,
147
+ "eval_runtime": 3.8761,
148
+ "eval_samples_per_second": 121.257,
149
+ "eval_steps_per_second": 3.87,
150
+ "step": 68
151
+ },
152
+ {
153
+ "epoch": 11.2,
154
+ "grad_norm": 37.262699127197266,
155
+ "learning_rate": 4.745370370370371e-05,
156
+ "loss": 0.2778,
157
+ "step": 70
158
+ },
159
+ {
160
+ "epoch": 12.0,
161
+ "eval_accuracy": 0.9042553191489362,
162
+ "eval_loss": 0.2505495548248291,
163
+ "eval_runtime": 3.8837,
164
+ "eval_samples_per_second": 121.018,
165
+ "eval_steps_per_second": 3.862,
166
+ "step": 75
167
+ },
168
+ {
169
+ "epoch": 12.8,
170
+ "grad_norm": 39.793601989746094,
171
+ "learning_rate": 4.62962962962963e-05,
172
+ "loss": 0.2271,
173
+ "step": 80
174
+ },
175
+ {
176
+ "epoch": 12.96,
177
+ "eval_accuracy": 0.6297872340425532,
178
+ "eval_loss": 0.6277480721473694,
179
+ "eval_runtime": 3.9373,
180
+ "eval_samples_per_second": 119.372,
181
+ "eval_steps_per_second": 3.81,
182
+ "step": 81
183
+ },
184
+ {
185
+ "epoch": 13.92,
186
+ "eval_accuracy": 0.874468085106383,
187
+ "eval_loss": 0.275258332490921,
188
+ "eval_runtime": 3.9907,
189
+ "eval_samples_per_second": 117.773,
190
+ "eval_steps_per_second": 3.759,
191
+ "step": 87
192
+ },
193
+ {
194
+ "epoch": 14.4,
195
+ "grad_norm": 45.56210708618164,
196
+ "learning_rate": 4.5138888888888894e-05,
197
+ "loss": 0.2488,
198
+ "step": 90
199
+ },
200
+ {
201
+ "epoch": 14.88,
202
+ "eval_accuracy": 0.6957446808510638,
203
+ "eval_loss": 0.6249393820762634,
204
+ "eval_runtime": 4.032,
205
+ "eval_samples_per_second": 116.567,
206
+ "eval_steps_per_second": 3.72,
207
+ "step": 93
208
+ },
209
+ {
210
+ "epoch": 16.0,
211
+ "grad_norm": 39.393192291259766,
212
+ "learning_rate": 4.3981481481481486e-05,
213
+ "loss": 0.2729,
214
+ "step": 100
215
+ },
216
+ {
217
+ "epoch": 16.0,
218
+ "eval_accuracy": 0.7148936170212766,
219
+ "eval_loss": 0.519493043422699,
220
+ "eval_runtime": 4.1537,
221
+ "eval_samples_per_second": 113.151,
222
+ "eval_steps_per_second": 3.611,
223
+ "step": 100
224
+ },
225
+ {
226
+ "epoch": 16.96,
227
+ "eval_accuracy": 0.574468085106383,
228
+ "eval_loss": 0.7983953952789307,
229
+ "eval_runtime": 4.0505,
230
+ "eval_samples_per_second": 116.036,
231
+ "eval_steps_per_second": 3.703,
232
+ "step": 106
233
+ },
234
+ {
235
+ "epoch": 17.6,
236
+ "grad_norm": 52.11545944213867,
237
+ "learning_rate": 4.282407407407408e-05,
238
+ "loss": 0.3261,
239
+ "step": 110
240
+ },
241
+ {
242
+ "epoch": 17.92,
243
+ "eval_accuracy": 0.7723404255319148,
244
+ "eval_loss": 0.4630971848964691,
245
+ "eval_runtime": 4.0296,
246
+ "eval_samples_per_second": 116.636,
247
+ "eval_steps_per_second": 3.722,
248
+ "step": 112
249
+ },
250
+ {
251
+ "epoch": 18.88,
252
+ "eval_accuracy": 0.5148936170212766,
253
+ "eval_loss": 1.100958228111267,
254
+ "eval_runtime": 4.0952,
255
+ "eval_samples_per_second": 114.768,
256
+ "eval_steps_per_second": 3.663,
257
+ "step": 118
258
+ },
259
+ {
260
+ "epoch": 19.2,
261
+ "grad_norm": 80.0373764038086,
262
+ "learning_rate": 4.166666666666667e-05,
263
+ "loss": 0.2212,
264
+ "step": 120
265
+ },
266
+ {
267
+ "epoch": 20.0,
268
+ "eval_accuracy": 0.9170212765957447,
269
+ "eval_loss": 0.23374585807323456,
270
+ "eval_runtime": 4.0961,
271
+ "eval_samples_per_second": 114.744,
272
+ "eval_steps_per_second": 3.662,
273
+ "step": 125
274
+ },
275
+ {
276
+ "epoch": 20.8,
277
+ "grad_norm": 2.179731607437134,
278
+ "learning_rate": 4.0509259259259265e-05,
279
+ "loss": 0.2802,
280
+ "step": 130
281
+ },
282
+ {
283
+ "epoch": 20.96,
284
+ "eval_accuracy": 0.7574468085106383,
285
+ "eval_loss": 0.46376925706863403,
286
+ "eval_runtime": 4.1103,
287
+ "eval_samples_per_second": 114.347,
288
+ "eval_steps_per_second": 3.649,
289
+ "step": 131
290
+ },
291
+ {
292
+ "epoch": 21.92,
293
+ "eval_accuracy": 0.8361702127659575,
294
+ "eval_loss": 0.38592880964279175,
295
+ "eval_runtime": 4.1405,
296
+ "eval_samples_per_second": 113.513,
297
+ "eval_steps_per_second": 3.623,
298
+ "step": 137
299
+ },
300
+ {
301
+ "epoch": 22.4,
302
+ "grad_norm": 4.62510871887207,
303
+ "learning_rate": 3.935185185185186e-05,
304
+ "loss": 0.2112,
305
+ "step": 140
306
+ },
307
+ {
308
+ "epoch": 22.88,
309
+ "eval_accuracy": 0.6893617021276596,
310
+ "eval_loss": 0.6708246469497681,
311
+ "eval_runtime": 4.1195,
312
+ "eval_samples_per_second": 114.091,
313
+ "eval_steps_per_second": 3.641,
314
+ "step": 143
315
+ },
316
+ {
317
+ "epoch": 24.0,
318
+ "grad_norm": 5.395915508270264,
319
+ "learning_rate": 3.8194444444444444e-05,
320
+ "loss": 0.2231,
321
+ "step": 150
322
+ },
323
+ {
324
+ "epoch": 24.0,
325
+ "eval_accuracy": 0.8680851063829788,
326
+ "eval_loss": 0.3386794626712799,
327
+ "eval_runtime": 4.0237,
328
+ "eval_samples_per_second": 116.808,
329
+ "eval_steps_per_second": 3.728,
330
+ "step": 150
331
+ },
332
+ {
333
+ "epoch": 24.96,
334
+ "eval_accuracy": 0.6553191489361702,
335
+ "eval_loss": 0.7044735550880432,
336
+ "eval_runtime": 4.1207,
337
+ "eval_samples_per_second": 114.059,
338
+ "eval_steps_per_second": 3.64,
339
+ "step": 156
340
+ },
341
+ {
342
+ "epoch": 25.6,
343
+ "grad_norm": 12.411273002624512,
344
+ "learning_rate": 3.7037037037037037e-05,
345
+ "loss": 0.2037,
346
+ "step": 160
347
+ },
348
+ {
349
+ "epoch": 25.92,
350
+ "eval_accuracy": 0.8276595744680851,
351
+ "eval_loss": 0.3957701325416565,
352
+ "eval_runtime": 4.0677,
353
+ "eval_samples_per_second": 115.543,
354
+ "eval_steps_per_second": 3.688,
355
+ "step": 162
356
+ },
357
+ {
358
+ "epoch": 26.88,
359
+ "eval_accuracy": 0.7702127659574468,
360
+ "eval_loss": 0.5082454681396484,
361
+ "eval_runtime": 4.0429,
362
+ "eval_samples_per_second": 116.254,
363
+ "eval_steps_per_second": 3.71,
364
+ "step": 168
365
+ },
366
+ {
367
+ "epoch": 27.2,
368
+ "grad_norm": 13.522443771362305,
369
+ "learning_rate": 3.587962962962963e-05,
370
+ "loss": 0.1845,
371
+ "step": 170
372
+ },
373
+ {
374
+ "epoch": 28.0,
375
+ "eval_accuracy": 0.723404255319149,
376
+ "eval_loss": 0.5990515351295471,
377
+ "eval_runtime": 4.0619,
378
+ "eval_samples_per_second": 115.71,
379
+ "eval_steps_per_second": 3.693,
380
+ "step": 175
381
+ },
382
+ {
383
+ "epoch": 28.8,
384
+ "grad_norm": 40.79707336425781,
385
+ "learning_rate": 3.472222222222222e-05,
386
+ "loss": 0.1898,
387
+ "step": 180
388
+ },
389
+ {
390
+ "epoch": 28.96,
391
+ "eval_accuracy": 0.7617021276595745,
392
+ "eval_loss": 0.510837197303772,
393
+ "eval_runtime": 4.1139,
394
+ "eval_samples_per_second": 114.248,
395
+ "eval_steps_per_second": 3.646,
396
+ "step": 181
397
+ },
398
+ {
399
+ "epoch": 29.92,
400
+ "eval_accuracy": 0.9085106382978724,
401
+ "eval_loss": 0.27203500270843506,
402
+ "eval_runtime": 4.1071,
403
+ "eval_samples_per_second": 114.435,
404
+ "eval_steps_per_second": 3.652,
405
+ "step": 187
406
+ },
407
+ {
408
+ "epoch": 30.4,
409
+ "grad_norm": 34.6284065246582,
410
+ "learning_rate": 3.3564814814814815e-05,
411
+ "loss": 0.2118,
412
+ "step": 190
413
+ },
414
+ {
415
+ "epoch": 30.88,
416
+ "eval_accuracy": 0.7851063829787234,
417
+ "eval_loss": 0.4935612976551056,
418
+ "eval_runtime": 4.1447,
419
+ "eval_samples_per_second": 113.398,
420
+ "eval_steps_per_second": 3.619,
421
+ "step": 193
422
+ },
423
+ {
424
+ "epoch": 32.0,
425
+ "grad_norm": 5.604860782623291,
426
+ "learning_rate": 3.240740740740741e-05,
427
+ "loss": 0.2097,
428
+ "step": 200
429
+ },
430
+ {
431
+ "epoch": 32.0,
432
+ "eval_accuracy": 0.8404255319148937,
433
+ "eval_loss": 0.37482374906539917,
434
+ "eval_runtime": 4.0858,
435
+ "eval_samples_per_second": 115.032,
436
+ "eval_steps_per_second": 3.671,
437
+ "step": 200
438
+ },
439
+ {
440
+ "epoch": 32.96,
441
+ "eval_accuracy": 0.776595744680851,
442
+ "eval_loss": 0.5048179626464844,
443
+ "eval_runtime": 4.0089,
444
+ "eval_samples_per_second": 117.24,
445
+ "eval_steps_per_second": 3.742,
446
+ "step": 206
447
+ },
448
+ {
449
+ "epoch": 33.6,
450
+ "grad_norm": 7.735608100891113,
451
+ "learning_rate": 3.125e-05,
452
+ "loss": 0.1704,
453
+ "step": 210
454
+ },
455
+ {
456
+ "epoch": 33.92,
457
+ "eval_accuracy": 0.7957446808510639,
458
+ "eval_loss": 0.43682861328125,
459
+ "eval_runtime": 4.0913,
460
+ "eval_samples_per_second": 114.879,
461
+ "eval_steps_per_second": 3.666,
462
+ "step": 212
463
+ },
464
+ {
465
+ "epoch": 34.88,
466
+ "eval_accuracy": 0.6829787234042554,
467
+ "eval_loss": 0.6958675384521484,
468
+ "eval_runtime": 4.1104,
469
+ "eval_samples_per_second": 114.345,
470
+ "eval_steps_per_second": 3.649,
471
+ "step": 218
472
+ },
473
+ {
474
+ "epoch": 35.2,
475
+ "grad_norm": 25.477895736694336,
476
+ "learning_rate": 3.0092592592592593e-05,
477
+ "loss": 0.1962,
478
+ "step": 220
479
+ },
480
+ {
481
+ "epoch": 36.0,
482
+ "eval_accuracy": 0.5957446808510638,
483
+ "eval_loss": 1.009740948677063,
484
+ "eval_runtime": 4.0288,
485
+ "eval_samples_per_second": 116.66,
486
+ "eval_steps_per_second": 3.723,
487
+ "step": 225
488
+ },
489
+ {
490
+ "epoch": 36.8,
491
+ "grad_norm": 7.080097198486328,
492
+ "learning_rate": 2.8935185185185186e-05,
493
+ "loss": 0.1686,
494
+ "step": 230
495
+ },
496
+ {
497
+ "epoch": 36.96,
498
+ "eval_accuracy": 0.7914893617021277,
499
+ "eval_loss": 0.4992178976535797,
500
+ "eval_runtime": 4.0814,
501
+ "eval_samples_per_second": 115.157,
502
+ "eval_steps_per_second": 3.675,
503
+ "step": 231
504
+ },
505
+ {
506
+ "epoch": 37.92,
507
+ "eval_accuracy": 0.7574468085106383,
508
+ "eval_loss": 0.5373654365539551,
509
+ "eval_runtime": 4.2322,
510
+ "eval_samples_per_second": 111.052,
511
+ "eval_steps_per_second": 3.544,
512
+ "step": 237
513
+ },
514
+ {
515
+ "epoch": 38.4,
516
+ "grad_norm": 39.29030227661133,
517
+ "learning_rate": 2.777777777777778e-05,
518
+ "loss": 0.1855,
519
+ "step": 240
520
+ },
521
+ {
522
+ "epoch": 38.88,
523
+ "eval_accuracy": 0.8340425531914893,
524
+ "eval_loss": 0.371025025844574,
525
+ "eval_runtime": 4.1514,
526
+ "eval_samples_per_second": 113.216,
527
+ "eval_steps_per_second": 3.613,
528
+ "step": 243
529
+ },
530
+ {
531
+ "epoch": 40.0,
532
+ "grad_norm": 21.52515983581543,
533
+ "learning_rate": 2.6620370370370372e-05,
534
+ "loss": 0.1528,
535
+ "step": 250
536
+ },
537
+ {
538
+ "epoch": 40.0,
539
+ "eval_accuracy": 0.8446808510638298,
540
+ "eval_loss": 0.3630984425544739,
541
+ "eval_runtime": 4.1723,
542
+ "eval_samples_per_second": 112.647,
543
+ "eval_steps_per_second": 3.595,
544
+ "step": 250
545
+ },
546
+ {
547
+ "epoch": 40.96,
548
+ "eval_accuracy": 0.7680851063829788,
549
+ "eval_loss": 0.5588864088058472,
550
+ "eval_runtime": 4.2314,
551
+ "eval_samples_per_second": 111.075,
552
+ "eval_steps_per_second": 3.545,
553
+ "step": 256
554
+ },
555
+ {
556
+ "epoch": 41.6,
557
+ "grad_norm": 2.9336180686950684,
558
+ "learning_rate": 2.5462962962962965e-05,
559
+ "loss": 0.1523,
560
+ "step": 260
561
+ },
562
+ {
563
+ "epoch": 41.92,
564
+ "eval_accuracy": 0.7808510638297872,
565
+ "eval_loss": 0.5147323608398438,
566
+ "eval_runtime": 4.1942,
567
+ "eval_samples_per_second": 112.059,
568
+ "eval_steps_per_second": 3.576,
569
+ "step": 262
570
+ },
571
+ {
572
+ "epoch": 42.88,
573
+ "eval_accuracy": 0.7638297872340426,
574
+ "eval_loss": 0.5298714637756348,
575
+ "eval_runtime": 4.0956,
576
+ "eval_samples_per_second": 114.756,
577
+ "eval_steps_per_second": 3.662,
578
+ "step": 268
579
+ },
580
+ {
581
+ "epoch": 43.2,
582
+ "grad_norm": 20.56193733215332,
583
+ "learning_rate": 2.4305555555555558e-05,
584
+ "loss": 0.1709,
585
+ "step": 270
586
+ },
587
+ {
588
+ "epoch": 44.0,
589
+ "eval_accuracy": 0.7446808510638298,
590
+ "eval_loss": 0.5937234163284302,
591
+ "eval_runtime": 4.0352,
592
+ "eval_samples_per_second": 116.474,
593
+ "eval_steps_per_second": 3.717,
594
+ "step": 275
595
+ },
596
+ {
597
+ "epoch": 44.8,
598
+ "grad_norm": 17.483304977416992,
599
+ "learning_rate": 2.314814814814815e-05,
600
+ "loss": 0.1527,
601
+ "step": 280
602
+ },
603
+ {
604
+ "epoch": 44.96,
605
+ "eval_accuracy": 0.7382978723404255,
606
+ "eval_loss": 0.5969159603118896,
607
+ "eval_runtime": 4.1383,
608
+ "eval_samples_per_second": 113.574,
609
+ "eval_steps_per_second": 3.625,
610
+ "step": 281
611
+ },
612
+ {
613
+ "epoch": 45.92,
614
+ "eval_accuracy": 0.725531914893617,
615
+ "eval_loss": 0.6439131498336792,
616
+ "eval_runtime": 4.1256,
617
+ "eval_samples_per_second": 113.922,
618
+ "eval_steps_per_second": 3.636,
619
+ "step": 287
620
+ },
621
+ {
622
+ "epoch": 46.4,
623
+ "grad_norm": 13.123701095581055,
624
+ "learning_rate": 2.1990740740740743e-05,
625
+ "loss": 0.1397,
626
+ "step": 290
627
+ },
628
+ {
629
+ "epoch": 46.88,
630
+ "eval_accuracy": 0.6723404255319149,
631
+ "eval_loss": 0.7720506191253662,
632
+ "eval_runtime": 4.0907,
633
+ "eval_samples_per_second": 114.894,
634
+ "eval_steps_per_second": 3.667,
635
+ "step": 293
636
+ },
637
+ {
638
+ "epoch": 48.0,
639
+ "grad_norm": 15.003984451293945,
640
+ "learning_rate": 2.0833333333333336e-05,
641
+ "loss": 0.1538,
642
+ "step": 300
643
+ },
644
+ {
645
+ "epoch": 48.0,
646
+ "eval_accuracy": 0.7702127659574468,
647
+ "eval_loss": 0.5767794251441956,
648
+ "eval_runtime": 4.0083,
649
+ "eval_samples_per_second": 117.257,
650
+ "eval_steps_per_second": 3.742,
651
+ "step": 300
652
+ },
653
+ {
654
+ "epoch": 48.96,
655
+ "eval_accuracy": 0.7595744680851064,
656
+ "eval_loss": 0.5801470875740051,
657
+ "eval_runtime": 3.9427,
658
+ "eval_samples_per_second": 119.209,
659
+ "eval_steps_per_second": 3.805,
660
+ "step": 306
661
+ },
662
+ {
663
+ "epoch": 49.6,
664
+ "grad_norm": 2.649744987487793,
665
+ "learning_rate": 1.967592592592593e-05,
666
+ "loss": 0.1466,
667
+ "step": 310
668
+ },
669
+ {
670
+ "epoch": 49.92,
671
+ "eval_accuracy": 0.7574468085106383,
672
+ "eval_loss": 0.5672721266746521,
673
+ "eval_runtime": 4.0569,
674
+ "eval_samples_per_second": 115.852,
675
+ "eval_steps_per_second": 3.697,
676
+ "step": 312
677
+ },
678
+ {
679
+ "epoch": 50.88,
680
+ "eval_accuracy": 0.7085106382978723,
681
+ "eval_loss": 0.6468719244003296,
682
+ "eval_runtime": 4.0234,
683
+ "eval_samples_per_second": 116.818,
684
+ "eval_steps_per_second": 3.728,
685
+ "step": 318
686
+ },
687
+ {
688
+ "epoch": 51.2,
689
+ "grad_norm": 12.5094633102417,
690
+ "learning_rate": 1.8518518518518518e-05,
691
+ "loss": 0.1302,
692
+ "step": 320
693
+ },
694
+ {
695
+ "epoch": 52.0,
696
+ "eval_accuracy": 0.6957446808510638,
697
+ "eval_loss": 0.7276235222816467,
698
+ "eval_runtime": 4.0008,
699
+ "eval_samples_per_second": 117.475,
700
+ "eval_steps_per_second": 3.749,
701
+ "step": 325
702
+ },
703
+ {
704
+ "epoch": 52.8,
705
+ "grad_norm": 6.599560737609863,
706
+ "learning_rate": 1.736111111111111e-05,
707
+ "loss": 0.1565,
708
+ "step": 330
709
+ },
710
+ {
711
+ "epoch": 52.96,
712
+ "eval_accuracy": 0.6723404255319149,
713
+ "eval_loss": 0.8247136473655701,
714
+ "eval_runtime": 4.0526,
715
+ "eval_samples_per_second": 115.976,
716
+ "eval_steps_per_second": 3.701,
717
+ "step": 331
718
+ },
719
+ {
720
+ "epoch": 53.92,
721
+ "eval_accuracy": 0.7978723404255319,
722
+ "eval_loss": 0.4810582399368286,
723
+ "eval_runtime": 4.0005,
724
+ "eval_samples_per_second": 117.486,
725
+ "eval_steps_per_second": 3.75,
726
+ "step": 337
727
+ },
728
+ {
729
+ "epoch": 54.4,
730
+ "grad_norm": 12.574357986450195,
731
+ "learning_rate": 1.6203703703703704e-05,
732
+ "loss": 0.1267,
733
+ "step": 340
734
+ },
735
+ {
736
+ "epoch": 54.88,
737
+ "eval_accuracy": 0.7021276595744681,
738
+ "eval_loss": 0.6372675895690918,
739
+ "eval_runtime": 4.047,
740
+ "eval_samples_per_second": 116.135,
741
+ "eval_steps_per_second": 3.706,
742
+ "step": 343
743
+ },
744
+ {
745
+ "epoch": 56.0,
746
+ "grad_norm": 29.667768478393555,
747
+ "learning_rate": 1.5046296296296297e-05,
748
+ "loss": 0.1424,
749
+ "step": 350
750
+ },
751
+ {
752
+ "epoch": 56.0,
753
+ "eval_accuracy": 0.6723404255319149,
754
+ "eval_loss": 0.7251705527305603,
755
+ "eval_runtime": 4.0519,
756
+ "eval_samples_per_second": 115.995,
757
+ "eval_steps_per_second": 3.702,
758
+ "step": 350
759
+ },
760
+ {
761
+ "epoch": 56.96,
762
+ "eval_accuracy": 0.7489361702127659,
763
+ "eval_loss": 0.5696622729301453,
764
+ "eval_runtime": 4.0808,
765
+ "eval_samples_per_second": 115.174,
766
+ "eval_steps_per_second": 3.676,
767
+ "step": 356
768
+ },
769
+ {
770
+ "epoch": 57.6,
771
+ "grad_norm": 3.276287317276001,
772
+ "learning_rate": 1.388888888888889e-05,
773
+ "loss": 0.1053,
774
+ "step": 360
775
+ },
776
+ {
777
+ "epoch": 57.92,
778
+ "eval_accuracy": 0.6957446808510638,
779
+ "eval_loss": 0.7066917419433594,
780
+ "eval_runtime": 4.0845,
781
+ "eval_samples_per_second": 115.07,
782
+ "eval_steps_per_second": 3.672,
783
+ "step": 362
784
+ },
785
+ {
786
+ "epoch": 58.88,
787
+ "eval_accuracy": 0.7063829787234043,
788
+ "eval_loss": 0.6576955318450928,
789
+ "eval_runtime": 4.1852,
790
+ "eval_samples_per_second": 112.301,
791
+ "eval_steps_per_second": 3.584,
792
+ "step": 368
793
+ },
794
+ {
795
+ "epoch": 59.2,
796
+ "grad_norm": 7.9189982414245605,
797
+ "learning_rate": 1.2731481481481482e-05,
798
+ "loss": 0.1301,
799
+ "step": 370
800
+ },
801
+ {
802
+ "epoch": 60.0,
803
+ "eval_accuracy": 0.774468085106383,
804
+ "eval_loss": 0.5325801372528076,
805
+ "eval_runtime": 4.0787,
806
+ "eval_samples_per_second": 115.233,
807
+ "eval_steps_per_second": 3.678,
808
+ "step": 375
809
+ },
810
+ {
811
+ "epoch": 60.8,
812
+ "grad_norm": 14.686637878417969,
813
+ "learning_rate": 1.1574074074074075e-05,
814
+ "loss": 0.0906,
815
+ "step": 380
816
+ },
817
+ {
818
+ "epoch": 60.96,
819
+ "eval_accuracy": 0.7851063829787234,
820
+ "eval_loss": 0.546753466129303,
821
+ "eval_runtime": 4.0812,
822
+ "eval_samples_per_second": 115.163,
823
+ "eval_steps_per_second": 3.675,
824
+ "step": 381
825
+ },
826
+ {
827
+ "epoch": 61.92,
828
+ "eval_accuracy": 0.8276595744680851,
829
+ "eval_loss": 0.4413163959980011,
830
+ "eval_runtime": 4.1408,
831
+ "eval_samples_per_second": 113.504,
832
+ "eval_steps_per_second": 3.622,
833
+ "step": 387
834
+ },
835
+ {
836
+ "epoch": 62.4,
837
+ "grad_norm": 9.14445686340332,
838
+ "learning_rate": 1.0416666666666668e-05,
839
+ "loss": 0.0974,
840
+ "step": 390
841
+ },
842
+ {
843
+ "epoch": 62.88,
844
+ "eval_accuracy": 0.7659574468085106,
845
+ "eval_loss": 0.5478885173797607,
846
+ "eval_runtime": 4.1286,
847
+ "eval_samples_per_second": 113.839,
848
+ "eval_steps_per_second": 3.633,
849
+ "step": 393
850
+ },
851
+ {
852
+ "epoch": 64.0,
853
+ "grad_norm": 4.1058526039123535,
854
+ "learning_rate": 9.259259259259259e-06,
855
+ "loss": 0.1133,
856
+ "step": 400
857
+ },
858
+ {
859
+ "epoch": 64.0,
860
+ "eval_accuracy": 0.7042553191489361,
861
+ "eval_loss": 0.7109193801879883,
862
+ "eval_runtime": 4.2149,
863
+ "eval_samples_per_second": 111.508,
864
+ "eval_steps_per_second": 3.559,
865
+ "step": 400
866
+ },
867
+ {
868
+ "epoch": 64.96,
869
+ "eval_accuracy": 0.7617021276595745,
870
+ "eval_loss": 0.5734679102897644,
871
+ "eval_runtime": 4.1133,
872
+ "eval_samples_per_second": 114.265,
873
+ "eval_steps_per_second": 3.647,
874
+ "step": 406
875
+ },
876
+ {
877
+ "epoch": 65.6,
878
+ "grad_norm": 5.876250267028809,
879
+ "learning_rate": 8.101851851851852e-06,
880
+ "loss": 0.1189,
881
+ "step": 410
882
+ },
883
+ {
884
+ "epoch": 65.92,
885
+ "eval_accuracy": 0.8297872340425532,
886
+ "eval_loss": 0.4084050953388214,
887
+ "eval_runtime": 4.1883,
888
+ "eval_samples_per_second": 112.218,
889
+ "eval_steps_per_second": 3.581,
890
+ "step": 412
891
+ },
892
+ {
893
+ "epoch": 66.88,
894
+ "eval_accuracy": 0.7489361702127659,
895
+ "eval_loss": 0.5716192722320557,
896
+ "eval_runtime": 4.1124,
897
+ "eval_samples_per_second": 114.289,
898
+ "eval_steps_per_second": 3.648,
899
+ "step": 418
900
+ },
901
+ {
902
+ "epoch": 67.2,
903
+ "grad_norm": 2.931035280227661,
904
+ "learning_rate": 6.944444444444445e-06,
905
+ "loss": 0.1064,
906
+ "step": 420
907
+ },
908
+ {
909
+ "epoch": 68.0,
910
+ "eval_accuracy": 0.7553191489361702,
911
+ "eval_loss": 0.5537174940109253,
912
+ "eval_runtime": 4.0965,
913
+ "eval_samples_per_second": 114.731,
914
+ "eval_steps_per_second": 3.662,
915
+ "step": 425
916
+ },
917
+ {
918
+ "epoch": 68.8,
919
+ "grad_norm": 4.387136936187744,
920
+ "learning_rate": 5.787037037037038e-06,
921
+ "loss": 0.1084,
922
+ "step": 430
923
+ },
924
+ {
925
+ "epoch": 68.96,
926
+ "eval_accuracy": 0.8021276595744681,
927
+ "eval_loss": 0.456912100315094,
928
+ "eval_runtime": 4.1477,
929
+ "eval_samples_per_second": 113.315,
930
+ "eval_steps_per_second": 3.616,
931
+ "step": 431
932
+ },
933
+ {
934
+ "epoch": 69.92,
935
+ "eval_accuracy": 0.7617021276595745,
936
+ "eval_loss": 0.5227068066596985,
937
+ "eval_runtime": 4.1656,
938
+ "eval_samples_per_second": 112.828,
939
+ "eval_steps_per_second": 3.601,
940
+ "step": 437
941
+ },
942
+ {
943
+ "epoch": 70.4,
944
+ "grad_norm": 6.693394184112549,
945
+ "learning_rate": 4.6296296296296296e-06,
946
+ "loss": 0.1054,
947
+ "step": 440
948
+ },
949
+ {
950
+ "epoch": 70.88,
951
+ "eval_accuracy": 0.7276595744680852,
952
+ "eval_loss": 0.5995042324066162,
953
+ "eval_runtime": 4.1654,
954
+ "eval_samples_per_second": 112.834,
955
+ "eval_steps_per_second": 3.601,
956
+ "step": 443
957
+ },
958
+ {
959
+ "epoch": 72.0,
960
+ "grad_norm": 8.600502014160156,
961
+ "learning_rate": 3.4722222222222224e-06,
962
+ "loss": 0.1005,
963
+ "step": 450
964
+ },
965
+ {
966
+ "epoch": 72.0,
967
+ "eval_accuracy": 0.7638297872340426,
968
+ "eval_loss": 0.5560170412063599,
969
+ "eval_runtime": 4.1827,
970
+ "eval_samples_per_second": 112.367,
971
+ "eval_steps_per_second": 3.586,
972
+ "step": 450
973
+ },
974
+ {
975
+ "epoch": 72.96,
976
+ "eval_accuracy": 0.8063829787234043,
977
+ "eval_loss": 0.45502665638923645,
978
+ "eval_runtime": 4.2071,
979
+ "eval_samples_per_second": 111.715,
980
+ "eval_steps_per_second": 3.565,
981
+ "step": 456
982
+ },
983
+ {
984
+ "epoch": 73.6,
985
+ "grad_norm": 10.198132514953613,
986
+ "learning_rate": 2.3148148148148148e-06,
987
+ "loss": 0.1028,
988
+ "step": 460
989
+ },
990
+ {
991
+ "epoch": 73.92,
992
+ "eval_accuracy": 0.823404255319149,
993
+ "eval_loss": 0.4404470920562744,
994
+ "eval_runtime": 4.1806,
995
+ "eval_samples_per_second": 112.425,
996
+ "eval_steps_per_second": 3.588,
997
+ "step": 462
998
+ },
999
+ {
1000
+ "epoch": 74.88,
1001
+ "eval_accuracy": 0.7957446808510639,
1002
+ "eval_loss": 0.4761447310447693,
1003
+ "eval_runtime": 4.1871,
1004
+ "eval_samples_per_second": 112.251,
1005
+ "eval_steps_per_second": 3.582,
1006
+ "step": 468
1007
+ },
1008
+ {
1009
+ "epoch": 75.2,
1010
+ "grad_norm": 7.506448268890381,
1011
+ "learning_rate": 1.1574074074074074e-06,
1012
+ "loss": 0.0917,
1013
+ "step": 470
1014
+ },
1015
+ {
1016
+ "epoch": 76.0,
1017
+ "eval_accuracy": 0.7680851063829788,
1018
+ "eval_loss": 0.5278272032737732,
1019
+ "eval_runtime": 4.1615,
1020
+ "eval_samples_per_second": 112.939,
1021
+ "eval_steps_per_second": 3.604,
1022
+ "step": 475
1023
+ },
1024
+ {
1025
+ "epoch": 76.8,
1026
+ "grad_norm": 5.474030494689941,
1027
+ "learning_rate": 0.0,
1028
+ "loss": 0.1009,
1029
+ "step": 480
1030
+ },
1031
+ {
1032
+ "epoch": 76.8,
1033
+ "eval_accuracy": 0.7617021276595745,
1034
+ "eval_loss": 0.5345979332923889,
1035
+ "eval_runtime": 4.1813,
1036
+ "eval_samples_per_second": 112.405,
1037
+ "eval_steps_per_second": 3.587,
1038
+ "step": 480
1039
+ },
1040
+ {
1041
+ "epoch": 76.8,
1042
+ "step": 480,
1043
+ "total_flos": 4.5903154968099717e+18,
1044
+ "train_loss": 0.19423907659947873,
1045
+ "train_runtime": 1687.359,
1046
+ "train_samples_per_second": 36.554,
1047
+ "train_steps_per_second": 0.284
1048
+ }
1049
+ ],
1050
+ "logging_steps": 10,
1051
+ "max_steps": 480,
1052
+ "num_input_tokens_seen": 0,
1053
+ "num_train_epochs": 80,
1054
+ "save_steps": 500,
1055
+ "stateful_callbacks": {
1056
+ "TrainerControl": {
1057
+ "args": {
1058
+ "should_epoch_stop": false,
1059
+ "should_evaluate": false,
1060
+ "should_log": false,
1061
+ "should_save": true,
1062
+ "should_training_stop": true
1063
+ },
1064
+ "attributes": {}
1065
+ }
1066
+ },
1067
+ "total_flos": 4.5903154968099717e+18,
1068
+ "train_batch_size": 32,
1069
+ "trial_name": null,
1070
+ "trial_params": null
1071
+ }