ihsanahakiim commited on
Commit
b32f884
·
verified ·
1 Parent(s): 6c7cbcf

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 30.03125,
3
- "eval_accuracy": 0.4882943143812709,
4
- "eval_loss": 2.180596113204956,
5
- "eval_runtime": 179.6038,
6
- "eval_samples_per_second": 1.665,
7
- "eval_steps_per_second": 0.056
8
  }
 
1
  {
2
+ "epoch": 30.026442307692307,
3
+ "eval_accuracy": 0.35570469798657717,
4
+ "eval_loss": 2.520230293273926,
5
+ "eval_runtime": 112.9248,
6
+ "eval_samples_per_second": 3.958,
7
+ "eval_steps_per_second": 0.124
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:502b11c24d362ff19a589c7424060c5f3e5cab5aebadcaad304ec293b34394b3
3
  size 345137324
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0f584e71e8d912882ac655a4e0279ff005951102243f1e266ceb5849410ec72
3
  size 345137324
runs/Jan14_10-39-57_GAN-SVR/events.out.tfevents.1736833764.GAN-SVR.3870842.6 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51873b587f4b80d00c03de799f218dac181c849b162ba30c592ded55bc2c6197
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d575c44ad6d0d6f2158c03dc150d181b1295137095262b375bd39814326263c
3
+ size 734
runs/Jan14_16-01-40_GAN-SVR/events.out.tfevents.1736838126.GAN-SVR.3870842.7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62de6581040ab6ac102c5221b3c202e3e36a1ccccbc73cc5f5cd2a0033f3ae1
3
+ size 7783
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 30.03125,
3
- "eval_accuracy": 0.4882943143812709,
4
- "eval_loss": 2.180596113204956,
5
- "eval_runtime": 179.6038,
6
- "eval_samples_per_second": 1.665,
7
- "eval_steps_per_second": 0.056
8
  }
 
1
  {
2
+ "epoch": 30.026442307692307,
3
+ "eval_accuracy": 0.35570469798657717,
4
+ "eval_loss": 2.520230293273926,
5
+ "eval_runtime": 112.9248,
6
+ "eval_samples_per_second": 3.958,
7
+ "eval_steps_per_second": 0.124
8
  }
trainer_state.json CHANGED
@@ -1,994 +1,903 @@
1
  {
2
- "best_metric": 0.4882943143812709,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-806",
4
- "epoch": 30.03125,
5
  "eval_steps": 500,
6
- "global_step": 960,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.010416666666666666,
13
- "grad_norm": 4.111547470092773,
14
- "learning_rate": 5.208333333333334e-06,
15
- "loss": 4.2127,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.020833333333333332,
20
- "grad_norm": 4.218038082122803,
21
- "learning_rate": 1.0416666666666668e-05,
22
- "loss": 4.2247,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.03125,
27
- "grad_norm": 3.766047954559326,
28
- "learning_rate": 1.5625e-05,
29
- "loss": 4.2427,
30
- "step": 30
 
 
31
  },
32
  {
33
- "epoch": 0.03229166666666667,
34
- "eval_accuracy": 0.0033444816053511705,
35
- "eval_loss": 4.22645378112793,
36
- "eval_runtime": 157.4916,
37
- "eval_samples_per_second": 1.899,
38
- "eval_steps_per_second": 0.063,
39
- "step": 31
40
  },
41
  {
42
- "epoch": 1.009375,
43
- "grad_norm": 4.1204304695129395,
44
- "learning_rate": 2.0833333333333336e-05,
45
- "loss": 4.2374,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.0197916666666667,
50
- "grad_norm": 4.202085971832275,
51
- "learning_rate": 2.604166666666667e-05,
52
- "loss": 4.228,
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.0302083333333334,
57
- "grad_norm": 3.5194547176361084,
58
- "learning_rate": 3.125e-05,
59
- "loss": 4.2321,
60
- "step": 60
 
 
61
  },
62
  {
63
- "epoch": 1.0322916666666666,
64
- "eval_accuracy": 0.010033444816053512,
65
- "eval_loss": 4.223534107208252,
66
- "eval_runtime": 156.7976,
67
- "eval_samples_per_second": 1.907,
68
- "eval_steps_per_second": 0.064,
69
- "step": 62
70
  },
71
  {
72
- "epoch": 2.0083333333333333,
73
- "grad_norm": 3.131030559539795,
74
- "learning_rate": 3.6458333333333336e-05,
75
- "loss": 4.2293,
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.01875,
80
- "grad_norm": 3.120128631591797,
81
- "learning_rate": 4.166666666666667e-05,
82
- "loss": 4.2347,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.029166666666667,
87
- "grad_norm": 3.0988192558288574,
88
- "learning_rate": 4.6875e-05,
89
- "loss": 4.24,
90
- "step": 90
 
 
91
  },
92
  {
93
- "epoch": 2.0322916666666666,
94
- "eval_accuracy": 0.010033444816053512,
95
- "eval_loss": 4.228224754333496,
96
- "eval_runtime": 156.5504,
97
- "eval_samples_per_second": 1.91,
98
- "eval_steps_per_second": 0.064,
99
- "step": 93
100
  },
101
  {
102
- "epoch": 3.0072916666666667,
103
- "grad_norm": 3.102827310562134,
104
- "learning_rate": 4.976851851851852e-05,
105
- "loss": 4.2195,
106
  "step": 100
107
  },
108
  {
109
- "epoch": 3.017708333333333,
110
- "grad_norm": 2.8366568088531494,
111
- "learning_rate": 4.9189814814814815e-05,
112
- "loss": 4.2354,
 
 
 
 
 
 
 
 
 
113
  "step": 110
114
  },
115
  {
116
- "epoch": 3.028125,
117
- "grad_norm": 3.0085482597351074,
118
- "learning_rate": 4.8611111111111115e-05,
119
- "loss": 4.2445,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 3.0322916666666666,
124
- "eval_accuracy": 0.006688963210702341,
125
- "eval_loss": 4.224982261657715,
126
- "eval_runtime": 151.1437,
127
- "eval_samples_per_second": 1.978,
128
- "eval_steps_per_second": 0.066,
129
- "step": 124
130
  },
131
  {
132
- "epoch": 4.00625,
133
- "grad_norm": 3.300618886947632,
134
- "learning_rate": 4.803240740740741e-05,
135
- "loss": 4.2226,
136
- "step": 130
 
 
137
  },
138
  {
139
- "epoch": 4.016666666666667,
140
- "grad_norm": 3.202220916748047,
141
- "learning_rate": 4.745370370370371e-05,
142
- "loss": 4.2265,
143
  "step": 140
144
  },
145
  {
146
- "epoch": 4.027083333333334,
147
- "grad_norm": 2.977271556854248,
148
- "learning_rate": 4.6875e-05,
149
- "loss": 4.2327,
150
  "step": 150
151
  },
152
  {
153
- "epoch": 4.032291666666667,
154
- "eval_accuracy": 0.010033444816053512,
155
- "eval_loss": 4.2244367599487305,
156
- "eval_runtime": 156.5602,
157
- "eval_samples_per_second": 1.91,
158
- "eval_steps_per_second": 0.064,
159
- "step": 155
160
  },
161
  {
162
- "epoch": 5.005208333333333,
163
- "grad_norm": 3.7291171550750732,
164
- "learning_rate": 4.62962962962963e-05,
165
- "loss": 4.2106,
166
- "step": 160
 
 
167
  },
168
  {
169
- "epoch": 5.015625,
170
- "grad_norm": 3.581210136413574,
171
- "learning_rate": 4.5717592592592594e-05,
172
  "loss": 4.2112,
173
  "step": 170
174
  },
175
  {
176
- "epoch": 5.026041666666667,
177
- "grad_norm": 3.7153217792510986,
178
- "learning_rate": 4.5138888888888894e-05,
179
- "loss": 4.2104,
180
  "step": 180
181
  },
182
  {
183
- "epoch": 5.032291666666667,
184
- "eval_accuracy": 0.020066889632107024,
185
- "eval_loss": 4.2100114822387695,
186
- "eval_runtime": 150.442,
187
- "eval_samples_per_second": 1.987,
188
- "eval_steps_per_second": 0.066,
189
- "step": 186
190
  },
191
  {
192
- "epoch": 6.004166666666666,
193
- "grad_norm": 3.1238365173339844,
194
- "learning_rate": 4.456018518518519e-05,
195
- "loss": 4.2205,
196
  "step": 190
197
  },
198
  {
199
- "epoch": 6.014583333333333,
200
- "grad_norm": 3.4233925342559814,
201
- "learning_rate": 4.3981481481481486e-05,
202
- "loss": 4.2012,
203
  "step": 200
204
  },
205
  {
206
- "epoch": 6.025,
207
- "grad_norm": 3.511300563812256,
208
- "learning_rate": 4.340277777777778e-05,
209
- "loss": 4.2374,
210
  "step": 210
211
  },
212
  {
213
- "epoch": 6.032291666666667,
214
- "eval_accuracy": 0.006688963210702341,
215
- "eval_loss": 4.2022294998168945,
216
- "eval_runtime": 153.3689,
217
- "eval_samples_per_second": 1.95,
218
- "eval_steps_per_second": 0.065,
219
- "step": 217
220
  },
221
  {
222
- "epoch": 7.003125,
223
- "grad_norm": 3.343992233276367,
224
- "learning_rate": 4.282407407407408e-05,
225
- "loss": 4.2055,
226
  "step": 220
227
  },
228
  {
229
- "epoch": 7.013541666666667,
230
- "grad_norm": 2.911720037460327,
231
- "learning_rate": 4.224537037037037e-05,
232
- "loss": 4.1687,
233
  "step": 230
234
  },
235
  {
236
- "epoch": 7.023958333333334,
237
- "grad_norm": 3.1043291091918945,
238
- "learning_rate": 4.166666666666667e-05,
239
- "loss": 4.1597,
240
  "step": 240
241
  },
242
  {
243
- "epoch": 7.032291666666667,
244
- "eval_accuracy": 0.030100334448160536,
245
- "eval_loss": 4.118756294250488,
246
- "eval_runtime": 152.3194,
247
- "eval_samples_per_second": 1.963,
248
- "eval_steps_per_second": 0.066,
249
- "step": 248
250
  },
251
  {
252
- "epoch": 8.002083333333333,
253
- "grad_norm": 3.4208502769470215,
254
- "learning_rate": 4.1087962962962965e-05,
255
- "loss": 4.1418,
256
  "step": 250
257
  },
258
  {
259
- "epoch": 8.0125,
260
- "grad_norm": 4.338245868682861,
261
- "learning_rate": 4.0509259259259265e-05,
262
- "loss": 4.0753,
263
  "step": 260
264
  },
265
  {
266
- "epoch": 8.022916666666667,
267
- "grad_norm": 4.86409854888916,
268
- "learning_rate": 3.993055555555556e-05,
269
- "loss": 4.0522,
270
  "step": 270
271
  },
272
  {
273
- "epoch": 8.032291666666667,
274
- "eval_accuracy": 0.07023411371237458,
275
- "eval_loss": 3.935123920440674,
276
- "eval_runtime": 155.145,
277
- "eval_samples_per_second": 1.927,
278
- "eval_steps_per_second": 0.064,
279
- "step": 279
280
  },
281
  {
282
- "epoch": 9.001041666666667,
283
- "grad_norm": 4.609180927276611,
284
- "learning_rate": 3.935185185185186e-05,
285
- "loss": 3.9967,
286
  "step": 280
287
  },
288
  {
289
- "epoch": 9.011458333333334,
290
- "grad_norm": 5.407143592834473,
291
- "learning_rate": 3.877314814814815e-05,
292
- "loss": 3.8626,
293
  "step": 290
294
  },
295
  {
296
- "epoch": 9.021875,
297
- "grad_norm": 4.2119598388671875,
298
- "learning_rate": 3.8194444444444444e-05,
299
- "loss": 3.821,
300
- "step": 300
 
 
301
  },
302
  {
303
- "epoch": 9.032291666666667,
304
- "grad_norm": 14.079527854919434,
305
- "learning_rate": 3.7615740740740744e-05,
306
- "loss": 3.768,
307
- "step": 310
308
  },
309
  {
310
- "epoch": 9.032291666666667,
311
- "eval_accuracy": 0.10702341137123746,
312
- "eval_loss": 3.680009126663208,
313
- "eval_runtime": 152.6981,
314
- "eval_samples_per_second": 1.958,
315
- "eval_steps_per_second": 0.065,
316
  "step": 310
317
  },
318
  {
319
- "epoch": 10.010416666666666,
320
- "grad_norm": 4.879744529724121,
321
- "learning_rate": 3.7037037037037037e-05,
322
- "loss": 3.54,
323
  "step": 320
324
  },
325
  {
326
- "epoch": 10.020833333333334,
327
- "grad_norm": 6.769280910491943,
328
- "learning_rate": 3.6458333333333336e-05,
329
- "loss": 3.6435,
 
 
 
 
 
 
 
 
 
330
  "step": 330
331
  },
332
  {
333
- "epoch": 10.03125,
334
- "grad_norm": 6.974754810333252,
335
- "learning_rate": 3.587962962962963e-05,
336
- "loss": 3.5147,
337
  "step": 340
338
  },
339
  {
340
- "epoch": 10.032291666666667,
341
- "eval_accuracy": 0.11036789297658862,
342
- "eval_loss": 3.541635036468506,
343
- "eval_runtime": 156.1418,
344
- "eval_samples_per_second": 1.915,
345
- "eval_steps_per_second": 0.064,
346
- "step": 341
347
  },
348
  {
349
- "epoch": 11.009375,
350
- "grad_norm": 6.004092216491699,
351
- "learning_rate": 3.530092592592593e-05,
352
- "loss": 3.2736,
353
- "step": 350
 
 
354
  },
355
  {
356
- "epoch": 11.019791666666666,
357
- "grad_norm": 4.7057905197143555,
358
- "learning_rate": 3.472222222222222e-05,
359
- "loss": 3.2459,
360
  "step": 360
361
  },
362
  {
363
- "epoch": 11.030208333333333,
364
- "grad_norm": 7.564170837402344,
365
- "learning_rate": 3.414351851851852e-05,
366
- "loss": 3.2878,
367
  "step": 370
368
  },
369
  {
370
- "epoch": 11.032291666666667,
371
- "eval_accuracy": 0.07023411371237458,
372
- "eval_loss": 3.707416534423828,
373
- "eval_runtime": 151.3197,
374
- "eval_samples_per_second": 1.976,
375
- "eval_steps_per_second": 0.066,
376
- "step": 372
377
  },
378
  {
379
- "epoch": 12.008333333333333,
380
- "grad_norm": 6.44061279296875,
381
- "learning_rate": 3.3564814814814815e-05,
382
- "loss": 3.048,
383
  "step": 380
384
  },
385
  {
386
- "epoch": 12.01875,
387
- "grad_norm": 6.303668022155762,
388
- "learning_rate": 3.2986111111111115e-05,
389
- "loss": 3.0619,
390
  "step": 390
391
  },
392
  {
393
- "epoch": 12.029166666666667,
394
- "grad_norm": 7.074206352233887,
395
- "learning_rate": 3.240740740740741e-05,
396
- "loss": 2.9491,
397
  "step": 400
398
  },
399
  {
400
- "epoch": 12.032291666666667,
401
- "eval_accuracy": 0.10702341137123746,
402
- "eval_loss": 3.3954155445098877,
403
- "eval_runtime": 154.7501,
404
- "eval_samples_per_second": 1.932,
405
- "eval_steps_per_second": 0.065,
406
- "step": 403
407
  },
408
  {
409
- "epoch": 13.007291666666667,
410
- "grad_norm": 6.961267471313477,
411
- "learning_rate": 3.182870370370371e-05,
412
- "loss": 2.8801,
413
  "step": 410
414
  },
415
  {
416
- "epoch": 13.017708333333333,
417
- "grad_norm": 5.816298007965088,
418
- "learning_rate": 3.125e-05,
419
- "loss": 2.9025,
420
  "step": 420
421
  },
422
  {
423
- "epoch": 13.028125,
424
- "grad_norm": 10.059369087219238,
425
- "learning_rate": 3.06712962962963e-05,
426
- "loss": 2.806,
427
  "step": 430
428
  },
429
  {
430
- "epoch": 13.032291666666667,
431
- "eval_accuracy": 0.1705685618729097,
432
- "eval_loss": 3.2551913261413574,
433
- "eval_runtime": 154.5283,
434
- "eval_samples_per_second": 1.935,
435
- "eval_steps_per_second": 0.065,
436
- "step": 434
437
  },
438
  {
439
- "epoch": 14.00625,
440
- "grad_norm": 6.142374515533447,
441
- "learning_rate": 3.0092592592592593e-05,
442
- "loss": 2.6302,
443
  "step": 440
444
  },
445
  {
446
- "epoch": 14.016666666666667,
447
- "grad_norm": 5.806746959686279,
448
- "learning_rate": 2.951388888888889e-05,
449
- "loss": 2.5778,
450
  "step": 450
451
  },
452
  {
453
- "epoch": 14.027083333333334,
454
- "grad_norm": 6.683168888092041,
455
- "learning_rate": 2.8935185185185186e-05,
456
- "loss": 2.4568,
457
- "step": 460
 
 
458
  },
459
  {
460
- "epoch": 14.032291666666667,
461
- "eval_accuracy": 0.2040133779264214,
462
- "eval_loss": 3.0654280185699463,
463
- "eval_runtime": 155.0,
464
- "eval_samples_per_second": 1.929,
465
- "eval_steps_per_second": 0.065,
466
- "step": 465
467
  },
468
  {
469
- "epoch": 15.005208333333334,
470
- "grad_norm": 8.81658935546875,
471
- "learning_rate": 2.8356481481481483e-05,
472
- "loss": 2.4185,
473
  "step": 470
474
  },
475
  {
476
- "epoch": 15.015625,
477
- "grad_norm": 17.862884521484375,
478
- "learning_rate": 2.777777777777778e-05,
479
- "loss": 2.3865,
480
  "step": 480
481
  },
482
  {
483
- "epoch": 15.026041666666666,
484
- "grad_norm": 7.982626438140869,
485
- "learning_rate": 2.7199074074074076e-05,
486
- "loss": 2.3102,
487
- "step": 490
 
 
488
  },
489
  {
490
- "epoch": 15.032291666666667,
491
- "eval_accuracy": 0.3010033444816054,
492
- "eval_loss": 2.7440292835235596,
493
- "eval_runtime": 154.0941,
494
- "eval_samples_per_second": 1.94,
495
- "eval_steps_per_second": 0.065,
496
- "step": 496
497
  },
498
  {
499
- "epoch": 16.004166666666666,
500
- "grad_norm": 7.09642219543457,
501
- "learning_rate": 2.6620370370370372e-05,
502
- "loss": 2.1208,
503
  "step": 500
504
  },
505
  {
506
- "epoch": 16.014583333333334,
507
- "grad_norm": 7.21767520904541,
508
- "learning_rate": 2.604166666666667e-05,
509
- "loss": 2.1481,
510
  "step": 510
511
  },
512
  {
513
- "epoch": 16.025,
514
- "grad_norm": 8.300311088562012,
515
- "learning_rate": 2.5462962962962965e-05,
516
- "loss": 2.2079,
517
- "step": 520
 
 
518
  },
519
  {
520
- "epoch": 16.032291666666666,
521
- "eval_accuracy": 0.31438127090301005,
522
- "eval_loss": 2.6789305210113525,
523
- "eval_runtime": 157.8279,
524
- "eval_samples_per_second": 1.894,
525
- "eval_steps_per_second": 0.063,
526
- "step": 527
527
  },
528
  {
529
- "epoch": 17.003125,
530
- "grad_norm": 8.606009483337402,
531
- "learning_rate": 2.488425925925926e-05,
532
- "loss": 2.0646,
533
  "step": 530
534
  },
535
  {
536
- "epoch": 17.013541666666665,
537
- "grad_norm": 7.775322437286377,
538
- "learning_rate": 2.4305555555555558e-05,
539
- "loss": 1.9341,
540
  "step": 540
541
  },
542
  {
543
- "epoch": 17.023958333333333,
544
- "grad_norm": 7.849579811096191,
545
- "learning_rate": 2.3726851851851854e-05,
546
- "loss": 1.9638,
547
- "step": 550
 
 
548
  },
549
  {
550
- "epoch": 17.032291666666666,
551
- "eval_accuracy": 0.36789297658862874,
552
- "eval_loss": 2.5920491218566895,
553
- "eval_runtime": 157.1641,
554
- "eval_samples_per_second": 1.902,
555
- "eval_steps_per_second": 0.064,
556
- "step": 558
557
  },
558
  {
559
- "epoch": 18.002083333333335,
560
- "grad_norm": 10.023223876953125,
561
- "learning_rate": 2.314814814814815e-05,
562
- "loss": 2.0018,
563
  "step": 560
564
  },
565
  {
566
- "epoch": 18.0125,
567
- "grad_norm": 7.027205467224121,
568
- "learning_rate": 2.2569444444444447e-05,
569
- "loss": 1.8571,
 
 
 
 
 
 
 
 
 
570
  "step": 570
571
  },
572
  {
573
- "epoch": 18.022916666666667,
574
- "grad_norm": 9.41115665435791,
575
- "learning_rate": 2.1990740740740743e-05,
576
- "loss": 1.7914,
577
  "step": 580
578
  },
579
  {
580
- "epoch": 18.032291666666666,
581
- "eval_accuracy": 0.3377926421404682,
582
- "eval_loss": 2.6151952743530273,
583
- "eval_runtime": 162.3006,
584
- "eval_samples_per_second": 1.842,
585
- "eval_steps_per_second": 0.062,
586
- "step": 589
587
  },
588
  {
589
- "epoch": 19.001041666666666,
590
- "grad_norm": 9.176055908203125,
591
- "learning_rate": 2.141203703703704e-05,
592
- "loss": 1.8165,
593
- "step": 590
 
 
594
  },
595
  {
596
- "epoch": 19.011458333333334,
597
- "grad_norm": 7.396921157836914,
598
- "learning_rate": 2.0833333333333336e-05,
599
- "loss": 1.6863,
600
  "step": 600
601
  },
602
  {
603
- "epoch": 19.021875,
604
- "grad_norm": 9.088372230529785,
605
- "learning_rate": 2.0254629629629632e-05,
606
- "loss": 1.597,
607
  "step": 610
608
  },
609
  {
610
- "epoch": 19.032291666666666,
611
- "grad_norm": 16.782529830932617,
612
- "learning_rate": 1.967592592592593e-05,
613
- "loss": 1.6925,
614
  "step": 620
615
  },
616
  {
617
- "epoch": 19.032291666666666,
618
- "eval_accuracy": 0.34448160535117056,
619
- "eval_loss": 2.5970685482025146,
620
- "eval_runtime": 161.5247,
621
- "eval_samples_per_second": 1.851,
622
- "eval_steps_per_second": 0.062,
623
- "step": 620
624
  },
625
  {
626
- "epoch": 20.010416666666668,
627
- "grad_norm": 10.128305435180664,
628
- "learning_rate": 1.9097222222222222e-05,
629
- "loss": 1.6303,
630
  "step": 630
631
  },
632
  {
633
- "epoch": 20.020833333333332,
634
- "grad_norm": 8.396921157836914,
635
- "learning_rate": 1.8518518518518518e-05,
636
- "loss": 1.6306,
637
  "step": 640
638
  },
639
  {
640
- "epoch": 20.03125,
641
- "grad_norm": 8.574676513671875,
642
- "learning_rate": 1.7939814814814815e-05,
643
- "loss": 1.5124,
644
- "step": 650
 
 
645
  },
646
  {
647
- "epoch": 20.032291666666666,
648
- "eval_accuracy": 0.34782608695652173,
649
- "eval_loss": 2.5766701698303223,
650
- "eval_runtime": 162.6227,
651
- "eval_samples_per_second": 1.839,
652
- "eval_steps_per_second": 0.061,
653
- "step": 651
654
  },
655
  {
656
- "epoch": 21.009375,
657
- "grad_norm": 8.482582092285156,
658
- "learning_rate": 1.736111111111111e-05,
659
- "loss": 1.5163,
660
  "step": 660
661
  },
662
  {
663
- "epoch": 21.019791666666666,
664
- "grad_norm": 7.806921482086182,
665
- "learning_rate": 1.6782407407407408e-05,
666
- "loss": 1.4579,
667
  "step": 670
668
  },
669
  {
670
- "epoch": 21.030208333333334,
671
- "grad_norm": 10.889120101928711,
672
- "learning_rate": 1.6203703703703704e-05,
673
- "loss": 1.4834,
674
- "step": 680
 
 
675
  },
676
  {
677
- "epoch": 21.032291666666666,
678
- "eval_accuracy": 0.3879598662207358,
679
- "eval_loss": 2.4438529014587402,
680
- "eval_runtime": 163.8828,
681
- "eval_samples_per_second": 1.824,
682
- "eval_steps_per_second": 0.061,
683
- "step": 682
684
  },
685
  {
686
- "epoch": 22.008333333333333,
687
- "grad_norm": 8.548680305480957,
688
- "learning_rate": 1.5625e-05,
689
- "loss": 1.3913,
690
  "step": 690
691
  },
692
  {
693
- "epoch": 22.01875,
694
- "grad_norm": 10.439925193786621,
695
- "learning_rate": 1.5046296296296297e-05,
696
- "loss": 1.4161,
697
  "step": 700
698
  },
699
  {
700
- "epoch": 22.029166666666665,
701
- "grad_norm": 8.317498207092285,
702
- "learning_rate": 1.4467592592592593e-05,
703
- "loss": 1.4565,
704
- "step": 710
 
 
705
  },
706
  {
707
- "epoch": 22.032291666666666,
708
- "eval_accuracy": 0.38461538461538464,
709
- "eval_loss": 2.405748128890991,
710
- "eval_runtime": 162.732,
711
- "eval_samples_per_second": 1.837,
712
- "eval_steps_per_second": 0.061,
713
- "step": 713
714
  },
715
  {
716
- "epoch": 23.007291666666667,
717
- "grad_norm": 8.510457038879395,
718
- "learning_rate": 1.388888888888889e-05,
719
- "loss": 1.4043,
720
  "step": 720
721
  },
722
  {
723
- "epoch": 23.017708333333335,
724
- "grad_norm": 9.268413543701172,
725
- "learning_rate": 1.3310185185185186e-05,
726
- "loss": 1.2734,
 
 
 
 
 
 
 
 
 
727
  "step": 730
728
  },
729
  {
730
- "epoch": 23.028125,
731
- "grad_norm": 8.000787734985352,
732
- "learning_rate": 1.2731481481481482e-05,
733
- "loss": 1.279,
734
  "step": 740
735
  },
736
  {
737
- "epoch": 23.032291666666666,
738
- "eval_accuracy": 0.35451505016722407,
739
- "eval_loss": 2.550072193145752,
740
- "eval_runtime": 160.4305,
741
- "eval_samples_per_second": 1.864,
742
- "eval_steps_per_second": 0.062,
743
- "step": 744
744
  },
745
  {
746
- "epoch": 24.00625,
747
- "grad_norm": 8.658973693847656,
748
- "learning_rate": 1.2152777777777779e-05,
749
- "loss": 1.1963,
750
- "step": 750
 
 
751
  },
752
  {
753
- "epoch": 24.016666666666666,
754
- "grad_norm": 7.558023929595947,
755
- "learning_rate": 1.1574074074074075e-05,
756
- "loss": 1.2601,
757
  "step": 760
758
  },
759
  {
760
- "epoch": 24.027083333333334,
761
- "grad_norm": 9.91243839263916,
762
- "learning_rate": 1.0995370370370372e-05,
763
- "loss": 1.1477,
764
  "step": 770
765
  },
766
  {
767
- "epoch": 24.032291666666666,
768
- "eval_accuracy": 0.44816053511705684,
769
- "eval_loss": 2.3246614933013916,
770
- "eval_runtime": 157.9743,
771
- "eval_samples_per_second": 1.893,
772
- "eval_steps_per_second": 0.063,
773
- "step": 775
774
  },
775
  {
776
- "epoch": 25.005208333333332,
777
- "grad_norm": 10.049819946289062,
778
- "learning_rate": 1.0416666666666668e-05,
779
- "loss": 1.1727,
780
- "step": 780
 
 
781
  },
782
  {
783
- "epoch": 25.015625,
784
- "grad_norm": 8.463665008544922,
785
- "learning_rate": 9.837962962962964e-06,
786
- "loss": 1.1207,
787
  "step": 790
788
  },
789
  {
790
- "epoch": 25.026041666666668,
791
- "grad_norm": 7.52623176574707,
792
- "learning_rate": 9.259259259259259e-06,
793
- "loss": 1.2573,
794
  "step": 800
795
  },
796
  {
797
- "epoch": 25.032291666666666,
798
- "eval_accuracy": 0.4882943143812709,
799
- "eval_loss": 2.1776490211486816,
800
- "eval_runtime": 158.1033,
801
- "eval_samples_per_second": 1.891,
802
- "eval_steps_per_second": 0.063,
803
- "step": 806
804
  },
805
  {
806
- "epoch": 26.004166666666666,
807
- "grad_norm": 9.336162567138672,
808
- "learning_rate": 8.680555555555556e-06,
809
- "loss": 1.1791,
 
 
810
  "step": 810
811
  },
812
  {
813
- "epoch": 26.014583333333334,
814
- "grad_norm": 6.728664875030518,
815
- "learning_rate": 8.101851851851852e-06,
816
- "loss": 1.1284,
817
  "step": 820
818
  },
819
  {
820
- "epoch": 26.025,
821
- "grad_norm": 7.307468414306641,
822
- "learning_rate": 7.523148148148148e-06,
823
- "loss": 1.0825,
824
  "step": 830
825
  },
826
  {
827
- "epoch": 26.032291666666666,
828
- "eval_accuracy": 0.4782608695652174,
829
- "eval_loss": 2.14430832862854,
830
- "eval_runtime": 156.854,
831
- "eval_samples_per_second": 1.906,
832
- "eval_steps_per_second": 0.064,
833
- "step": 837
834
- },
835
- {
836
- "epoch": 27.003125,
837
- "grad_norm": 8.843693733215332,
838
- "learning_rate": 6.944444444444445e-06,
839
- "loss": 1.1431,
840
- "step": 840
841
- },
842
- {
843
- "epoch": 27.013541666666665,
844
- "grad_norm": 9.648551940917969,
845
- "learning_rate": 6.365740740740741e-06,
846
- "loss": 1.0873,
847
- "step": 850
848
- },
849
- {
850
- "epoch": 27.023958333333333,
851
- "grad_norm": 10.551318168640137,
852
- "learning_rate": 5.787037037037038e-06,
853
- "loss": 1.2121,
854
- "step": 860
855
- },
856
- {
857
- "epoch": 27.032291666666666,
858
- "eval_accuracy": 0.4782608695652174,
859
- "eval_loss": 2.149031639099121,
860
- "eval_runtime": 156.4789,
861
- "eval_samples_per_second": 1.911,
862
- "eval_steps_per_second": 0.064,
863
- "step": 868
864
- },
865
- {
866
- "epoch": 28.002083333333335,
867
- "grad_norm": 7.037572383880615,
868
- "learning_rate": 5.208333333333334e-06,
869
- "loss": 1.0147,
870
- "step": 870
871
- },
872
- {
873
- "epoch": 28.0125,
874
- "grad_norm": 11.031582832336426,
875
- "learning_rate": 4.6296296296296296e-06,
876
- "loss": 1.0574,
877
- "step": 880
878
- },
879
- {
880
- "epoch": 28.022916666666667,
881
- "grad_norm": 7.408577919006348,
882
- "learning_rate": 4.050925925925926e-06,
883
- "loss": 1.0887,
884
- "step": 890
885
- },
886
- {
887
- "epoch": 28.032291666666666,
888
- "eval_accuracy": 0.47157190635451507,
889
- "eval_loss": 2.151550054550171,
890
- "eval_runtime": 157.7417,
891
- "eval_samples_per_second": 1.896,
892
- "eval_steps_per_second": 0.063,
893
- "step": 899
894
- },
895
- {
896
- "epoch": 29.001041666666666,
897
- "grad_norm": 7.416742324829102,
898
- "learning_rate": 3.4722222222222224e-06,
899
- "loss": 1.1448,
900
- "step": 900
901
- },
902
- {
903
- "epoch": 29.011458333333334,
904
- "grad_norm": 8.579631805419922,
905
- "learning_rate": 2.893518518518519e-06,
906
- "loss": 1.051,
907
- "step": 910
908
- },
909
- {
910
- "epoch": 29.021875,
911
- "grad_norm": 7.20076847076416,
912
- "learning_rate": 2.3148148148148148e-06,
913
- "loss": 0.9936,
914
- "step": 920
915
- },
916
- {
917
- "epoch": 29.032291666666666,
918
- "grad_norm": 20.40438461303711,
919
- "learning_rate": 1.7361111111111112e-06,
920
- "loss": 1.1127,
921
- "step": 930
922
- },
923
- {
924
- "epoch": 29.032291666666666,
925
- "eval_accuracy": 0.4882943143812709,
926
- "eval_loss": 2.1050899028778076,
927
- "eval_runtime": 158.1493,
928
- "eval_samples_per_second": 1.891,
929
- "eval_steps_per_second": 0.063,
930
- "step": 930
931
- },
932
- {
933
- "epoch": 30.010416666666668,
934
- "grad_norm": 7.520814895629883,
935
- "learning_rate": 1.1574074074074074e-06,
936
- "loss": 1.0214,
937
- "step": 940
938
- },
939
- {
940
- "epoch": 30.020833333333332,
941
- "grad_norm": 7.830852508544922,
942
- "learning_rate": 5.787037037037037e-07,
943
- "loss": 0.9516,
944
- "step": 950
945
- },
946
- {
947
- "epoch": 30.03125,
948
- "grad_norm": 8.230799674987793,
949
- "learning_rate": 0.0,
950
- "loss": 0.9905,
951
- "step": 960
952
- },
953
- {
954
- "epoch": 30.03125,
955
- "eval_accuracy": 0.4816053511705686,
956
- "eval_loss": 2.117016553878784,
957
- "eval_runtime": 175.2626,
958
- "eval_samples_per_second": 1.706,
959
- "eval_steps_per_second": 0.057,
960
- "step": 960
961
- },
962
- {
963
- "epoch": 30.03125,
964
- "step": 960,
965
- "total_flos": 3.733004678582315e+19,
966
- "train_loss": 2.590478341778119,
967
- "train_runtime": 22304.5183,
968
- "train_samples_per_second": 1.377,
969
- "train_steps_per_second": 0.043
970
- },
971
- {
972
- "epoch": 30.03125,
973
- "eval_accuracy": 0.4882943143812709,
974
- "eval_loss": 2.1814045906066895,
975
- "eval_runtime": 181.2056,
976
- "eval_samples_per_second": 1.65,
977
- "eval_steps_per_second": 0.055,
978
- "step": 960
979
- },
980
- {
981
- "epoch": 30.03125,
982
- "eval_accuracy": 0.4882943143812709,
983
- "eval_loss": 2.180596113204956,
984
- "eval_runtime": 179.6038,
985
- "eval_samples_per_second": 1.665,
986
- "eval_steps_per_second": 0.056,
987
- "step": 960
988
  }
989
  ],
990
  "logging_steps": 10,
991
- "max_steps": 960,
992
  "num_input_tokens_seen": 0,
993
  "num_train_epochs": 9223372036854775807,
994
  "save_steps": 500,
@@ -1004,7 +913,7 @@
1004
  "attributes": {}
1005
  }
1006
  },
1007
- "total_flos": 3.733004678582315e+19,
1008
  "train_batch_size": 32,
1009
  "trial_name": null,
1010
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.35570469798657717,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-783",
4
+ "epoch": 30.026442307692307,
5
  "eval_steps": 500,
6
+ "global_step": 832,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01201923076923077,
13
+ "grad_norm": 9.181777954101562,
14
+ "learning_rate": 5.9523809523809525e-06,
15
+ "loss": 4.2532,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.02403846153846154,
20
+ "grad_norm": 6.192462921142578,
21
+ "learning_rate": 1.1904761904761905e-05,
22
+ "loss": 4.2583,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.03245192307692308,
27
+ "eval_accuracy": 0.015659955257270694,
28
+ "eval_loss": 4.225129127502441,
29
+ "eval_runtime": 113.6438,
30
+ "eval_samples_per_second": 3.933,
31
+ "eval_steps_per_second": 0.123,
32
+ "step": 27
33
  },
34
  {
35
+ "epoch": 1.0036057692307692,
36
+ "grad_norm": 6.222796440124512,
37
+ "learning_rate": 1.785714285714286e-05,
38
+ "loss": 4.2505,
39
+ "step": 30
 
 
40
  },
41
  {
42
+ "epoch": 1.015625,
43
+ "grad_norm": 3.965486526489258,
44
+ "learning_rate": 2.380952380952381e-05,
45
+ "loss": 4.2489,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 1.0276442307692308,
50
+ "grad_norm": 5.0857744216918945,
51
+ "learning_rate": 2.9761904761904762e-05,
52
+ "loss": 4.2374,
53
  "step": 50
54
  },
55
  {
56
+ "epoch": 1.0324519230769231,
57
+ "eval_accuracy": 0.013422818791946308,
58
+ "eval_loss": 4.226736068725586,
59
+ "eval_runtime": 113.905,
60
+ "eval_samples_per_second": 3.924,
61
+ "eval_steps_per_second": 0.123,
62
+ "step": 54
63
  },
64
  {
65
+ "epoch": 2.0072115384615383,
66
+ "grad_norm": 3.5929408073425293,
67
+ "learning_rate": 3.571428571428572e-05,
68
+ "loss": 4.2403,
69
+ "step": 60
 
 
70
  },
71
  {
72
+ "epoch": 2.019230769230769,
73
+ "grad_norm": 3.6417300701141357,
74
+ "learning_rate": 4.166666666666667e-05,
75
+ "loss": 4.255,
76
  "step": 70
77
  },
78
  {
79
+ "epoch": 2.03125,
80
+ "grad_norm": 3.2748305797576904,
81
+ "learning_rate": 4.761904761904762e-05,
82
+ "loss": 4.2678,
83
  "step": 80
84
  },
85
  {
86
+ "epoch": 2.032451923076923,
87
+ "eval_accuracy": 0.013422818791946308,
88
+ "eval_loss": 4.2263360023498535,
89
+ "eval_runtime": 115.125,
90
+ "eval_samples_per_second": 3.883,
91
+ "eval_steps_per_second": 0.122,
92
+ "step": 81
93
  },
94
  {
95
+ "epoch": 3.0108173076923075,
96
+ "grad_norm": 3.1406173706054688,
97
+ "learning_rate": 4.959893048128342e-05,
98
+ "loss": 4.2151,
99
+ "step": 90
 
 
100
  },
101
  {
102
+ "epoch": 3.0228365384615383,
103
+ "grad_norm": 2.8441436290740967,
104
+ "learning_rate": 4.8930481283422465e-05,
105
+ "loss": 4.2537,
106
  "step": 100
107
  },
108
  {
109
+ "epoch": 3.032451923076923,
110
+ "eval_accuracy": 0.015659955257270694,
111
+ "eval_loss": 4.221240997314453,
112
+ "eval_runtime": 115.863,
113
+ "eval_samples_per_second": 3.858,
114
+ "eval_steps_per_second": 0.121,
115
+ "step": 108
116
+ },
117
+ {
118
+ "epoch": 4.002403846153846,
119
+ "grad_norm": 2.7364041805267334,
120
+ "learning_rate": 4.8262032085561496e-05,
121
+ "loss": 4.2568,
122
  "step": 110
123
  },
124
  {
125
+ "epoch": 4.014423076923077,
126
+ "grad_norm": 2.71181058883667,
127
+ "learning_rate": 4.759358288770054e-05,
128
+ "loss": 4.2288,
129
  "step": 120
130
  },
131
  {
132
+ "epoch": 4.0264423076923075,
133
+ "grad_norm": 2.7206692695617676,
134
+ "learning_rate": 4.6925133689839576e-05,
135
+ "loss": 4.2401,
136
+ "step": 130
 
 
137
  },
138
  {
139
+ "epoch": 4.032451923076923,
140
+ "eval_accuracy": 0.015659955257270694,
141
+ "eval_loss": 4.206515312194824,
142
+ "eval_runtime": 114.2862,
143
+ "eval_samples_per_second": 3.911,
144
+ "eval_steps_per_second": 0.122,
145
+ "step": 135
146
  },
147
  {
148
+ "epoch": 5.006009615384615,
149
+ "grad_norm": 2.6117472648620605,
150
+ "learning_rate": 4.625668449197861e-05,
151
+ "loss": 4.2192,
152
  "step": 140
153
  },
154
  {
155
+ "epoch": 5.018028846153846,
156
+ "grad_norm": 2.90580677986145,
157
+ "learning_rate": 4.558823529411765e-05,
158
+ "loss": 4.2204,
159
  "step": 150
160
  },
161
  {
162
+ "epoch": 5.030048076923077,
163
+ "grad_norm": 2.9660141468048096,
164
+ "learning_rate": 4.491978609625669e-05,
165
+ "loss": 4.2519,
166
+ "step": 160
 
 
167
  },
168
  {
169
+ "epoch": 5.032451923076923,
170
+ "eval_accuracy": 0.017897091722595078,
171
+ "eval_loss": 4.207516193389893,
172
+ "eval_runtime": 117.7024,
173
+ "eval_samples_per_second": 3.798,
174
+ "eval_steps_per_second": 0.119,
175
+ "step": 162
176
  },
177
  {
178
+ "epoch": 6.009615384615385,
179
+ "grad_norm": 2.8973159790039062,
180
+ "learning_rate": 4.4251336898395724e-05,
181
  "loss": 4.2112,
182
  "step": 170
183
  },
184
  {
185
+ "epoch": 6.021634615384615,
186
+ "grad_norm": 2.734694480895996,
187
+ "learning_rate": 4.358288770053476e-05,
188
+ "loss": 4.2198,
189
  "step": 180
190
  },
191
  {
192
+ "epoch": 6.032451923076923,
193
+ "eval_accuracy": 0.013422818791946308,
194
+ "eval_loss": 4.205463409423828,
195
+ "eval_runtime": 116.7575,
196
+ "eval_samples_per_second": 3.828,
197
+ "eval_steps_per_second": 0.12,
198
+ "step": 189
199
  },
200
  {
201
+ "epoch": 7.001201923076923,
202
+ "grad_norm": 2.421506881713867,
203
+ "learning_rate": 4.29144385026738e-05,
204
+ "loss": 4.2292,
205
  "step": 190
206
  },
207
  {
208
+ "epoch": 7.013221153846154,
209
+ "grad_norm": 2.82832670211792,
210
+ "learning_rate": 4.224598930481284e-05,
211
+ "loss": 4.2019,
212
  "step": 200
213
  },
214
  {
215
+ "epoch": 7.025240384615385,
216
+ "grad_norm": 3.0022928714752197,
217
+ "learning_rate": 4.157754010695187e-05,
218
+ "loss": 4.2111,
219
  "step": 210
220
  },
221
  {
222
+ "epoch": 7.032451923076923,
223
+ "eval_accuracy": 0.017897091722595078,
224
+ "eval_loss": 4.19577693939209,
225
+ "eval_runtime": 119.1446,
226
+ "eval_samples_per_second": 3.752,
227
+ "eval_steps_per_second": 0.118,
228
+ "step": 216
229
  },
230
  {
231
+ "epoch": 8.004807692307692,
232
+ "grad_norm": 2.6406443119049072,
233
+ "learning_rate": 4.0909090909090915e-05,
234
+ "loss": 4.2068,
235
  "step": 220
236
  },
237
  {
238
+ "epoch": 8.016826923076923,
239
+ "grad_norm": 3.169625759124756,
240
+ "learning_rate": 4.024064171122995e-05,
241
+ "loss": 4.1759,
242
  "step": 230
243
  },
244
  {
245
+ "epoch": 8.028846153846153,
246
+ "grad_norm": 2.8210370540618896,
247
+ "learning_rate": 3.957219251336899e-05,
248
+ "loss": 4.1871,
249
  "step": 240
250
  },
251
  {
252
+ "epoch": 8.032451923076923,
253
+ "eval_accuracy": 0.04697986577181208,
254
+ "eval_loss": 4.147375583648682,
255
+ "eval_runtime": 117.849,
256
+ "eval_samples_per_second": 3.793,
257
+ "eval_steps_per_second": 0.119,
258
+ "step": 243
259
  },
260
  {
261
+ "epoch": 9.008413461538462,
262
+ "grad_norm": 3.140075206756592,
263
+ "learning_rate": 3.8903743315508025e-05,
264
+ "loss": 4.1314,
265
  "step": 250
266
  },
267
  {
268
+ "epoch": 9.020432692307692,
269
+ "grad_norm": 3.0806028842926025,
270
+ "learning_rate": 3.8235294117647055e-05,
271
+ "loss": 4.0778,
272
  "step": 260
273
  },
274
  {
275
+ "epoch": 9.032451923076923,
276
+ "grad_norm": 12.136305809020996,
277
+ "learning_rate": 3.75668449197861e-05,
278
+ "loss": 4.0891,
279
  "step": 270
280
  },
281
  {
282
+ "epoch": 9.032451923076923,
283
+ "eval_accuracy": 0.0447427293064877,
284
+ "eval_loss": 4.032660007476807,
285
+ "eval_runtime": 116.5729,
286
+ "eval_samples_per_second": 3.835,
287
+ "eval_steps_per_second": 0.12,
288
+ "step": 270
289
  },
290
  {
291
+ "epoch": 10.01201923076923,
292
+ "grad_norm": 4.812855243682861,
293
+ "learning_rate": 3.6898395721925136e-05,
294
+ "loss": 3.9545,
295
  "step": 280
296
  },
297
  {
298
+ "epoch": 10.024038461538462,
299
+ "grad_norm": 5.218036651611328,
300
+ "learning_rate": 3.622994652406417e-05,
301
+ "loss": 3.7963,
302
  "step": 290
303
  },
304
  {
305
+ "epoch": 10.032451923076923,
306
+ "eval_accuracy": 0.08277404921700224,
307
+ "eval_loss": 3.82175874710083,
308
+ "eval_runtime": 115.9952,
309
+ "eval_samples_per_second": 3.854,
310
+ "eval_steps_per_second": 0.121,
311
+ "step": 297
312
  },
313
  {
314
+ "epoch": 11.00360576923077,
315
+ "grad_norm": 4.765311241149902,
316
+ "learning_rate": 3.556149732620321e-05,
317
+ "loss": 3.7306,
318
+ "step": 300
319
  },
320
  {
321
+ "epoch": 11.015625,
322
+ "grad_norm": 6.956701755523682,
323
+ "learning_rate": 3.489304812834225e-05,
324
+ "loss": 3.5563,
 
 
325
  "step": 310
326
  },
327
  {
328
+ "epoch": 11.02764423076923,
329
+ "grad_norm": 5.394344329833984,
330
+ "learning_rate": 3.4224598930481284e-05,
331
+ "loss": 3.4787,
332
  "step": 320
333
  },
334
  {
335
+ "epoch": 11.032451923076923,
336
+ "eval_accuracy": 0.11185682326621924,
337
+ "eval_loss": 3.7061688899993896,
338
+ "eval_runtime": 113.6071,
339
+ "eval_samples_per_second": 3.935,
340
+ "eval_steps_per_second": 0.123,
341
+ "step": 324
342
+ },
343
+ {
344
+ "epoch": 12.007211538461538,
345
+ "grad_norm": 5.70036506652832,
346
+ "learning_rate": 3.355614973262032e-05,
347
+ "loss": 3.4821,
348
  "step": 330
349
  },
350
  {
351
+ "epoch": 12.01923076923077,
352
+ "grad_norm": 6.370093822479248,
353
+ "learning_rate": 3.288770053475936e-05,
354
+ "loss": 3.3756,
355
  "step": 340
356
  },
357
  {
358
+ "epoch": 12.03125,
359
+ "grad_norm": 5.8564133644104,
360
+ "learning_rate": 3.22192513368984e-05,
361
+ "loss": 3.1883,
362
+ "step": 350
 
 
363
  },
364
  {
365
+ "epoch": 12.032451923076923,
366
+ "eval_accuracy": 0.11185682326621924,
367
+ "eval_loss": 3.58866810798645,
368
+ "eval_runtime": 112.5771,
369
+ "eval_samples_per_second": 3.971,
370
+ "eval_steps_per_second": 0.124,
371
+ "step": 351
372
  },
373
  {
374
+ "epoch": 13.010817307692308,
375
+ "grad_norm": 6.884145736694336,
376
+ "learning_rate": 3.155080213903743e-05,
377
+ "loss": 3.1542,
378
  "step": 360
379
  },
380
  {
381
+ "epoch": 13.022836538461538,
382
+ "grad_norm": 7.7231059074401855,
383
+ "learning_rate": 3.0882352941176475e-05,
384
+ "loss": 3.0045,
385
  "step": 370
386
  },
387
  {
388
+ "epoch": 13.032451923076923,
389
+ "eval_accuracy": 0.14317673378076062,
390
+ "eval_loss": 3.380981206893921,
391
+ "eval_runtime": 113.3509,
392
+ "eval_samples_per_second": 3.944,
393
+ "eval_steps_per_second": 0.124,
394
+ "step": 378
395
  },
396
  {
397
+ "epoch": 14.002403846153847,
398
+ "grad_norm": 6.160039901733398,
399
+ "learning_rate": 3.0213903743315508e-05,
400
+ "loss": 2.9704,
401
  "step": 380
402
  },
403
  {
404
+ "epoch": 14.014423076923077,
405
+ "grad_norm": 8.257513046264648,
406
+ "learning_rate": 2.954545454545455e-05,
407
+ "loss": 2.7723,
408
  "step": 390
409
  },
410
  {
411
+ "epoch": 14.026442307692308,
412
+ "grad_norm": 6.512996196746826,
413
+ "learning_rate": 2.8877005347593582e-05,
414
+ "loss": 2.8045,
415
  "step": 400
416
  },
417
  {
418
+ "epoch": 14.032451923076923,
419
+ "eval_accuracy": 0.21923937360178972,
420
+ "eval_loss": 3.2211973667144775,
421
+ "eval_runtime": 116.514,
422
+ "eval_samples_per_second": 3.836,
423
+ "eval_steps_per_second": 0.12,
424
+ "step": 405
425
  },
426
  {
427
+ "epoch": 15.006009615384615,
428
+ "grad_norm": 6.627754211425781,
429
+ "learning_rate": 2.8208556149732622e-05,
430
+ "loss": 2.7247,
431
  "step": 410
432
  },
433
  {
434
+ "epoch": 15.018028846153847,
435
+ "grad_norm": 6.555057048797607,
436
+ "learning_rate": 2.754010695187166e-05,
437
+ "loss": 2.5789,
438
  "step": 420
439
  },
440
  {
441
+ "epoch": 15.030048076923077,
442
+ "grad_norm": 8.675689697265625,
443
+ "learning_rate": 2.68716577540107e-05,
444
+ "loss": 2.5344,
445
  "step": 430
446
  },
447
  {
448
+ "epoch": 15.032451923076923,
449
+ "eval_accuracy": 0.15883668903803133,
450
+ "eval_loss": 3.270237922668457,
451
+ "eval_runtime": 114.1519,
452
+ "eval_samples_per_second": 3.916,
453
+ "eval_steps_per_second": 0.123,
454
+ "step": 432
455
  },
456
  {
457
+ "epoch": 16.009615384615383,
458
+ "grad_norm": 7.248364448547363,
459
+ "learning_rate": 2.6203208556149733e-05,
460
+ "loss": 2.3791,
461
  "step": 440
462
  },
463
  {
464
+ "epoch": 16.021634615384617,
465
+ "grad_norm": 9.409223556518555,
466
+ "learning_rate": 2.5534759358288773e-05,
467
+ "loss": 2.3725,
468
  "step": 450
469
  },
470
  {
471
+ "epoch": 16.032451923076923,
472
+ "eval_accuracy": 0.14093959731543623,
473
+ "eval_loss": 3.35996413230896,
474
+ "eval_runtime": 113.8728,
475
+ "eval_samples_per_second": 3.925,
476
+ "eval_steps_per_second": 0.123,
477
+ "step": 459
478
  },
479
  {
480
+ "epoch": 17.001201923076923,
481
+ "grad_norm": 7.834836006164551,
482
+ "learning_rate": 2.4866310160427807e-05,
483
+ "loss": 2.2722,
484
+ "step": 460
 
 
485
  },
486
  {
487
+ "epoch": 17.013221153846153,
488
+ "grad_norm": 7.643532752990723,
489
+ "learning_rate": 2.4197860962566847e-05,
490
+ "loss": 2.1434,
491
  "step": 470
492
  },
493
  {
494
+ "epoch": 17.025240384615383,
495
+ "grad_norm": 8.425023078918457,
496
+ "learning_rate": 2.3529411764705884e-05,
497
+ "loss": 2.2074,
498
  "step": 480
499
  },
500
  {
501
+ "epoch": 17.032451923076923,
502
+ "eval_accuracy": 0.2371364653243848,
503
+ "eval_loss": 2.9730582237243652,
504
+ "eval_runtime": 113.5853,
505
+ "eval_samples_per_second": 3.935,
506
+ "eval_steps_per_second": 0.123,
507
+ "step": 486
508
  },
509
  {
510
+ "epoch": 18.004807692307693,
511
+ "grad_norm": 7.840888500213623,
512
+ "learning_rate": 2.286096256684492e-05,
513
+ "loss": 2.0823,
514
+ "step": 490
 
 
515
  },
516
  {
517
+ "epoch": 18.016826923076923,
518
+ "grad_norm": 9.385211944580078,
519
+ "learning_rate": 2.2192513368983957e-05,
520
+ "loss": 1.9511,
521
  "step": 500
522
  },
523
  {
524
+ "epoch": 18.028846153846153,
525
+ "grad_norm": 8.72028636932373,
526
+ "learning_rate": 2.1524064171122994e-05,
527
+ "loss": 2.1094,
528
  "step": 510
529
  },
530
  {
531
+ "epoch": 18.032451923076923,
532
+ "eval_accuracy": 0.26174496644295303,
533
+ "eval_loss": 2.8679935932159424,
534
+ "eval_runtime": 113.6275,
535
+ "eval_samples_per_second": 3.934,
536
+ "eval_steps_per_second": 0.123,
537
+ "step": 513
538
  },
539
  {
540
+ "epoch": 19.00841346153846,
541
+ "grad_norm": 9.284242630004883,
542
+ "learning_rate": 2.0855614973262035e-05,
543
+ "loss": 2.0278,
544
+ "step": 520
 
 
545
  },
546
  {
547
+ "epoch": 19.020432692307693,
548
+ "grad_norm": 6.913205623626709,
549
+ "learning_rate": 2.018716577540107e-05,
550
+ "loss": 1.8727,
551
  "step": 530
552
  },
553
  {
554
+ "epoch": 19.032451923076923,
555
+ "grad_norm": 21.923686981201172,
556
+ "learning_rate": 1.951871657754011e-05,
557
+ "loss": 1.9839,
558
  "step": 540
559
  },
560
  {
561
+ "epoch": 19.032451923076923,
562
+ "eval_accuracy": 0.27069351230425054,
563
+ "eval_loss": 2.8359620571136475,
564
+ "eval_runtime": 115.0445,
565
+ "eval_samples_per_second": 3.885,
566
+ "eval_steps_per_second": 0.122,
567
+ "step": 540
568
  },
569
  {
570
+ "epoch": 20.01201923076923,
571
+ "grad_norm": 8.244462013244629,
572
+ "learning_rate": 1.8850267379679145e-05,
573
+ "loss": 1.7584,
574
+ "step": 550
 
 
575
  },
576
  {
577
+ "epoch": 20.02403846153846,
578
+ "grad_norm": 8.496162414550781,
579
+ "learning_rate": 1.8181818181818182e-05,
580
+ "loss": 1.7354,
581
  "step": 560
582
  },
583
  {
584
+ "epoch": 20.032451923076923,
585
+ "eval_accuracy": 0.28187919463087246,
586
+ "eval_loss": 2.7890186309814453,
587
+ "eval_runtime": 111.5479,
588
+ "eval_samples_per_second": 4.007,
589
+ "eval_steps_per_second": 0.126,
590
+ "step": 567
591
+ },
592
+ {
593
+ "epoch": 21.00360576923077,
594
+ "grad_norm": 7.818769931793213,
595
+ "learning_rate": 1.7513368983957222e-05,
596
+ "loss": 1.701,
597
  "step": 570
598
  },
599
  {
600
+ "epoch": 21.015625,
601
+ "grad_norm": 9.921280860900879,
602
+ "learning_rate": 1.684491978609626e-05,
603
+ "loss": 1.648,
604
  "step": 580
605
  },
606
  {
607
+ "epoch": 21.02764423076923,
608
+ "grad_norm": 8.053401947021484,
609
+ "learning_rate": 1.6176470588235296e-05,
610
+ "loss": 1.6843,
611
+ "step": 590
 
 
612
  },
613
  {
614
+ "epoch": 21.032451923076923,
615
+ "eval_accuracy": 0.29977628635346754,
616
+ "eval_loss": 2.728635787963867,
617
+ "eval_runtime": 114.938,
618
+ "eval_samples_per_second": 3.889,
619
+ "eval_steps_per_second": 0.122,
620
+ "step": 594
621
  },
622
  {
623
+ "epoch": 22.00721153846154,
624
+ "grad_norm": 10.640948295593262,
625
+ "learning_rate": 1.5508021390374333e-05,
626
+ "loss": 1.4562,
627
  "step": 600
628
  },
629
  {
630
+ "epoch": 22.01923076923077,
631
+ "grad_norm": 9.333779335021973,
632
+ "learning_rate": 1.4839572192513372e-05,
633
+ "loss": 1.5091,
634
  "step": 610
635
  },
636
  {
637
+ "epoch": 22.03125,
638
+ "grad_norm": 9.433292388916016,
639
+ "learning_rate": 1.4171122994652408e-05,
640
+ "loss": 1.6266,
641
  "step": 620
642
  },
643
  {
644
+ "epoch": 22.032451923076923,
645
+ "eval_accuracy": 0.2841163310961969,
646
+ "eval_loss": 2.806154489517212,
647
+ "eval_runtime": 115.4195,
648
+ "eval_samples_per_second": 3.873,
649
+ "eval_steps_per_second": 0.121,
650
+ "step": 621
651
  },
652
  {
653
+ "epoch": 23.010817307692307,
654
+ "grad_norm": 8.609794616699219,
655
+ "learning_rate": 1.3502673796791445e-05,
656
+ "loss": 1.6214,
657
  "step": 630
658
  },
659
  {
660
+ "epoch": 23.02283653846154,
661
+ "grad_norm": 6.943145275115967,
662
+ "learning_rate": 1.2834224598930484e-05,
663
+ "loss": 1.4083,
664
  "step": 640
665
  },
666
  {
667
+ "epoch": 23.032451923076923,
668
+ "eval_accuracy": 0.2595078299776286,
669
+ "eval_loss": 2.8204569816589355,
670
+ "eval_runtime": 115.0186,
671
+ "eval_samples_per_second": 3.886,
672
+ "eval_steps_per_second": 0.122,
673
+ "step": 648
674
  },
675
  {
676
+ "epoch": 24.002403846153847,
677
+ "grad_norm": 10.106169700622559,
678
+ "learning_rate": 1.2165775401069519e-05,
679
+ "loss": 1.3264,
680
+ "step": 650
 
 
681
  },
682
  {
683
+ "epoch": 24.014423076923077,
684
+ "grad_norm": 10.27568244934082,
685
+ "learning_rate": 1.1497326203208558e-05,
686
+ "loss": 1.3662,
687
  "step": 660
688
  },
689
  {
690
+ "epoch": 24.026442307692307,
691
+ "grad_norm": 10.36124038696289,
692
+ "learning_rate": 1.0828877005347594e-05,
693
+ "loss": 1.4422,
694
  "step": 670
695
  },
696
  {
697
+ "epoch": 24.032451923076923,
698
+ "eval_accuracy": 0.30648769574944074,
699
+ "eval_loss": 2.6406848430633545,
700
+ "eval_runtime": 116.1998,
701
+ "eval_samples_per_second": 3.847,
702
+ "eval_steps_per_second": 0.12,
703
+ "step": 675
704
  },
705
  {
706
+ "epoch": 25.006009615384617,
707
+ "grad_norm": 8.807711601257324,
708
+ "learning_rate": 1.0160427807486631e-05,
709
+ "loss": 1.3912,
710
+ "step": 680
 
 
711
  },
712
  {
713
+ "epoch": 25.018028846153847,
714
+ "grad_norm": 8.629064559936523,
715
+ "learning_rate": 9.49197860962567e-06,
716
+ "loss": 1.2904,
717
  "step": 690
718
  },
719
  {
720
+ "epoch": 25.030048076923077,
721
+ "grad_norm": 9.190649032592773,
722
+ "learning_rate": 8.823529411764707e-06,
723
+ "loss": 1.3897,
724
  "step": 700
725
  },
726
  {
727
+ "epoch": 25.032451923076923,
728
+ "eval_accuracy": 0.34675615212527966,
729
+ "eval_loss": 2.5948002338409424,
730
+ "eval_runtime": 118.4696,
731
+ "eval_samples_per_second": 3.773,
732
+ "eval_steps_per_second": 0.118,
733
+ "step": 702
734
  },
735
  {
736
+ "epoch": 26.009615384615383,
737
+ "grad_norm": 10.941615104675293,
738
+ "learning_rate": 8.155080213903744e-06,
739
+ "loss": 1.2647,
740
+ "step": 710
 
 
741
  },
742
  {
743
+ "epoch": 26.021634615384617,
744
+ "grad_norm": 9.282670021057129,
745
+ "learning_rate": 7.4866310160427806e-06,
746
+ "loss": 1.3906,
747
  "step": 720
748
  },
749
  {
750
+ "epoch": 26.032451923076923,
751
+ "eval_accuracy": 0.31543624161073824,
752
+ "eval_loss": 2.629518985748291,
753
+ "eval_runtime": 120.316,
754
+ "eval_samples_per_second": 3.715,
755
+ "eval_steps_per_second": 0.116,
756
+ "step": 729
757
+ },
758
+ {
759
+ "epoch": 27.001201923076923,
760
+ "grad_norm": 9.254827499389648,
761
+ "learning_rate": 6.818181818181818e-06,
762
+ "loss": 1.3257,
763
  "step": 730
764
  },
765
  {
766
+ "epoch": 27.013221153846153,
767
+ "grad_norm": 7.69126558303833,
768
+ "learning_rate": 6.149732620320856e-06,
769
+ "loss": 1.2581,
770
  "step": 740
771
  },
772
  {
773
+ "epoch": 27.025240384615383,
774
+ "grad_norm": 10.26952075958252,
775
+ "learning_rate": 5.481283422459893e-06,
776
+ "loss": 1.2291,
777
+ "step": 750
 
 
778
  },
779
  {
780
+ "epoch": 27.032451923076923,
781
+ "eval_accuracy": 0.3378076062639821,
782
+ "eval_loss": 2.5539445877075195,
783
+ "eval_runtime": 116.6007,
784
+ "eval_samples_per_second": 3.834,
785
+ "eval_steps_per_second": 0.12,
786
+ "step": 756
787
  },
788
  {
789
+ "epoch": 28.004807692307693,
790
+ "grad_norm": 11.07676887512207,
791
+ "learning_rate": 4.812834224598931e-06,
792
+ "loss": 1.29,
793
  "step": 760
794
  },
795
  {
796
+ "epoch": 28.016826923076923,
797
+ "grad_norm": 8.940054893493652,
798
+ "learning_rate": 4.144385026737968e-06,
799
+ "loss": 1.1383,
800
  "step": 770
801
  },
802
  {
803
+ "epoch": 28.028846153846153,
804
+ "grad_norm": 9.17213249206543,
805
+ "learning_rate": 3.4759358288770056e-06,
806
+ "loss": 1.3166,
807
+ "step": 780
 
 
808
  },
809
  {
810
+ "epoch": 28.032451923076923,
811
+ "eval_accuracy": 0.35570469798657717,
812
+ "eval_loss": 2.519993305206299,
813
+ "eval_runtime": 116.4901,
814
+ "eval_samples_per_second": 3.837,
815
+ "eval_steps_per_second": 0.12,
816
+ "step": 783
817
  },
818
  {
819
+ "epoch": 29.00841346153846,
820
+ "grad_norm": 11.884153366088867,
821
+ "learning_rate": 2.807486631016043e-06,
822
+ "loss": 1.232,
823
  "step": 790
824
  },
825
  {
826
+ "epoch": 29.020432692307693,
827
+ "grad_norm": 9.171028137207031,
828
+ "learning_rate": 2.1390374331550802e-06,
829
+ "loss": 1.1733,
830
  "step": 800
831
  },
832
  {
833
+ "epoch": 29.032451923076923,
834
+ "grad_norm": 24.7429256439209,
835
+ "learning_rate": 1.4705882352941177e-06,
836
+ "loss": 1.2619,
837
+ "step": 810
 
 
838
  },
839
  {
840
+ "epoch": 29.032451923076923,
841
+ "eval_accuracy": 0.35570469798657717,
842
+ "eval_loss": 2.530792236328125,
843
+ "eval_runtime": 118.4418,
844
+ "eval_samples_per_second": 3.774,
845
+ "eval_steps_per_second": 0.118,
846
  "step": 810
847
  },
848
  {
849
+ "epoch": 30.01201923076923,
850
+ "grad_norm": 8.288737297058105,
851
+ "learning_rate": 8.021390374331552e-07,
852
+ "loss": 1.1824,
853
  "step": 820
854
  },
855
  {
856
+ "epoch": 30.02403846153846,
857
+ "grad_norm": 8.749896049499512,
858
+ "learning_rate": 1.3368983957219251e-07,
859
+ "loss": 1.1393,
860
  "step": 830
861
  },
862
  {
863
+ "epoch": 30.026442307692307,
864
+ "eval_accuracy": 0.3243847874720358,
865
+ "eval_loss": 2.5855681896209717,
866
+ "eval_runtime": 118.0615,
867
+ "eval_samples_per_second": 3.786,
868
+ "eval_steps_per_second": 0.119,
869
+ "step": 832
870
+ },
871
+ {
872
+ "epoch": 30.026442307692307,
873
+ "step": 832,
874
+ "total_flos": 3.2147153612960956e+19,
875
+ "train_loss": 2.754230235058528,
876
+ "train_runtime": 10713.0029,
877
+ "train_samples_per_second": 2.485,
878
+ "train_steps_per_second": 0.078
879
+ },
880
+ {
881
+ "epoch": 30.026442307692307,
882
+ "eval_accuracy": 0.35570469798657717,
883
+ "eval_loss": 2.5194525718688965,
884
+ "eval_runtime": 114.2775,
885
+ "eval_samples_per_second": 3.912,
886
+ "eval_steps_per_second": 0.123,
887
+ "step": 832
888
+ },
889
+ {
890
+ "epoch": 30.026442307692307,
891
+ "eval_accuracy": 0.35570469798657717,
892
+ "eval_loss": 2.520230293273926,
893
+ "eval_runtime": 112.9248,
894
+ "eval_samples_per_second": 3.958,
895
+ "eval_steps_per_second": 0.124,
896
+ "step": 832
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  }
898
  ],
899
  "logging_steps": 10,
900
+ "max_steps": 832,
901
  "num_input_tokens_seen": 0,
902
  "num_train_epochs": 9223372036854775807,
903
  "save_steps": 500,
 
913
  "attributes": {}
914
  }
915
  },
916
+ "total_flos": 3.2147153612960956e+19,
917
  "train_batch_size": 32,
918
  "trial_name": null,
919
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7cc4787d1003025077041fe8562408bfc17781b67555b6a4d4679cffac3df35
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca0192f1ac816b4812752b0fe0fa76354065c0193d0654cff0a6bbaf0741618
3
  size 5368