ihsanahakiim commited on
Commit
e3f2c35
·
verified ·
1 Parent(s): a9d9650

End of training

Browse files
README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 2.1814
22
  - Accuracy: 0.4883
23
 
24
  ## Model description
 
18
 
19
  This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 2.1806
22
  - Accuracy: 0.4883
23
 
24
  ## Model description
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 15.03125,
3
- "eval_accuracy": 0.25752508361204013,
4
- "eval_loss": 3.0235204696655273,
5
- "eval_runtime": 157.8403,
6
- "eval_samples_per_second": 1.894,
7
- "eval_steps_per_second": 0.063
8
  }
 
1
  {
2
+ "epoch": 30.03125,
3
+ "eval_accuracy": 0.4882943143812709,
4
+ "eval_loss": 2.180596113204956,
5
+ "eval_runtime": 179.6038,
6
+ "eval_samples_per_second": 1.665,
7
+ "eval_steps_per_second": 0.056
8
  }
runs/Jan12_17-57-19_GAN-SVR/events.out.tfevents.1736694740.GAN-SVR.3870842.4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c0d6d3585cbc49ec1dfb467b11a21b44f1b626da41caf1f23cba4d5c3a7b00a
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a4152e639af0b4173a211351cdff00a09ef6d111370e2b8acf891f431281470
3
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 15.03125,
3
- "eval_accuracy": 0.25752508361204013,
4
- "eval_loss": 3.0235204696655273,
5
- "eval_runtime": 157.8403,
6
- "eval_samples_per_second": 1.894,
7
- "eval_steps_per_second": 0.063
8
  }
 
1
  {
2
+ "epoch": 30.03125,
3
+ "eval_accuracy": 0.4882943143812709,
4
+ "eval_loss": 2.180596113204956,
5
+ "eval_runtime": 179.6038,
6
+ "eval_samples_per_second": 1.665,
7
+ "eval_steps_per_second": 0.056
8
  }
trainer_state.json CHANGED
@@ -1,523 +1,994 @@
1
  {
2
- "best_metric": 0.25752508361204013,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-465",
4
- "epoch": 15.03125,
5
  "eval_steps": 500,
6
- "global_step": 480,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.020833333333333332,
13
- "grad_norm": 4.865664958953857,
14
- "learning_rate": 1.0416666666666668e-05,
15
- "loss": 4.2486,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.041666666666666664,
20
- "grad_norm": 5.374879837036133,
21
- "learning_rate": 2.0833333333333336e-05,
22
- "loss": 4.2806,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.0625,
27
- "grad_norm": 4.254001140594482,
28
- "learning_rate": 3.125e-05,
29
- "loss": 4.2682,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.06458333333333334,
34
  "eval_accuracy": 0.0033444816053511705,
35
- "eval_loss": 4.24931526184082,
36
- "eval_runtime": 156.0528,
37
- "eval_samples_per_second": 1.916,
38
- "eval_steps_per_second": 0.064,
39
  "step": 31
40
  },
41
  {
42
- "epoch": 1.01875,
43
- "grad_norm": 3.580606698989868,
44
- "learning_rate": 4.166666666666667e-05,
45
- "loss": 4.249,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.0395833333333333,
50
- "grad_norm": 3.6750593185424805,
51
- "learning_rate": 4.976851851851852e-05,
52
- "loss": 4.2497,
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.0604166666666666,
57
- "grad_norm": 3.427530527114868,
58
- "learning_rate": 4.8611111111111115e-05,
59
- "loss": 4.2584,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.0645833333333334,
64
- "eval_accuracy": 0.013377926421404682,
65
- "eval_loss": 4.243381023406982,
66
- "eval_runtime": 184.9,
67
- "eval_samples_per_second": 1.617,
68
- "eval_steps_per_second": 0.054,
69
  "step": 62
70
  },
71
  {
72
- "epoch": 2.0166666666666666,
73
- "grad_norm": 3.5070836544036865,
74
- "learning_rate": 4.745370370370371e-05,
75
- "loss": 4.221,
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.0375,
80
- "grad_norm": 3.814365863800049,
81
- "learning_rate": 4.62962962962963e-05,
82
- "loss": 4.249,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.058333333333333,
87
- "grad_norm": 3.279081344604492,
88
- "learning_rate": 4.5138888888888894e-05,
89
- "loss": 4.2518,
90
  "step": 90
91
  },
92
  {
93
- "epoch": 2.064583333333333,
94
- "eval_accuracy": 0.016722408026755852,
95
- "eval_loss": 4.224606990814209,
96
- "eval_runtime": 182.615,
97
- "eval_samples_per_second": 1.637,
98
- "eval_steps_per_second": 0.055,
99
  "step": 93
100
  },
101
  {
102
- "epoch": 3.0145833333333334,
103
- "grad_norm": 2.9108073711395264,
104
- "learning_rate": 4.3981481481481486e-05,
105
- "loss": 4.2168,
106
  "step": 100
107
  },
108
  {
109
- "epoch": 3.035416666666667,
110
- "grad_norm": 3.1853749752044678,
111
- "learning_rate": 4.282407407407408e-05,
112
- "loss": 4.2103,
113
  "step": 110
114
  },
115
  {
116
- "epoch": 3.05625,
117
- "grad_norm": 3.1384546756744385,
118
- "learning_rate": 4.166666666666667e-05,
119
  "loss": 4.2445,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 3.064583333333333,
124
  "eval_accuracy": 0.006688963210702341,
125
- "eval_loss": 4.220835208892822,
126
- "eval_runtime": 172.1522,
127
- "eval_samples_per_second": 1.737,
128
- "eval_steps_per_second": 0.058,
129
  "step": 124
130
  },
131
  {
132
- "epoch": 4.0125,
133
- "grad_norm": 2.7196435928344727,
134
- "learning_rate": 4.0509259259259265e-05,
135
- "loss": 4.2159,
136
  "step": 130
137
  },
138
  {
139
- "epoch": 4.033333333333333,
140
- "grad_norm": 2.6634223461151123,
141
- "learning_rate": 3.935185185185186e-05,
142
- "loss": 4.2063,
143
  "step": 140
144
  },
145
  {
146
- "epoch": 4.054166666666666,
147
- "grad_norm": 2.5531411170959473,
148
- "learning_rate": 3.8194444444444444e-05,
149
- "loss": 4.2272,
150
  "step": 150
151
  },
152
  {
153
- "epoch": 4.064583333333333,
154
  "eval_accuracy": 0.010033444816053512,
155
- "eval_loss": 4.223015308380127,
156
- "eval_runtime": 163.7094,
157
- "eval_samples_per_second": 1.826,
158
- "eval_steps_per_second": 0.061,
159
  "step": 155
160
  },
161
  {
162
- "epoch": 5.010416666666667,
163
- "grad_norm": 2.824289321899414,
164
- "learning_rate": 3.7037037037037037e-05,
165
- "loss": 4.2151,
166
  "step": 160
167
  },
168
  {
169
- "epoch": 5.03125,
170
- "grad_norm": 2.8366966247558594,
171
- "learning_rate": 3.587962962962963e-05,
172
- "loss": 4.183,
173
  "step": 170
174
  },
175
  {
176
- "epoch": 5.052083333333333,
177
- "grad_norm": 3.340677499771118,
178
- "learning_rate": 3.472222222222222e-05,
179
- "loss": 4.205,
180
  "step": 180
181
  },
182
  {
183
- "epoch": 5.064583333333333,
184
- "eval_accuracy": 0.023411371237458192,
185
- "eval_loss": 4.211067199707031,
186
- "eval_runtime": 166.937,
187
- "eval_samples_per_second": 1.791,
188
- "eval_steps_per_second": 0.06,
189
  "step": 186
190
  },
191
  {
192
- "epoch": 6.008333333333334,
193
- "grad_norm": 2.784593105316162,
194
- "learning_rate": 3.3564814814814815e-05,
195
- "loss": 4.1898,
196
  "step": 190
197
  },
198
  {
199
- "epoch": 6.029166666666667,
200
- "grad_norm": 3.389150381088257,
201
- "learning_rate": 3.240740740740741e-05,
202
- "loss": 4.1386,
203
  "step": 200
204
  },
205
  {
206
- "epoch": 6.05,
207
- "grad_norm": 3.5054867267608643,
208
- "learning_rate": 3.125e-05,
209
- "loss": 4.1238,
210
  "step": 210
211
  },
212
  {
213
- "epoch": 6.064583333333333,
214
- "eval_accuracy": 0.03678929765886288,
215
- "eval_loss": 4.111179828643799,
216
- "eval_runtime": 161.8854,
217
- "eval_samples_per_second": 1.847,
218
- "eval_steps_per_second": 0.062,
219
  "step": 217
220
  },
221
  {
222
- "epoch": 7.00625,
223
- "grad_norm": 4.150498867034912,
224
- "learning_rate": 3.0092592592592593e-05,
225
- "loss": 4.0898,
226
  "step": 220
227
  },
228
  {
229
- "epoch": 7.027083333333334,
230
- "grad_norm": 4.683104038238525,
231
- "learning_rate": 2.8935185185185186e-05,
232
- "loss": 4.0073,
233
  "step": 230
234
  },
235
  {
236
- "epoch": 7.047916666666667,
237
- "grad_norm": 4.379587650299072,
238
- "learning_rate": 2.777777777777778e-05,
239
- "loss": 3.9136,
240
  "step": 240
241
  },
242
  {
243
- "epoch": 7.064583333333333,
244
- "eval_accuracy": 0.07357859531772576,
245
- "eval_loss": 3.8529512882232666,
246
- "eval_runtime": 164.2718,
247
- "eval_samples_per_second": 1.82,
248
- "eval_steps_per_second": 0.061,
249
  "step": 248
250
  },
251
  {
252
- "epoch": 8.004166666666666,
253
- "grad_norm": 5.4900078773498535,
254
- "learning_rate": 2.6620370370370372e-05,
255
- "loss": 3.8304,
256
  "step": 250
257
  },
258
  {
259
- "epoch": 8.025,
260
- "grad_norm": 4.974089622497559,
261
- "learning_rate": 2.5462962962962965e-05,
262
- "loss": 3.7169,
263
  "step": 260
264
  },
265
  {
266
- "epoch": 8.045833333333333,
267
- "grad_norm": 5.386253833770752,
268
- "learning_rate": 2.4305555555555558e-05,
269
- "loss": 3.6241,
270
  "step": 270
271
  },
272
  {
273
- "epoch": 8.064583333333333,
274
- "eval_accuracy": 0.11705685618729098,
275
- "eval_loss": 3.6734354496002197,
276
- "eval_runtime": 159.295,
277
- "eval_samples_per_second": 1.877,
278
- "eval_steps_per_second": 0.063,
279
  "step": 279
280
  },
281
  {
282
- "epoch": 9.002083333333333,
283
- "grad_norm": 5.429062843322754,
284
- "learning_rate": 2.314814814814815e-05,
285
- "loss": 3.4885,
286
  "step": 280
287
  },
288
  {
289
- "epoch": 9.022916666666667,
290
- "grad_norm": 5.275113105773926,
291
- "learning_rate": 2.1990740740740743e-05,
292
- "loss": 3.4022,
293
  "step": 290
294
  },
295
  {
296
- "epoch": 9.04375,
297
- "grad_norm": 6.2369704246521,
298
- "learning_rate": 2.0833333333333336e-05,
299
- "loss": 3.2977,
300
  "step": 300
301
  },
302
  {
303
- "epoch": 9.064583333333333,
304
- "grad_norm": 13.635384559631348,
305
- "learning_rate": 1.967592592592593e-05,
306
- "loss": 3.3103,
307
  "step": 310
308
  },
309
  {
310
- "epoch": 9.064583333333333,
311
  "eval_accuracy": 0.10702341137123746,
312
- "eval_loss": 3.5260610580444336,
313
- "eval_runtime": 170.757,
314
- "eval_samples_per_second": 1.751,
315
- "eval_steps_per_second": 0.059,
316
  "step": 310
317
  },
318
  {
319
- "epoch": 10.020833333333334,
320
- "grad_norm": 6.099160194396973,
321
- "learning_rate": 1.8518518518518518e-05,
322
- "loss": 3.1017,
323
  "step": 320
324
  },
325
  {
326
- "epoch": 10.041666666666666,
327
- "grad_norm": 8.271078109741211,
328
- "learning_rate": 1.736111111111111e-05,
329
- "loss": 3.0604,
330
  "step": 330
331
  },
332
  {
333
- "epoch": 10.0625,
334
- "grad_norm": 6.345729827880859,
335
- "learning_rate": 1.6203703703703704e-05,
336
- "loss": 3.0981,
337
  "step": 340
338
  },
339
  {
340
- "epoch": 10.064583333333333,
341
- "eval_accuracy": 0.16387959866220736,
342
- "eval_loss": 3.3859572410583496,
343
- "eval_runtime": 165.4502,
344
- "eval_samples_per_second": 1.807,
345
- "eval_steps_per_second": 0.06,
346
  "step": 341
347
  },
348
  {
349
- "epoch": 11.01875,
350
- "grad_norm": 6.985031604766846,
351
- "learning_rate": 1.5046296296296297e-05,
352
- "loss": 2.9706,
353
  "step": 350
354
  },
355
  {
356
- "epoch": 11.039583333333333,
357
- "grad_norm": 6.81059455871582,
358
- "learning_rate": 1.388888888888889e-05,
359
- "loss": 2.916,
360
  "step": 360
361
  },
362
  {
363
- "epoch": 11.060416666666667,
364
- "grad_norm": 6.5202436447143555,
365
- "learning_rate": 1.2731481481481482e-05,
366
- "loss": 2.8216,
367
  "step": 370
368
  },
369
  {
370
- "epoch": 11.064583333333333,
371
- "eval_accuracy": 0.2140468227424749,
372
- "eval_loss": 3.179076671600342,
373
- "eval_runtime": 163.8066,
374
- "eval_samples_per_second": 1.825,
375
- "eval_steps_per_second": 0.061,
376
  "step": 372
377
  },
378
  {
379
- "epoch": 12.016666666666667,
380
- "grad_norm": 5.686159610748291,
381
- "learning_rate": 1.1574074074074075e-05,
382
- "loss": 2.7755,
383
  "step": 380
384
  },
385
  {
386
- "epoch": 12.0375,
387
- "grad_norm": 6.596237659454346,
388
- "learning_rate": 1.0416666666666668e-05,
389
- "loss": 2.824,
390
  "step": 390
391
  },
392
  {
393
- "epoch": 12.058333333333334,
394
- "grad_norm": 7.551661968231201,
395
- "learning_rate": 9.259259259259259e-06,
396
- "loss": 2.6108,
397
  "step": 400
398
  },
399
  {
400
- "epoch": 12.064583333333333,
401
- "eval_accuracy": 0.24414715719063546,
402
- "eval_loss": 3.1618316173553467,
403
- "eval_runtime": 177.1262,
404
- "eval_samples_per_second": 1.688,
405
- "eval_steps_per_second": 0.056,
406
  "step": 403
407
  },
408
  {
409
- "epoch": 13.014583333333333,
410
- "grad_norm": 6.725275039672852,
411
- "learning_rate": 8.101851851851852e-06,
412
- "loss": 2.7309,
413
  "step": 410
414
  },
415
  {
416
- "epoch": 13.035416666666666,
417
- "grad_norm": 7.1535563468933105,
418
- "learning_rate": 6.944444444444445e-06,
419
- "loss": 2.6451,
420
  "step": 420
421
  },
422
  {
423
- "epoch": 13.05625,
424
- "grad_norm": 7.705063819885254,
425
- "learning_rate": 5.787037037037038e-06,
426
- "loss": 2.598,
427
  "step": 430
428
  },
429
  {
430
- "epoch": 13.064583333333333,
431
- "eval_accuracy": 0.23411371237458195,
432
- "eval_loss": 3.0792758464813232,
433
- "eval_runtime": 186.6022,
434
- "eval_samples_per_second": 1.602,
435
- "eval_steps_per_second": 0.054,
436
  "step": 434
437
  },
438
  {
439
- "epoch": 14.0125,
440
- "grad_norm": 5.5132927894592285,
441
- "learning_rate": 4.6296296296296296e-06,
442
- "loss": 2.5532,
443
  "step": 440
444
  },
445
  {
446
- "epoch": 14.033333333333333,
447
- "grad_norm": 6.853929042816162,
448
- "learning_rate": 3.4722222222222224e-06,
449
- "loss": 2.56,
450
  "step": 450
451
  },
452
  {
453
- "epoch": 14.054166666666667,
454
- "grad_norm": 6.731771469116211,
455
- "learning_rate": 2.3148148148148148e-06,
456
- "loss": 2.5023,
457
  "step": 460
458
  },
459
  {
460
- "epoch": 14.064583333333333,
461
- "eval_accuracy": 0.25752508361204013,
462
- "eval_loss": 3.019373893737793,
463
- "eval_runtime": 155.2839,
464
- "eval_samples_per_second": 1.926,
465
- "eval_steps_per_second": 0.064,
466
  "step": 465
467
  },
468
  {
469
- "epoch": 15.010416666666666,
470
- "grad_norm": 6.570545673370361,
471
- "learning_rate": 1.1574074074074074e-06,
472
- "loss": 2.5538,
473
  "step": 470
474
  },
475
  {
476
- "epoch": 15.03125,
477
- "grad_norm": 6.480160236358643,
478
- "learning_rate": 0.0,
479
- "loss": 2.513,
480
  "step": 480
481
  },
482
  {
483
- "epoch": 15.03125,
484
- "eval_accuracy": 0.23745819397993312,
485
- "eval_loss": 3.0668206214904785,
486
- "eval_runtime": 187.3659,
487
- "eval_samples_per_second": 1.596,
488
- "eval_steps_per_second": 0.053,
489
- "step": 480
490
  },
491
  {
492
- "epoch": 15.03125,
493
- "step": 480,
494
- "total_flos": 1.8664399999458017e+19,
495
- "train_loss": 3.5868410070737204,
496
- "train_runtime": 12231.8484,
497
- "train_samples_per_second": 1.256,
498
- "train_steps_per_second": 0.039
499
  },
500
  {
501
- "epoch": 15.03125,
502
- "eval_accuracy": 0.25752508361204013,
503
- "eval_loss": 3.0240976810455322,
504
- "eval_runtime": 158.7962,
505
- "eval_samples_per_second": 1.883,
506
- "eval_steps_per_second": 0.063,
507
- "step": 480
508
  },
509
  {
510
- "epoch": 15.03125,
511
- "eval_accuracy": 0.25752508361204013,
512
- "eval_loss": 3.0235204696655273,
513
- "eval_runtime": 157.8403,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  "eval_samples_per_second": 1.894,
515
  "eval_steps_per_second": 0.063,
516
- "step": 480
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517
  }
518
  ],
519
  "logging_steps": 10,
520
- "max_steps": 480,
521
  "num_input_tokens_seen": 0,
522
  "num_train_epochs": 9223372036854775807,
523
  "save_steps": 500,
@@ -533,7 +1004,7 @@
533
  "attributes": {}
534
  }
535
  },
536
- "total_flos": 1.8664399999458017e+19,
537
  "train_batch_size": 32,
538
  "trial_name": null,
539
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.4882943143812709,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-806",
4
+ "epoch": 30.03125,
5
  "eval_steps": 500,
6
+ "global_step": 960,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.010416666666666666,
13
+ "grad_norm": 4.111547470092773,
14
+ "learning_rate": 5.208333333333334e-06,
15
+ "loss": 4.2127,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.020833333333333332,
20
+ "grad_norm": 4.218038082122803,
21
+ "learning_rate": 1.0416666666666668e-05,
22
+ "loss": 4.2247,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.03125,
27
+ "grad_norm": 3.766047954559326,
28
+ "learning_rate": 1.5625e-05,
29
+ "loss": 4.2427,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.03229166666666667,
34
  "eval_accuracy": 0.0033444816053511705,
35
+ "eval_loss": 4.22645378112793,
36
+ "eval_runtime": 157.4916,
37
+ "eval_samples_per_second": 1.899,
38
+ "eval_steps_per_second": 0.063,
39
  "step": 31
40
  },
41
  {
42
+ "epoch": 1.009375,
43
+ "grad_norm": 4.1204304695129395,
44
+ "learning_rate": 2.0833333333333336e-05,
45
+ "loss": 4.2374,
46
  "step": 40
47
  },
48
  {
49
+ "epoch": 1.0197916666666667,
50
+ "grad_norm": 4.202085971832275,
51
+ "learning_rate": 2.604166666666667e-05,
52
+ "loss": 4.228,
53
  "step": 50
54
  },
55
  {
56
+ "epoch": 1.0302083333333334,
57
+ "grad_norm": 3.5194547176361084,
58
+ "learning_rate": 3.125e-05,
59
+ "loss": 4.2321,
60
  "step": 60
61
  },
62
  {
63
+ "epoch": 1.0322916666666666,
64
+ "eval_accuracy": 0.010033444816053512,
65
+ "eval_loss": 4.223534107208252,
66
+ "eval_runtime": 156.7976,
67
+ "eval_samples_per_second": 1.907,
68
+ "eval_steps_per_second": 0.064,
69
  "step": 62
70
  },
71
  {
72
+ "epoch": 2.0083333333333333,
73
+ "grad_norm": 3.131030559539795,
74
+ "learning_rate": 3.6458333333333336e-05,
75
+ "loss": 4.2293,
76
  "step": 70
77
  },
78
  {
79
+ "epoch": 2.01875,
80
+ "grad_norm": 3.120128631591797,
81
+ "learning_rate": 4.166666666666667e-05,
82
+ "loss": 4.2347,
83
  "step": 80
84
  },
85
  {
86
+ "epoch": 2.029166666666667,
87
+ "grad_norm": 3.0988192558288574,
88
+ "learning_rate": 4.6875e-05,
89
+ "loss": 4.24,
90
  "step": 90
91
  },
92
  {
93
+ "epoch": 2.0322916666666666,
94
+ "eval_accuracy": 0.010033444816053512,
95
+ "eval_loss": 4.228224754333496,
96
+ "eval_runtime": 156.5504,
97
+ "eval_samples_per_second": 1.91,
98
+ "eval_steps_per_second": 0.064,
99
  "step": 93
100
  },
101
  {
102
+ "epoch": 3.0072916666666667,
103
+ "grad_norm": 3.102827310562134,
104
+ "learning_rate": 4.976851851851852e-05,
105
+ "loss": 4.2195,
106
  "step": 100
107
  },
108
  {
109
+ "epoch": 3.017708333333333,
110
+ "grad_norm": 2.8366568088531494,
111
+ "learning_rate": 4.9189814814814815e-05,
112
+ "loss": 4.2354,
113
  "step": 110
114
  },
115
  {
116
+ "epoch": 3.028125,
117
+ "grad_norm": 3.0085482597351074,
118
+ "learning_rate": 4.8611111111111115e-05,
119
  "loss": 4.2445,
120
  "step": 120
121
  },
122
  {
123
+ "epoch": 3.0322916666666666,
124
  "eval_accuracy": 0.006688963210702341,
125
+ "eval_loss": 4.224982261657715,
126
+ "eval_runtime": 151.1437,
127
+ "eval_samples_per_second": 1.978,
128
+ "eval_steps_per_second": 0.066,
129
  "step": 124
130
  },
131
  {
132
+ "epoch": 4.00625,
133
+ "grad_norm": 3.300618886947632,
134
+ "learning_rate": 4.803240740740741e-05,
135
+ "loss": 4.2226,
136
  "step": 130
137
  },
138
  {
139
+ "epoch": 4.016666666666667,
140
+ "grad_norm": 3.202220916748047,
141
+ "learning_rate": 4.745370370370371e-05,
142
+ "loss": 4.2265,
143
  "step": 140
144
  },
145
  {
146
+ "epoch": 4.027083333333334,
147
+ "grad_norm": 2.977271556854248,
148
+ "learning_rate": 4.6875e-05,
149
+ "loss": 4.2327,
150
  "step": 150
151
  },
152
  {
153
+ "epoch": 4.032291666666667,
154
  "eval_accuracy": 0.010033444816053512,
155
+ "eval_loss": 4.2244367599487305,
156
+ "eval_runtime": 156.5602,
157
+ "eval_samples_per_second": 1.91,
158
+ "eval_steps_per_second": 0.064,
159
  "step": 155
160
  },
161
  {
162
+ "epoch": 5.005208333333333,
163
+ "grad_norm": 3.7291171550750732,
164
+ "learning_rate": 4.62962962962963e-05,
165
+ "loss": 4.2106,
166
  "step": 160
167
  },
168
  {
169
+ "epoch": 5.015625,
170
+ "grad_norm": 3.581210136413574,
171
+ "learning_rate": 4.5717592592592594e-05,
172
+ "loss": 4.2112,
173
  "step": 170
174
  },
175
  {
176
+ "epoch": 5.026041666666667,
177
+ "grad_norm": 3.7153217792510986,
178
+ "learning_rate": 4.5138888888888894e-05,
179
+ "loss": 4.2104,
180
  "step": 180
181
  },
182
  {
183
+ "epoch": 5.032291666666667,
184
+ "eval_accuracy": 0.020066889632107024,
185
+ "eval_loss": 4.2100114822387695,
186
+ "eval_runtime": 150.442,
187
+ "eval_samples_per_second": 1.987,
188
+ "eval_steps_per_second": 0.066,
189
  "step": 186
190
  },
191
  {
192
+ "epoch": 6.004166666666666,
193
+ "grad_norm": 3.1238365173339844,
194
+ "learning_rate": 4.456018518518519e-05,
195
+ "loss": 4.2205,
196
  "step": 190
197
  },
198
  {
199
+ "epoch": 6.014583333333333,
200
+ "grad_norm": 3.4233925342559814,
201
+ "learning_rate": 4.3981481481481486e-05,
202
+ "loss": 4.2012,
203
  "step": 200
204
  },
205
  {
206
+ "epoch": 6.025,
207
+ "grad_norm": 3.511300563812256,
208
+ "learning_rate": 4.340277777777778e-05,
209
+ "loss": 4.2374,
210
  "step": 210
211
  },
212
  {
213
+ "epoch": 6.032291666666667,
214
+ "eval_accuracy": 0.006688963210702341,
215
+ "eval_loss": 4.2022294998168945,
216
+ "eval_runtime": 153.3689,
217
+ "eval_samples_per_second": 1.95,
218
+ "eval_steps_per_second": 0.065,
219
  "step": 217
220
  },
221
  {
222
+ "epoch": 7.003125,
223
+ "grad_norm": 3.343992233276367,
224
+ "learning_rate": 4.282407407407408e-05,
225
+ "loss": 4.2055,
226
  "step": 220
227
  },
228
  {
229
+ "epoch": 7.013541666666667,
230
+ "grad_norm": 2.911720037460327,
231
+ "learning_rate": 4.224537037037037e-05,
232
+ "loss": 4.1687,
233
  "step": 230
234
  },
235
  {
236
+ "epoch": 7.023958333333334,
237
+ "grad_norm": 3.1043291091918945,
238
+ "learning_rate": 4.166666666666667e-05,
239
+ "loss": 4.1597,
240
  "step": 240
241
  },
242
  {
243
+ "epoch": 7.032291666666667,
244
+ "eval_accuracy": 0.030100334448160536,
245
+ "eval_loss": 4.118756294250488,
246
+ "eval_runtime": 152.3194,
247
+ "eval_samples_per_second": 1.963,
248
+ "eval_steps_per_second": 0.066,
249
  "step": 248
250
  },
251
  {
252
+ "epoch": 8.002083333333333,
253
+ "grad_norm": 3.4208502769470215,
254
+ "learning_rate": 4.1087962962962965e-05,
255
+ "loss": 4.1418,
256
  "step": 250
257
  },
258
  {
259
+ "epoch": 8.0125,
260
+ "grad_norm": 4.338245868682861,
261
+ "learning_rate": 4.0509259259259265e-05,
262
+ "loss": 4.0753,
263
  "step": 260
264
  },
265
  {
266
+ "epoch": 8.022916666666667,
267
+ "grad_norm": 4.86409854888916,
268
+ "learning_rate": 3.993055555555556e-05,
269
+ "loss": 4.0522,
270
  "step": 270
271
  },
272
  {
273
+ "epoch": 8.032291666666667,
274
+ "eval_accuracy": 0.07023411371237458,
275
+ "eval_loss": 3.935123920440674,
276
+ "eval_runtime": 155.145,
277
+ "eval_samples_per_second": 1.927,
278
+ "eval_steps_per_second": 0.064,
279
  "step": 279
280
  },
281
  {
282
+ "epoch": 9.001041666666667,
283
+ "grad_norm": 4.609180927276611,
284
+ "learning_rate": 3.935185185185186e-05,
285
+ "loss": 3.9967,
286
  "step": 280
287
  },
288
  {
289
+ "epoch": 9.011458333333334,
290
+ "grad_norm": 5.407143592834473,
291
+ "learning_rate": 3.877314814814815e-05,
292
+ "loss": 3.8626,
293
  "step": 290
294
  },
295
  {
296
+ "epoch": 9.021875,
297
+ "grad_norm": 4.2119598388671875,
298
+ "learning_rate": 3.8194444444444444e-05,
299
+ "loss": 3.821,
300
  "step": 300
301
  },
302
  {
303
+ "epoch": 9.032291666666667,
304
+ "grad_norm": 14.079527854919434,
305
+ "learning_rate": 3.7615740740740744e-05,
306
+ "loss": 3.768,
307
  "step": 310
308
  },
309
  {
310
+ "epoch": 9.032291666666667,
311
  "eval_accuracy": 0.10702341137123746,
312
+ "eval_loss": 3.680009126663208,
313
+ "eval_runtime": 152.6981,
314
+ "eval_samples_per_second": 1.958,
315
+ "eval_steps_per_second": 0.065,
316
  "step": 310
317
  },
318
  {
319
+ "epoch": 10.010416666666666,
320
+ "grad_norm": 4.879744529724121,
321
+ "learning_rate": 3.7037037037037037e-05,
322
+ "loss": 3.54,
323
  "step": 320
324
  },
325
  {
326
+ "epoch": 10.020833333333334,
327
+ "grad_norm": 6.769280910491943,
328
+ "learning_rate": 3.6458333333333336e-05,
329
+ "loss": 3.6435,
330
  "step": 330
331
  },
332
  {
333
+ "epoch": 10.03125,
334
+ "grad_norm": 6.974754810333252,
335
+ "learning_rate": 3.587962962962963e-05,
336
+ "loss": 3.5147,
337
  "step": 340
338
  },
339
  {
340
+ "epoch": 10.032291666666667,
341
+ "eval_accuracy": 0.11036789297658862,
342
+ "eval_loss": 3.541635036468506,
343
+ "eval_runtime": 156.1418,
344
+ "eval_samples_per_second": 1.915,
345
+ "eval_steps_per_second": 0.064,
346
  "step": 341
347
  },
348
  {
349
+ "epoch": 11.009375,
350
+ "grad_norm": 6.004092216491699,
351
+ "learning_rate": 3.530092592592593e-05,
352
+ "loss": 3.2736,
353
  "step": 350
354
  },
355
  {
356
+ "epoch": 11.019791666666666,
357
+ "grad_norm": 4.7057905197143555,
358
+ "learning_rate": 3.472222222222222e-05,
359
+ "loss": 3.2459,
360
  "step": 360
361
  },
362
  {
363
+ "epoch": 11.030208333333333,
364
+ "grad_norm": 7.564170837402344,
365
+ "learning_rate": 3.414351851851852e-05,
366
+ "loss": 3.2878,
367
  "step": 370
368
  },
369
  {
370
+ "epoch": 11.032291666666667,
371
+ "eval_accuracy": 0.07023411371237458,
372
+ "eval_loss": 3.707416534423828,
373
+ "eval_runtime": 151.3197,
374
+ "eval_samples_per_second": 1.976,
375
+ "eval_steps_per_second": 0.066,
376
  "step": 372
377
  },
378
  {
379
+ "epoch": 12.008333333333333,
380
+ "grad_norm": 6.44061279296875,
381
+ "learning_rate": 3.3564814814814815e-05,
382
+ "loss": 3.048,
383
  "step": 380
384
  },
385
  {
386
+ "epoch": 12.01875,
387
+ "grad_norm": 6.303668022155762,
388
+ "learning_rate": 3.2986111111111115e-05,
389
+ "loss": 3.0619,
390
  "step": 390
391
  },
392
  {
393
+ "epoch": 12.029166666666667,
394
+ "grad_norm": 7.074206352233887,
395
+ "learning_rate": 3.240740740740741e-05,
396
+ "loss": 2.9491,
397
  "step": 400
398
  },
399
  {
400
+ "epoch": 12.032291666666667,
401
+ "eval_accuracy": 0.10702341137123746,
402
+ "eval_loss": 3.3954155445098877,
403
+ "eval_runtime": 154.7501,
404
+ "eval_samples_per_second": 1.932,
405
+ "eval_steps_per_second": 0.065,
406
  "step": 403
407
  },
408
  {
409
+ "epoch": 13.007291666666667,
410
+ "grad_norm": 6.961267471313477,
411
+ "learning_rate": 3.182870370370371e-05,
412
+ "loss": 2.8801,
413
  "step": 410
414
  },
415
  {
416
+ "epoch": 13.017708333333333,
417
+ "grad_norm": 5.816298007965088,
418
+ "learning_rate": 3.125e-05,
419
+ "loss": 2.9025,
420
  "step": 420
421
  },
422
  {
423
+ "epoch": 13.028125,
424
+ "grad_norm": 10.059369087219238,
425
+ "learning_rate": 3.06712962962963e-05,
426
+ "loss": 2.806,
427
  "step": 430
428
  },
429
  {
430
+ "epoch": 13.032291666666667,
431
+ "eval_accuracy": 0.1705685618729097,
432
+ "eval_loss": 3.2551913261413574,
433
+ "eval_runtime": 154.5283,
434
+ "eval_samples_per_second": 1.935,
435
+ "eval_steps_per_second": 0.065,
436
  "step": 434
437
  },
438
  {
439
+ "epoch": 14.00625,
440
+ "grad_norm": 6.142374515533447,
441
+ "learning_rate": 3.0092592592592593e-05,
442
+ "loss": 2.6302,
443
  "step": 440
444
  },
445
  {
446
+ "epoch": 14.016666666666667,
447
+ "grad_norm": 5.806746959686279,
448
+ "learning_rate": 2.951388888888889e-05,
449
+ "loss": 2.5778,
450
  "step": 450
451
  },
452
  {
453
+ "epoch": 14.027083333333334,
454
+ "grad_norm": 6.683168888092041,
455
+ "learning_rate": 2.8935185185185186e-05,
456
+ "loss": 2.4568,
457
  "step": 460
458
  },
459
  {
460
+ "epoch": 14.032291666666667,
461
+ "eval_accuracy": 0.2040133779264214,
462
+ "eval_loss": 3.0654280185699463,
463
+ "eval_runtime": 155.0,
464
+ "eval_samples_per_second": 1.929,
465
+ "eval_steps_per_second": 0.065,
466
  "step": 465
467
  },
468
  {
469
+ "epoch": 15.005208333333334,
470
+ "grad_norm": 8.81658935546875,
471
+ "learning_rate": 2.8356481481481483e-05,
472
+ "loss": 2.4185,
473
  "step": 470
474
  },
475
  {
476
+ "epoch": 15.015625,
477
+ "grad_norm": 17.862884521484375,
478
+ "learning_rate": 2.777777777777778e-05,
479
+ "loss": 2.3865,
480
  "step": 480
481
  },
482
  {
483
+ "epoch": 15.026041666666666,
484
+ "grad_norm": 7.982626438140869,
485
+ "learning_rate": 2.7199074074074076e-05,
486
+ "loss": 2.3102,
487
+ "step": 490
 
 
488
  },
489
  {
490
+ "epoch": 15.032291666666667,
491
+ "eval_accuracy": 0.3010033444816054,
492
+ "eval_loss": 2.7440292835235596,
493
+ "eval_runtime": 154.0941,
494
+ "eval_samples_per_second": 1.94,
495
+ "eval_steps_per_second": 0.065,
496
+ "step": 496
497
  },
498
  {
499
+ "epoch": 16.004166666666666,
500
+ "grad_norm": 7.09642219543457,
501
+ "learning_rate": 2.6620370370370372e-05,
502
+ "loss": 2.1208,
503
+ "step": 500
 
 
504
  },
505
  {
506
+ "epoch": 16.014583333333334,
507
+ "grad_norm": 7.21767520904541,
508
+ "learning_rate": 2.604166666666667e-05,
509
+ "loss": 2.1481,
510
+ "step": 510
511
+ },
512
+ {
513
+ "epoch": 16.025,
514
+ "grad_norm": 8.300311088562012,
515
+ "learning_rate": 2.5462962962962965e-05,
516
+ "loss": 2.2079,
517
+ "step": 520
518
+ },
519
+ {
520
+ "epoch": 16.032291666666666,
521
+ "eval_accuracy": 0.31438127090301005,
522
+ "eval_loss": 2.6789305210113525,
523
+ "eval_runtime": 157.8279,
524
  "eval_samples_per_second": 1.894,
525
  "eval_steps_per_second": 0.063,
526
+ "step": 527
527
+ },
528
+ {
529
+ "epoch": 17.003125,
530
+ "grad_norm": 8.606009483337402,
531
+ "learning_rate": 2.488425925925926e-05,
532
+ "loss": 2.0646,
533
+ "step": 530
534
+ },
535
+ {
536
+ "epoch": 17.013541666666665,
537
+ "grad_norm": 7.775322437286377,
538
+ "learning_rate": 2.4305555555555558e-05,
539
+ "loss": 1.9341,
540
+ "step": 540
541
+ },
542
+ {
543
+ "epoch": 17.023958333333333,
544
+ "grad_norm": 7.849579811096191,
545
+ "learning_rate": 2.3726851851851854e-05,
546
+ "loss": 1.9638,
547
+ "step": 550
548
+ },
549
+ {
550
+ "epoch": 17.032291666666666,
551
+ "eval_accuracy": 0.36789297658862874,
552
+ "eval_loss": 2.5920491218566895,
553
+ "eval_runtime": 157.1641,
554
+ "eval_samples_per_second": 1.902,
555
+ "eval_steps_per_second": 0.064,
556
+ "step": 558
557
+ },
558
+ {
559
+ "epoch": 18.002083333333335,
560
+ "grad_norm": 10.023223876953125,
561
+ "learning_rate": 2.314814814814815e-05,
562
+ "loss": 2.0018,
563
+ "step": 560
564
+ },
565
+ {
566
+ "epoch": 18.0125,
567
+ "grad_norm": 7.027205467224121,
568
+ "learning_rate": 2.2569444444444447e-05,
569
+ "loss": 1.8571,
570
+ "step": 570
571
+ },
572
+ {
573
+ "epoch": 18.022916666666667,
574
+ "grad_norm": 9.41115665435791,
575
+ "learning_rate": 2.1990740740740743e-05,
576
+ "loss": 1.7914,
577
+ "step": 580
578
+ },
579
+ {
580
+ "epoch": 18.032291666666666,
581
+ "eval_accuracy": 0.3377926421404682,
582
+ "eval_loss": 2.6151952743530273,
583
+ "eval_runtime": 162.3006,
584
+ "eval_samples_per_second": 1.842,
585
+ "eval_steps_per_second": 0.062,
586
+ "step": 589
587
+ },
588
+ {
589
+ "epoch": 19.001041666666666,
590
+ "grad_norm": 9.176055908203125,
591
+ "learning_rate": 2.141203703703704e-05,
592
+ "loss": 1.8165,
593
+ "step": 590
594
+ },
595
+ {
596
+ "epoch": 19.011458333333334,
597
+ "grad_norm": 7.396921157836914,
598
+ "learning_rate": 2.0833333333333336e-05,
599
+ "loss": 1.6863,
600
+ "step": 600
601
+ },
602
+ {
603
+ "epoch": 19.021875,
604
+ "grad_norm": 9.088372230529785,
605
+ "learning_rate": 2.0254629629629632e-05,
606
+ "loss": 1.597,
607
+ "step": 610
608
+ },
609
+ {
610
+ "epoch": 19.032291666666666,
611
+ "grad_norm": 16.782529830932617,
612
+ "learning_rate": 1.967592592592593e-05,
613
+ "loss": 1.6925,
614
+ "step": 620
615
+ },
616
+ {
617
+ "epoch": 19.032291666666666,
618
+ "eval_accuracy": 0.34448160535117056,
619
+ "eval_loss": 2.5970685482025146,
620
+ "eval_runtime": 161.5247,
621
+ "eval_samples_per_second": 1.851,
622
+ "eval_steps_per_second": 0.062,
623
+ "step": 620
624
+ },
625
+ {
626
+ "epoch": 20.010416666666668,
627
+ "grad_norm": 10.128305435180664,
628
+ "learning_rate": 1.9097222222222222e-05,
629
+ "loss": 1.6303,
630
+ "step": 630
631
+ },
632
+ {
633
+ "epoch": 20.020833333333332,
634
+ "grad_norm": 8.396921157836914,
635
+ "learning_rate": 1.8518518518518518e-05,
636
+ "loss": 1.6306,
637
+ "step": 640
638
+ },
639
+ {
640
+ "epoch": 20.03125,
641
+ "grad_norm": 8.574676513671875,
642
+ "learning_rate": 1.7939814814814815e-05,
643
+ "loss": 1.5124,
644
+ "step": 650
645
+ },
646
+ {
647
+ "epoch": 20.032291666666666,
648
+ "eval_accuracy": 0.34782608695652173,
649
+ "eval_loss": 2.5766701698303223,
650
+ "eval_runtime": 162.6227,
651
+ "eval_samples_per_second": 1.839,
652
+ "eval_steps_per_second": 0.061,
653
+ "step": 651
654
+ },
655
+ {
656
+ "epoch": 21.009375,
657
+ "grad_norm": 8.482582092285156,
658
+ "learning_rate": 1.736111111111111e-05,
659
+ "loss": 1.5163,
660
+ "step": 660
661
+ },
662
+ {
663
+ "epoch": 21.019791666666666,
664
+ "grad_norm": 7.806921482086182,
665
+ "learning_rate": 1.6782407407407408e-05,
666
+ "loss": 1.4579,
667
+ "step": 670
668
+ },
669
+ {
670
+ "epoch": 21.030208333333334,
671
+ "grad_norm": 10.889120101928711,
672
+ "learning_rate": 1.6203703703703704e-05,
673
+ "loss": 1.4834,
674
+ "step": 680
675
+ },
676
+ {
677
+ "epoch": 21.032291666666666,
678
+ "eval_accuracy": 0.3879598662207358,
679
+ "eval_loss": 2.4438529014587402,
680
+ "eval_runtime": 163.8828,
681
+ "eval_samples_per_second": 1.824,
682
+ "eval_steps_per_second": 0.061,
683
+ "step": 682
684
+ },
685
+ {
686
+ "epoch": 22.008333333333333,
687
+ "grad_norm": 8.548680305480957,
688
+ "learning_rate": 1.5625e-05,
689
+ "loss": 1.3913,
690
+ "step": 690
691
+ },
692
+ {
693
+ "epoch": 22.01875,
694
+ "grad_norm": 10.439925193786621,
695
+ "learning_rate": 1.5046296296296297e-05,
696
+ "loss": 1.4161,
697
+ "step": 700
698
+ },
699
+ {
700
+ "epoch": 22.029166666666665,
701
+ "grad_norm": 8.317498207092285,
702
+ "learning_rate": 1.4467592592592593e-05,
703
+ "loss": 1.4565,
704
+ "step": 710
705
+ },
706
+ {
707
+ "epoch": 22.032291666666666,
708
+ "eval_accuracy": 0.38461538461538464,
709
+ "eval_loss": 2.405748128890991,
710
+ "eval_runtime": 162.732,
711
+ "eval_samples_per_second": 1.837,
712
+ "eval_steps_per_second": 0.061,
713
+ "step": 713
714
+ },
715
+ {
716
+ "epoch": 23.007291666666667,
717
+ "grad_norm": 8.510457038879395,
718
+ "learning_rate": 1.388888888888889e-05,
719
+ "loss": 1.4043,
720
+ "step": 720
721
+ },
722
+ {
723
+ "epoch": 23.017708333333335,
724
+ "grad_norm": 9.268413543701172,
725
+ "learning_rate": 1.3310185185185186e-05,
726
+ "loss": 1.2734,
727
+ "step": 730
728
+ },
729
+ {
730
+ "epoch": 23.028125,
731
+ "grad_norm": 8.000787734985352,
732
+ "learning_rate": 1.2731481481481482e-05,
733
+ "loss": 1.279,
734
+ "step": 740
735
+ },
736
+ {
737
+ "epoch": 23.032291666666666,
738
+ "eval_accuracy": 0.35451505016722407,
739
+ "eval_loss": 2.550072193145752,
740
+ "eval_runtime": 160.4305,
741
+ "eval_samples_per_second": 1.864,
742
+ "eval_steps_per_second": 0.062,
743
+ "step": 744
744
+ },
745
+ {
746
+ "epoch": 24.00625,
747
+ "grad_norm": 8.658973693847656,
748
+ "learning_rate": 1.2152777777777779e-05,
749
+ "loss": 1.1963,
750
+ "step": 750
751
+ },
752
+ {
753
+ "epoch": 24.016666666666666,
754
+ "grad_norm": 7.558023929595947,
755
+ "learning_rate": 1.1574074074074075e-05,
756
+ "loss": 1.2601,
757
+ "step": 760
758
+ },
759
+ {
760
+ "epoch": 24.027083333333334,
761
+ "grad_norm": 9.91243839263916,
762
+ "learning_rate": 1.0995370370370372e-05,
763
+ "loss": 1.1477,
764
+ "step": 770
765
+ },
766
+ {
767
+ "epoch": 24.032291666666666,
768
+ "eval_accuracy": 0.44816053511705684,
769
+ "eval_loss": 2.3246614933013916,
770
+ "eval_runtime": 157.9743,
771
+ "eval_samples_per_second": 1.893,
772
+ "eval_steps_per_second": 0.063,
773
+ "step": 775
774
+ },
775
+ {
776
+ "epoch": 25.005208333333332,
777
+ "grad_norm": 10.049819946289062,
778
+ "learning_rate": 1.0416666666666668e-05,
779
+ "loss": 1.1727,
780
+ "step": 780
781
+ },
782
+ {
783
+ "epoch": 25.015625,
784
+ "grad_norm": 8.463665008544922,
785
+ "learning_rate": 9.837962962962964e-06,
786
+ "loss": 1.1207,
787
+ "step": 790
788
+ },
789
+ {
790
+ "epoch": 25.026041666666668,
791
+ "grad_norm": 7.52623176574707,
792
+ "learning_rate": 9.259259259259259e-06,
793
+ "loss": 1.2573,
794
+ "step": 800
795
+ },
796
+ {
797
+ "epoch": 25.032291666666666,
798
+ "eval_accuracy": 0.4882943143812709,
799
+ "eval_loss": 2.1776490211486816,
800
+ "eval_runtime": 158.1033,
801
+ "eval_samples_per_second": 1.891,
802
+ "eval_steps_per_second": 0.063,
803
+ "step": 806
804
+ },
805
+ {
806
+ "epoch": 26.004166666666666,
807
+ "grad_norm": 9.336162567138672,
808
+ "learning_rate": 8.680555555555556e-06,
809
+ "loss": 1.1791,
810
+ "step": 810
811
+ },
812
+ {
813
+ "epoch": 26.014583333333334,
814
+ "grad_norm": 6.728664875030518,
815
+ "learning_rate": 8.101851851851852e-06,
816
+ "loss": 1.1284,
817
+ "step": 820
818
+ },
819
+ {
820
+ "epoch": 26.025,
821
+ "grad_norm": 7.307468414306641,
822
+ "learning_rate": 7.523148148148148e-06,
823
+ "loss": 1.0825,
824
+ "step": 830
825
+ },
826
+ {
827
+ "epoch": 26.032291666666666,
828
+ "eval_accuracy": 0.4782608695652174,
829
+ "eval_loss": 2.14430832862854,
830
+ "eval_runtime": 156.854,
831
+ "eval_samples_per_second": 1.906,
832
+ "eval_steps_per_second": 0.064,
833
+ "step": 837
834
+ },
835
+ {
836
+ "epoch": 27.003125,
837
+ "grad_norm": 8.843693733215332,
838
+ "learning_rate": 6.944444444444445e-06,
839
+ "loss": 1.1431,
840
+ "step": 840
841
+ },
842
+ {
843
+ "epoch": 27.013541666666665,
844
+ "grad_norm": 9.648551940917969,
845
+ "learning_rate": 6.365740740740741e-06,
846
+ "loss": 1.0873,
847
+ "step": 850
848
+ },
849
+ {
850
+ "epoch": 27.023958333333333,
851
+ "grad_norm": 10.551318168640137,
852
+ "learning_rate": 5.787037037037038e-06,
853
+ "loss": 1.2121,
854
+ "step": 860
855
+ },
856
+ {
857
+ "epoch": 27.032291666666666,
858
+ "eval_accuracy": 0.4782608695652174,
859
+ "eval_loss": 2.149031639099121,
860
+ "eval_runtime": 156.4789,
861
+ "eval_samples_per_second": 1.911,
862
+ "eval_steps_per_second": 0.064,
863
+ "step": 868
864
+ },
865
+ {
866
+ "epoch": 28.002083333333335,
867
+ "grad_norm": 7.037572383880615,
868
+ "learning_rate": 5.208333333333334e-06,
869
+ "loss": 1.0147,
870
+ "step": 870
871
+ },
872
+ {
873
+ "epoch": 28.0125,
874
+ "grad_norm": 11.031582832336426,
875
+ "learning_rate": 4.6296296296296296e-06,
876
+ "loss": 1.0574,
877
+ "step": 880
878
+ },
879
+ {
880
+ "epoch": 28.022916666666667,
881
+ "grad_norm": 7.408577919006348,
882
+ "learning_rate": 4.050925925925926e-06,
883
+ "loss": 1.0887,
884
+ "step": 890
885
+ },
886
+ {
887
+ "epoch": 28.032291666666666,
888
+ "eval_accuracy": 0.47157190635451507,
889
+ "eval_loss": 2.151550054550171,
890
+ "eval_runtime": 157.7417,
891
+ "eval_samples_per_second": 1.896,
892
+ "eval_steps_per_second": 0.063,
893
+ "step": 899
894
+ },
895
+ {
896
+ "epoch": 29.001041666666666,
897
+ "grad_norm": 7.416742324829102,
898
+ "learning_rate": 3.4722222222222224e-06,
899
+ "loss": 1.1448,
900
+ "step": 900
901
+ },
902
+ {
903
+ "epoch": 29.011458333333334,
904
+ "grad_norm": 8.579631805419922,
905
+ "learning_rate": 2.893518518518519e-06,
906
+ "loss": 1.051,
907
+ "step": 910
908
+ },
909
+ {
910
+ "epoch": 29.021875,
911
+ "grad_norm": 7.20076847076416,
912
+ "learning_rate": 2.3148148148148148e-06,
913
+ "loss": 0.9936,
914
+ "step": 920
915
+ },
916
+ {
917
+ "epoch": 29.032291666666666,
918
+ "grad_norm": 20.40438461303711,
919
+ "learning_rate": 1.7361111111111112e-06,
920
+ "loss": 1.1127,
921
+ "step": 930
922
+ },
923
+ {
924
+ "epoch": 29.032291666666666,
925
+ "eval_accuracy": 0.4882943143812709,
926
+ "eval_loss": 2.1050899028778076,
927
+ "eval_runtime": 158.1493,
928
+ "eval_samples_per_second": 1.891,
929
+ "eval_steps_per_second": 0.063,
930
+ "step": 930
931
+ },
932
+ {
933
+ "epoch": 30.010416666666668,
934
+ "grad_norm": 7.520814895629883,
935
+ "learning_rate": 1.1574074074074074e-06,
936
+ "loss": 1.0214,
937
+ "step": 940
938
+ },
939
+ {
940
+ "epoch": 30.020833333333332,
941
+ "grad_norm": 7.830852508544922,
942
+ "learning_rate": 5.787037037037037e-07,
943
+ "loss": 0.9516,
944
+ "step": 950
945
+ },
946
+ {
947
+ "epoch": 30.03125,
948
+ "grad_norm": 8.230799674987793,
949
+ "learning_rate": 0.0,
950
+ "loss": 0.9905,
951
+ "step": 960
952
+ },
953
+ {
954
+ "epoch": 30.03125,
955
+ "eval_accuracy": 0.4816053511705686,
956
+ "eval_loss": 2.117016553878784,
957
+ "eval_runtime": 175.2626,
958
+ "eval_samples_per_second": 1.706,
959
+ "eval_steps_per_second": 0.057,
960
+ "step": 960
961
+ },
962
+ {
963
+ "epoch": 30.03125,
964
+ "step": 960,
965
+ "total_flos": 3.733004678582315e+19,
966
+ "train_loss": 2.590478341778119,
967
+ "train_runtime": 22304.5183,
968
+ "train_samples_per_second": 1.377,
969
+ "train_steps_per_second": 0.043
970
+ },
971
+ {
972
+ "epoch": 30.03125,
973
+ "eval_accuracy": 0.4882943143812709,
974
+ "eval_loss": 2.1814045906066895,
975
+ "eval_runtime": 181.2056,
976
+ "eval_samples_per_second": 1.65,
977
+ "eval_steps_per_second": 0.055,
978
+ "step": 960
979
+ },
980
+ {
981
+ "epoch": 30.03125,
982
+ "eval_accuracy": 0.4882943143812709,
983
+ "eval_loss": 2.180596113204956,
984
+ "eval_runtime": 179.6038,
985
+ "eval_samples_per_second": 1.665,
986
+ "eval_steps_per_second": 0.056,
987
+ "step": 960
988
  }
989
  ],
990
  "logging_steps": 10,
991
+ "max_steps": 960,
992
  "num_input_tokens_seen": 0,
993
  "num_train_epochs": 9223372036854775807,
994
  "save_steps": 500,
 
1004
  "attributes": {}
1005
  }
1006
  },
1007
+ "total_flos": 3.733004678582315e+19,
1008
  "train_batch_size": 32,
1009
  "trial_name": null,
1010
  "trial_params": null