jayanthspratap commited on
Commit
825abf7
·
verified ·
1 Parent(s): baef539

End of training

Browse files
README.md CHANGED
@@ -30,7 +30,7 @@ should probably proofread and complete it, then remove this comment. -->
30
 
31
  This model was trained from scratch on the imagefolder dataset.
32
  It achieves the following results on the evaluation set:
33
- - Loss: 0.5502
34
  - Accuracy: 0.75
35
 
36
  ## Model description
 
30
 
31
  This model was trained from scratch on the imagefolder dataset.
32
  It achieves the following results on the evaluation set:
33
+ - Loss: 0.6787
34
  - Accuracy: 0.75
35
 
36
  ## Model description
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 29.76,
3
+ "eval_accuracy": 0.75,
4
+ "eval_loss": 0.6786516308784485,
5
+ "eval_runtime": 1.1783,
6
+ "eval_samples_per_second": 27.158,
7
+ "eval_steps_per_second": 27.158,
8
+ "total_flos": 3.8132430847082496e+17,
9
+ "train_loss": 0.5704158216394404,
10
+ "train_runtime": 536.3007,
11
+ "train_samples_per_second": 6.992,
12
+ "train_steps_per_second": 1.734
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 29.76,
3
+ "eval_accuracy": 0.75,
4
+ "eval_loss": 0.6786516308784485,
5
+ "eval_runtime": 1.1783,
6
+ "eval_samples_per_second": 27.158,
7
+ "eval_steps_per_second": 27.158
8
+ }
runs/Aug13_14-22-46_EMIMDGXA100GPU1/events.out.tfevents.1723573954.EMIMDGXA100GPU1.133133.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9a3923e0449843d9ab38f585bdb0ae2b8bb431dd7cc6757b26adc36ee56663e
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 29.76,
3
+ "total_flos": 3.8132430847082496e+17,
4
+ "train_loss": 0.5704158216394404,
5
+ "train_runtime": 536.3007,
6
+ "train_samples_per_second": 6.992,
7
+ "train_steps_per_second": 1.734
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,963 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.75,
3
+ "best_model_checkpoint": "2024_08_13/checkpoint-62",
4
+ "epoch": 29.76,
5
+ "eval_steps": 500,
6
+ "global_step": 930,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.32,
13
+ "grad_norm": 10.543134689331055,
14
+ "learning_rate": 1.0752688172043011e-07,
15
+ "loss": 0.7672,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.64,
20
+ "grad_norm": 3.446369171142578,
21
+ "learning_rate": 2.1505376344086022e-07,
22
+ "loss": 0.7681,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.96,
27
+ "grad_norm": 11.52937126159668,
28
+ "learning_rate": 3.225806451612903e-07,
29
+ "loss": 0.7191,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.992,
34
+ "eval_accuracy": 0.25,
35
+ "eval_loss": 0.7458651065826416,
36
+ "eval_runtime": 1.1044,
37
+ "eval_samples_per_second": 28.975,
38
+ "eval_steps_per_second": 28.975,
39
+ "step": 31
40
+ },
41
+ {
42
+ "epoch": 1.28,
43
+ "grad_norm": 5.57147741317749,
44
+ "learning_rate": 4.3010752688172043e-07,
45
+ "loss": 0.7269,
46
+ "step": 40
47
+ },
48
+ {
49
+ "epoch": 1.6,
50
+ "grad_norm": 5.154497146606445,
51
+ "learning_rate": 5.376344086021505e-07,
52
+ "loss": 0.7094,
53
+ "step": 50
54
+ },
55
+ {
56
+ "epoch": 1.92,
57
+ "grad_norm": 7.2774434089660645,
58
+ "learning_rate": 6.451612903225806e-07,
59
+ "loss": 0.6894,
60
+ "step": 60
61
+ },
62
+ {
63
+ "epoch": 1.984,
64
+ "eval_accuracy": 0.75,
65
+ "eval_loss": 0.6786516308784485,
66
+ "eval_runtime": 1.183,
67
+ "eval_samples_per_second": 27.05,
68
+ "eval_steps_per_second": 27.05,
69
+ "step": 62
70
+ },
71
+ {
72
+ "epoch": 2.24,
73
+ "grad_norm": 3.891305923461914,
74
+ "learning_rate": 7.526881720430107e-07,
75
+ "loss": 0.6601,
76
+ "step": 70
77
+ },
78
+ {
79
+ "epoch": 2.56,
80
+ "grad_norm": 4.829281330108643,
81
+ "learning_rate": 8.602150537634409e-07,
82
+ "loss": 0.6542,
83
+ "step": 80
84
+ },
85
+ {
86
+ "epoch": 2.88,
87
+ "grad_norm": 4.894190311431885,
88
+ "learning_rate": 9.67741935483871e-07,
89
+ "loss": 0.5993,
90
+ "step": 90
91
+ },
92
+ {
93
+ "epoch": 2.976,
94
+ "eval_accuracy": 0.75,
95
+ "eval_loss": 0.6089950799942017,
96
+ "eval_runtime": 1.1112,
97
+ "eval_samples_per_second": 28.798,
98
+ "eval_steps_per_second": 28.798,
99
+ "step": 93
100
+ },
101
+ {
102
+ "epoch": 3.2,
103
+ "grad_norm": 3.572125196456909,
104
+ "learning_rate": 9.91636798088411e-07,
105
+ "loss": 0.5845,
106
+ "step": 100
107
+ },
108
+ {
109
+ "epoch": 3.52,
110
+ "grad_norm": 6.358020305633545,
111
+ "learning_rate": 9.79689366786141e-07,
112
+ "loss": 0.5637,
113
+ "step": 110
114
+ },
115
+ {
116
+ "epoch": 3.84,
117
+ "grad_norm": 2.5889291763305664,
118
+ "learning_rate": 9.67741935483871e-07,
119
+ "loss": 0.5858,
120
+ "step": 120
121
+ },
122
+ {
123
+ "epoch": 4.0,
124
+ "eval_accuracy": 0.75,
125
+ "eval_loss": 0.5701560974121094,
126
+ "eval_runtime": 1.1543,
127
+ "eval_samples_per_second": 27.722,
128
+ "eval_steps_per_second": 27.722,
129
+ "step": 125
130
+ },
131
+ {
132
+ "epoch": 4.16,
133
+ "grad_norm": 5.341928482055664,
134
+ "learning_rate": 9.557945041816009e-07,
135
+ "loss": 0.559,
136
+ "step": 130
137
+ },
138
+ {
139
+ "epoch": 4.48,
140
+ "grad_norm": 5.699573993682861,
141
+ "learning_rate": 9.438470728793309e-07,
142
+ "loss": 0.4752,
143
+ "step": 140
144
+ },
145
+ {
146
+ "epoch": 4.8,
147
+ "grad_norm": 6.19121789932251,
148
+ "learning_rate": 9.318996415770609e-07,
149
+ "loss": 0.5407,
150
+ "step": 150
151
+ },
152
+ {
153
+ "epoch": 4.992,
154
+ "eval_accuracy": 0.75,
155
+ "eval_loss": 0.5572408437728882,
156
+ "eval_runtime": 1.1052,
157
+ "eval_samples_per_second": 28.954,
158
+ "eval_steps_per_second": 28.954,
159
+ "step": 156
160
+ },
161
+ {
162
+ "epoch": 5.12,
163
+ "grad_norm": 2.574436664581299,
164
+ "learning_rate": 9.199522102747909e-07,
165
+ "loss": 0.6231,
166
+ "step": 160
167
+ },
168
+ {
169
+ "epoch": 5.44,
170
+ "grad_norm": 5.261692523956299,
171
+ "learning_rate": 9.080047789725208e-07,
172
+ "loss": 0.4838,
173
+ "step": 170
174
+ },
175
+ {
176
+ "epoch": 5.76,
177
+ "grad_norm": 5.266414642333984,
178
+ "learning_rate": 8.960573476702509e-07,
179
+ "loss": 0.6552,
180
+ "step": 180
181
+ },
182
+ {
183
+ "epoch": 5.984,
184
+ "eval_accuracy": 0.75,
185
+ "eval_loss": 0.5552529692649841,
186
+ "eval_runtime": 1.1351,
187
+ "eval_samples_per_second": 28.192,
188
+ "eval_steps_per_second": 28.192,
189
+ "step": 187
190
+ },
191
+ {
192
+ "epoch": 6.08,
193
+ "grad_norm": 5.3043341636657715,
194
+ "learning_rate": 8.841099163679809e-07,
195
+ "loss": 0.5321,
196
+ "step": 190
197
+ },
198
+ {
199
+ "epoch": 6.4,
200
+ "grad_norm": 3.8107211589813232,
201
+ "learning_rate": 8.721624850657109e-07,
202
+ "loss": 0.5797,
203
+ "step": 200
204
+ },
205
+ {
206
+ "epoch": 6.72,
207
+ "grad_norm": 13.660761833190918,
208
+ "learning_rate": 8.602150537634409e-07,
209
+ "loss": 0.5562,
210
+ "step": 210
211
+ },
212
+ {
213
+ "epoch": 6.976,
214
+ "eval_accuracy": 0.75,
215
+ "eval_loss": 0.552901029586792,
216
+ "eval_runtime": 1.1121,
217
+ "eval_samples_per_second": 28.774,
218
+ "eval_steps_per_second": 28.774,
219
+ "step": 218
220
+ },
221
+ {
222
+ "epoch": 7.04,
223
+ "grad_norm": 5.3051605224609375,
224
+ "learning_rate": 8.482676224611708e-07,
225
+ "loss": 0.4844,
226
+ "step": 220
227
+ },
228
+ {
229
+ "epoch": 7.36,
230
+ "grad_norm": 3.601945400238037,
231
+ "learning_rate": 8.363201911589009e-07,
232
+ "loss": 0.6067,
233
+ "step": 230
234
+ },
235
+ {
236
+ "epoch": 7.68,
237
+ "grad_norm": 5.2441229820251465,
238
+ "learning_rate": 8.243727598566307e-07,
239
+ "loss": 0.519,
240
+ "step": 240
241
+ },
242
+ {
243
+ "epoch": 8.0,
244
+ "grad_norm": 8.390249252319336,
245
+ "learning_rate": 8.124253285543607e-07,
246
+ "loss": 0.6054,
247
+ "step": 250
248
+ },
249
+ {
250
+ "epoch": 8.0,
251
+ "eval_accuracy": 0.75,
252
+ "eval_loss": 0.5519319772720337,
253
+ "eval_runtime": 1.1259,
254
+ "eval_samples_per_second": 28.423,
255
+ "eval_steps_per_second": 28.423,
256
+ "step": 250
257
+ },
258
+ {
259
+ "epoch": 8.32,
260
+ "grad_norm": 4.872560024261475,
261
+ "learning_rate": 8.004778972520908e-07,
262
+ "loss": 0.4244,
263
+ "step": 260
264
+ },
265
+ {
266
+ "epoch": 8.64,
267
+ "grad_norm": 4.932515621185303,
268
+ "learning_rate": 7.885304659498207e-07,
269
+ "loss": 0.501,
270
+ "step": 270
271
+ },
272
+ {
273
+ "epoch": 8.96,
274
+ "grad_norm": 10.174718856811523,
275
+ "learning_rate": 7.765830346475507e-07,
276
+ "loss": 0.7563,
277
+ "step": 280
278
+ },
279
+ {
280
+ "epoch": 8.992,
281
+ "eval_accuracy": 0.75,
282
+ "eval_loss": 0.5517733097076416,
283
+ "eval_runtime": 1.1268,
284
+ "eval_samples_per_second": 28.399,
285
+ "eval_steps_per_second": 28.399,
286
+ "step": 281
287
+ },
288
+ {
289
+ "epoch": 9.28,
290
+ "grad_norm": 10.531723022460938,
291
+ "learning_rate": 7.646356033452807e-07,
292
+ "loss": 0.7409,
293
+ "step": 290
294
+ },
295
+ {
296
+ "epoch": 9.6,
297
+ "grad_norm": 5.187292098999023,
298
+ "learning_rate": 7.526881720430107e-07,
299
+ "loss": 0.4602,
300
+ "step": 300
301
+ },
302
+ {
303
+ "epoch": 9.92,
304
+ "grad_norm": 5.227697849273682,
305
+ "learning_rate": 7.407407407407406e-07,
306
+ "loss": 0.5174,
307
+ "step": 310
308
+ },
309
+ {
310
+ "epoch": 9.984,
311
+ "eval_accuracy": 0.75,
312
+ "eval_loss": 0.5523006319999695,
313
+ "eval_runtime": 1.1419,
314
+ "eval_samples_per_second": 28.023,
315
+ "eval_steps_per_second": 28.023,
316
+ "step": 312
317
+ },
318
+ {
319
+ "epoch": 10.24,
320
+ "grad_norm": 3.8940961360931396,
321
+ "learning_rate": 7.287933094384707e-07,
322
+ "loss": 0.4655,
323
+ "step": 320
324
+ },
325
+ {
326
+ "epoch": 10.56,
327
+ "grad_norm": 9.157276153564453,
328
+ "learning_rate": 7.168458781362007e-07,
329
+ "loss": 0.8298,
330
+ "step": 330
331
+ },
332
+ {
333
+ "epoch": 10.88,
334
+ "grad_norm": 6.260276794433594,
335
+ "learning_rate": 7.048984468339306e-07,
336
+ "loss": 0.3765,
337
+ "step": 340
338
+ },
339
+ {
340
+ "epoch": 10.975999999999999,
341
+ "eval_accuracy": 0.75,
342
+ "eval_loss": 0.5513983964920044,
343
+ "eval_runtime": 1.3718,
344
+ "eval_samples_per_second": 23.327,
345
+ "eval_steps_per_second": 23.327,
346
+ "step": 343
347
+ },
348
+ {
349
+ "epoch": 11.2,
350
+ "grad_norm": 6.0307416915893555,
351
+ "learning_rate": 6.929510155316607e-07,
352
+ "loss": 0.409,
353
+ "step": 350
354
+ },
355
+ {
356
+ "epoch": 11.52,
357
+ "grad_norm": 7.104811191558838,
358
+ "learning_rate": 6.810035842293906e-07,
359
+ "loss": 0.5515,
360
+ "step": 360
361
+ },
362
+ {
363
+ "epoch": 11.84,
364
+ "grad_norm": 1.7931679487228394,
365
+ "learning_rate": 6.690561529271206e-07,
366
+ "loss": 0.5727,
367
+ "step": 370
368
+ },
369
+ {
370
+ "epoch": 12.0,
371
+ "eval_accuracy": 0.75,
372
+ "eval_loss": 0.5506787300109863,
373
+ "eval_runtime": 1.1592,
374
+ "eval_samples_per_second": 27.606,
375
+ "eval_steps_per_second": 27.606,
376
+ "step": 375
377
+ },
378
+ {
379
+ "epoch": 12.16,
380
+ "grad_norm": 15.06185245513916,
381
+ "learning_rate": 6.571087216248506e-07,
382
+ "loss": 0.6646,
383
+ "step": 380
384
+ },
385
+ {
386
+ "epoch": 12.48,
387
+ "grad_norm": 4.5719828605651855,
388
+ "learning_rate": 6.451612903225806e-07,
389
+ "loss": 0.6321,
390
+ "step": 390
391
+ },
392
+ {
393
+ "epoch": 12.8,
394
+ "grad_norm": 10.500142097473145,
395
+ "learning_rate": 6.332138590203107e-07,
396
+ "loss": 0.5613,
397
+ "step": 400
398
+ },
399
+ {
400
+ "epoch": 12.992,
401
+ "eval_accuracy": 0.75,
402
+ "eval_loss": 0.5510138273239136,
403
+ "eval_runtime": 1.1313,
404
+ "eval_samples_per_second": 28.287,
405
+ "eval_steps_per_second": 28.287,
406
+ "step": 406
407
+ },
408
+ {
409
+ "epoch": 13.12,
410
+ "grad_norm": 3.0991406440734863,
411
+ "learning_rate": 6.212664277180406e-07,
412
+ "loss": 0.3966,
413
+ "step": 410
414
+ },
415
+ {
416
+ "epoch": 13.44,
417
+ "grad_norm": 2.3058762550354004,
418
+ "learning_rate": 6.093189964157706e-07,
419
+ "loss": 0.5845,
420
+ "step": 420
421
+ },
422
+ {
423
+ "epoch": 13.76,
424
+ "grad_norm": 2.215249538421631,
425
+ "learning_rate": 5.973715651135006e-07,
426
+ "loss": 0.568,
427
+ "step": 430
428
+ },
429
+ {
430
+ "epoch": 13.984,
431
+ "eval_accuracy": 0.75,
432
+ "eval_loss": 0.5510228872299194,
433
+ "eval_runtime": 1.3316,
434
+ "eval_samples_per_second": 24.031,
435
+ "eval_steps_per_second": 24.031,
436
+ "step": 437
437
+ },
438
+ {
439
+ "epoch": 14.08,
440
+ "grad_norm": 2.978492021560669,
441
+ "learning_rate": 5.854241338112306e-07,
442
+ "loss": 0.5611,
443
+ "step": 440
444
+ },
445
+ {
446
+ "epoch": 14.4,
447
+ "grad_norm": 3.9042763710021973,
448
+ "learning_rate": 5.734767025089605e-07,
449
+ "loss": 0.4335,
450
+ "step": 450
451
+ },
452
+ {
453
+ "epoch": 14.72,
454
+ "grad_norm": 8.8019380569458,
455
+ "learning_rate": 5.615292712066906e-07,
456
+ "loss": 0.6655,
457
+ "step": 460
458
+ },
459
+ {
460
+ "epoch": 14.975999999999999,
461
+ "eval_accuracy": 0.75,
462
+ "eval_loss": 0.5513969659805298,
463
+ "eval_runtime": 1.1115,
464
+ "eval_samples_per_second": 28.791,
465
+ "eval_steps_per_second": 28.791,
466
+ "step": 468
467
+ },
468
+ {
469
+ "epoch": 15.04,
470
+ "grad_norm": 3.463810920715332,
471
+ "learning_rate": 5.495818399044206e-07,
472
+ "loss": 0.5628,
473
+ "step": 470
474
+ },
475
+ {
476
+ "epoch": 15.36,
477
+ "grad_norm": 1.9772050380706787,
478
+ "learning_rate": 5.376344086021505e-07,
479
+ "loss": 0.504,
480
+ "step": 480
481
+ },
482
+ {
483
+ "epoch": 15.68,
484
+ "grad_norm": 2.6561172008514404,
485
+ "learning_rate": 5.256869772998806e-07,
486
+ "loss": 0.7277,
487
+ "step": 490
488
+ },
489
+ {
490
+ "epoch": 16.0,
491
+ "grad_norm": 5.404987335205078,
492
+ "learning_rate": 5.137395459976105e-07,
493
+ "loss": 0.4883,
494
+ "step": 500
495
+ },
496
+ {
497
+ "epoch": 16.0,
498
+ "eval_accuracy": 0.75,
499
+ "eval_loss": 0.5522246956825256,
500
+ "eval_runtime": 1.157,
501
+ "eval_samples_per_second": 27.658,
502
+ "eval_steps_per_second": 27.658,
503
+ "step": 500
504
+ },
505
+ {
506
+ "epoch": 16.32,
507
+ "grad_norm": 4.906336307525635,
508
+ "learning_rate": 5.017921146953405e-07,
509
+ "loss": 0.4576,
510
+ "step": 510
511
+ },
512
+ {
513
+ "epoch": 16.64,
514
+ "grad_norm": 3.543666124343872,
515
+ "learning_rate": 4.898446833930704e-07,
516
+ "loss": 0.687,
517
+ "step": 520
518
+ },
519
+ {
520
+ "epoch": 16.96,
521
+ "grad_norm": 5.162899017333984,
522
+ "learning_rate": 4.778972520908004e-07,
523
+ "loss": 0.5317,
524
+ "step": 530
525
+ },
526
+ {
527
+ "epoch": 16.992,
528
+ "eval_accuracy": 0.75,
529
+ "eval_loss": 0.5518386960029602,
530
+ "eval_runtime": 1.1469,
531
+ "eval_samples_per_second": 27.901,
532
+ "eval_steps_per_second": 27.901,
533
+ "step": 531
534
+ },
535
+ {
536
+ "epoch": 17.28,
537
+ "grad_norm": 3.6676950454711914,
538
+ "learning_rate": 4.6594982078853044e-07,
539
+ "loss": 0.5024,
540
+ "step": 540
541
+ },
542
+ {
543
+ "epoch": 17.6,
544
+ "grad_norm": 10.735907554626465,
545
+ "learning_rate": 4.540023894862604e-07,
546
+ "loss": 0.6743,
547
+ "step": 550
548
+ },
549
+ {
550
+ "epoch": 17.92,
551
+ "grad_norm": 4.575161457061768,
552
+ "learning_rate": 4.4205495818399044e-07,
553
+ "loss": 0.4501,
554
+ "step": 560
555
+ },
556
+ {
557
+ "epoch": 17.984,
558
+ "eval_accuracy": 0.75,
559
+ "eval_loss": 0.5519962906837463,
560
+ "eval_runtime": 1.0972,
561
+ "eval_samples_per_second": 29.165,
562
+ "eval_steps_per_second": 29.165,
563
+ "step": 562
564
+ },
565
+ {
566
+ "epoch": 18.24,
567
+ "grad_norm": 6.966436862945557,
568
+ "learning_rate": 4.3010752688172043e-07,
569
+ "loss": 0.7276,
570
+ "step": 570
571
+ },
572
+ {
573
+ "epoch": 18.56,
574
+ "grad_norm": 5.1026763916015625,
575
+ "learning_rate": 4.1816009557945043e-07,
576
+ "loss": 0.4801,
577
+ "step": 580
578
+ },
579
+ {
580
+ "epoch": 18.88,
581
+ "grad_norm": 6.751893043518066,
582
+ "learning_rate": 4.0621266427718037e-07,
583
+ "loss": 0.4616,
584
+ "step": 590
585
+ },
586
+ {
587
+ "epoch": 18.976,
588
+ "eval_accuracy": 0.75,
589
+ "eval_loss": 0.551902174949646,
590
+ "eval_runtime": 1.27,
591
+ "eval_samples_per_second": 25.197,
592
+ "eval_steps_per_second": 25.197,
593
+ "step": 593
594
+ },
595
+ {
596
+ "epoch": 19.2,
597
+ "grad_norm": 5.350219249725342,
598
+ "learning_rate": 3.9426523297491037e-07,
599
+ "loss": 0.4631,
600
+ "step": 600
601
+ },
602
+ {
603
+ "epoch": 19.52,
604
+ "grad_norm": 5.136310577392578,
605
+ "learning_rate": 3.8231780167264037e-07,
606
+ "loss": 0.5746,
607
+ "step": 610
608
+ },
609
+ {
610
+ "epoch": 19.84,
611
+ "grad_norm": 4.849793910980225,
612
+ "learning_rate": 3.703703703703703e-07,
613
+ "loss": 0.4522,
614
+ "step": 620
615
+ },
616
+ {
617
+ "epoch": 20.0,
618
+ "eval_accuracy": 0.75,
619
+ "eval_loss": 0.5509653091430664,
620
+ "eval_runtime": 1.1115,
621
+ "eval_samples_per_second": 28.789,
622
+ "eval_steps_per_second": 28.789,
623
+ "step": 625
624
+ },
625
+ {
626
+ "epoch": 20.16,
627
+ "grad_norm": 8.096334457397461,
628
+ "learning_rate": 3.5842293906810036e-07,
629
+ "loss": 0.66,
630
+ "step": 630
631
+ },
632
+ {
633
+ "epoch": 20.48,
634
+ "grad_norm": 11.139561653137207,
635
+ "learning_rate": 3.4647550776583036e-07,
636
+ "loss": 0.4573,
637
+ "step": 640
638
+ },
639
+ {
640
+ "epoch": 20.8,
641
+ "grad_norm": 5.0489583015441895,
642
+ "learning_rate": 3.345280764635603e-07,
643
+ "loss": 0.6326,
644
+ "step": 650
645
+ },
646
+ {
647
+ "epoch": 20.992,
648
+ "eval_accuracy": 0.75,
649
+ "eval_loss": 0.5507452487945557,
650
+ "eval_runtime": 1.2933,
651
+ "eval_samples_per_second": 24.742,
652
+ "eval_steps_per_second": 24.742,
653
+ "step": 656
654
+ },
655
+ {
656
+ "epoch": 21.12,
657
+ "grad_norm": 12.198716163635254,
658
+ "learning_rate": 3.225806451612903e-07,
659
+ "loss": 0.7282,
660
+ "step": 660
661
+ },
662
+ {
663
+ "epoch": 21.44,
664
+ "grad_norm": 4.501183986663818,
665
+ "learning_rate": 3.106332138590203e-07,
666
+ "loss": 0.51,
667
+ "step": 670
668
+ },
669
+ {
670
+ "epoch": 21.76,
671
+ "grad_norm": 5.399625778198242,
672
+ "learning_rate": 2.986857825567503e-07,
673
+ "loss": 0.3828,
674
+ "step": 680
675
+ },
676
+ {
677
+ "epoch": 21.984,
678
+ "eval_accuracy": 0.75,
679
+ "eval_loss": 0.5508217811584473,
680
+ "eval_runtime": 1.1768,
681
+ "eval_samples_per_second": 27.192,
682
+ "eval_steps_per_second": 27.192,
683
+ "step": 687
684
+ },
685
+ {
686
+ "epoch": 22.08,
687
+ "grad_norm": 3.282414436340332,
688
+ "learning_rate": 2.8673835125448024e-07,
689
+ "loss": 0.6789,
690
+ "step": 690
691
+ },
692
+ {
693
+ "epoch": 22.4,
694
+ "grad_norm": 4.816540718078613,
695
+ "learning_rate": 2.747909199522103e-07,
696
+ "loss": 0.5746,
697
+ "step": 700
698
+ },
699
+ {
700
+ "epoch": 22.72,
701
+ "grad_norm": 3.5417306423187256,
702
+ "learning_rate": 2.628434886499403e-07,
703
+ "loss": 0.4283,
704
+ "step": 710
705
+ },
706
+ {
707
+ "epoch": 22.976,
708
+ "eval_accuracy": 0.75,
709
+ "eval_loss": 0.5509472489356995,
710
+ "eval_runtime": 1.2114,
711
+ "eval_samples_per_second": 26.417,
712
+ "eval_steps_per_second": 26.417,
713
+ "step": 718
714
+ },
715
+ {
716
+ "epoch": 23.04,
717
+ "grad_norm": 5.404343605041504,
718
+ "learning_rate": 2.508960573476702e-07,
719
+ "loss": 0.5891,
720
+ "step": 720
721
+ },
722
+ {
723
+ "epoch": 23.36,
724
+ "grad_norm": 4.924590587615967,
725
+ "learning_rate": 2.389486260454002e-07,
726
+ "loss": 0.4529,
727
+ "step": 730
728
+ },
729
+ {
730
+ "epoch": 23.68,
731
+ "grad_norm": 3.1310863494873047,
732
+ "learning_rate": 2.270011947431302e-07,
733
+ "loss": 0.5812,
734
+ "step": 740
735
+ },
736
+ {
737
+ "epoch": 24.0,
738
+ "grad_norm": 5.183323383331299,
739
+ "learning_rate": 2.1505376344086022e-07,
740
+ "loss": 0.6701,
741
+ "step": 750
742
+ },
743
+ {
744
+ "epoch": 24.0,
745
+ "eval_accuracy": 0.75,
746
+ "eval_loss": 0.5505539178848267,
747
+ "eval_runtime": 1.1047,
748
+ "eval_samples_per_second": 28.967,
749
+ "eval_steps_per_second": 28.967,
750
+ "step": 750
751
+ },
752
+ {
753
+ "epoch": 24.32,
754
+ "grad_norm": 11.777995109558105,
755
+ "learning_rate": 2.0310633213859019e-07,
756
+ "loss": 0.5262,
757
+ "step": 760
758
+ },
759
+ {
760
+ "epoch": 24.64,
761
+ "grad_norm": 2.3787074089050293,
762
+ "learning_rate": 1.9115890083632018e-07,
763
+ "loss": 0.4884,
764
+ "step": 770
765
+ },
766
+ {
767
+ "epoch": 24.96,
768
+ "grad_norm": 5.162797927856445,
769
+ "learning_rate": 1.7921146953405018e-07,
770
+ "loss": 0.6157,
771
+ "step": 780
772
+ },
773
+ {
774
+ "epoch": 24.992,
775
+ "eval_accuracy": 0.75,
776
+ "eval_loss": 0.5503212213516235,
777
+ "eval_runtime": 1.2194,
778
+ "eval_samples_per_second": 26.243,
779
+ "eval_steps_per_second": 26.243,
780
+ "step": 781
781
+ },
782
+ {
783
+ "epoch": 25.28,
784
+ "grad_norm": 4.399529933929443,
785
+ "learning_rate": 1.6726403823178015e-07,
786
+ "loss": 0.5277,
787
+ "step": 790
788
+ },
789
+ {
790
+ "epoch": 25.6,
791
+ "grad_norm": 8.869352340698242,
792
+ "learning_rate": 1.5531660692951015e-07,
793
+ "loss": 0.6222,
794
+ "step": 800
795
+ },
796
+ {
797
+ "epoch": 25.92,
798
+ "grad_norm": 3.5912718772888184,
799
+ "learning_rate": 1.4336917562724012e-07,
800
+ "loss": 0.5657,
801
+ "step": 810
802
+ },
803
+ {
804
+ "epoch": 25.984,
805
+ "eval_accuracy": 0.75,
806
+ "eval_loss": 0.5502746105194092,
807
+ "eval_runtime": 1.0991,
808
+ "eval_samples_per_second": 29.116,
809
+ "eval_steps_per_second": 29.116,
810
+ "step": 812
811
+ },
812
+ {
813
+ "epoch": 26.24,
814
+ "grad_norm": 4.004587173461914,
815
+ "learning_rate": 1.3142174432497014e-07,
816
+ "loss": 0.5406,
817
+ "step": 820
818
+ },
819
+ {
820
+ "epoch": 26.56,
821
+ "grad_norm": 6.145273208618164,
822
+ "learning_rate": 1.194743130227001e-07,
823
+ "loss": 0.6507,
824
+ "step": 830
825
+ },
826
+ {
827
+ "epoch": 26.88,
828
+ "grad_norm": 2.8329687118530273,
829
+ "learning_rate": 1.0752688172043011e-07,
830
+ "loss": 0.5127,
831
+ "step": 840
832
+ },
833
+ {
834
+ "epoch": 26.976,
835
+ "eval_accuracy": 0.75,
836
+ "eval_loss": 0.5503281354904175,
837
+ "eval_runtime": 1.277,
838
+ "eval_samples_per_second": 25.058,
839
+ "eval_steps_per_second": 25.058,
840
+ "step": 843
841
+ },
842
+ {
843
+ "epoch": 27.2,
844
+ "grad_norm": 5.049647808074951,
845
+ "learning_rate": 9.557945041816009e-08,
846
+ "loss": 0.4542,
847
+ "step": 850
848
+ },
849
+ {
850
+ "epoch": 27.52,
851
+ "grad_norm": 2.4863924980163574,
852
+ "learning_rate": 8.363201911589008e-08,
853
+ "loss": 0.5172,
854
+ "step": 860
855
+ },
856
+ {
857
+ "epoch": 27.84,
858
+ "grad_norm": 5.455885887145996,
859
+ "learning_rate": 7.168458781362006e-08,
860
+ "loss": 0.6178,
861
+ "step": 870
862
+ },
863
+ {
864
+ "epoch": 28.0,
865
+ "eval_accuracy": 0.75,
866
+ "eval_loss": 0.5502800941467285,
867
+ "eval_runtime": 1.1133,
868
+ "eval_samples_per_second": 28.744,
869
+ "eval_steps_per_second": 28.744,
870
+ "step": 875
871
+ },
872
+ {
873
+ "epoch": 28.16,
874
+ "grad_norm": 5.835262298583984,
875
+ "learning_rate": 5.973715651135006e-08,
876
+ "loss": 0.4629,
877
+ "step": 880
878
+ },
879
+ {
880
+ "epoch": 28.48,
881
+ "grad_norm": 5.468852519989014,
882
+ "learning_rate": 4.7789725209080046e-08,
883
+ "loss": 0.381,
884
+ "step": 890
885
+ },
886
+ {
887
+ "epoch": 28.8,
888
+ "grad_norm": 4.890566825866699,
889
+ "learning_rate": 3.584229390681003e-08,
890
+ "loss": 0.5679,
891
+ "step": 900
892
+ },
893
+ {
894
+ "epoch": 28.992,
895
+ "eval_accuracy": 0.75,
896
+ "eval_loss": 0.5502068996429443,
897
+ "eval_runtime": 1.1769,
898
+ "eval_samples_per_second": 27.191,
899
+ "eval_steps_per_second": 27.191,
900
+ "step": 906
901
+ },
902
+ {
903
+ "epoch": 29.12,
904
+ "grad_norm": 5.2326884269714355,
905
+ "learning_rate": 2.3894862604540023e-08,
906
+ "loss": 0.7681,
907
+ "step": 910
908
+ },
909
+ {
910
+ "epoch": 29.44,
911
+ "grad_norm": 8.419669151306152,
912
+ "learning_rate": 1.1947431302270011e-08,
913
+ "loss": 0.6077,
914
+ "step": 920
915
+ },
916
+ {
917
+ "epoch": 29.76,
918
+ "grad_norm": 2.1025705337524414,
919
+ "learning_rate": 0.0,
920
+ "loss": 0.6102,
921
+ "step": 930
922
+ },
923
+ {
924
+ "epoch": 29.76,
925
+ "eval_accuracy": 0.75,
926
+ "eval_loss": 0.5502274632453918,
927
+ "eval_runtime": 1.6314,
928
+ "eval_samples_per_second": 19.615,
929
+ "eval_steps_per_second": 19.615,
930
+ "step": 930
931
+ },
932
+ {
933
+ "epoch": 29.76,
934
+ "step": 930,
935
+ "total_flos": 3.8132430847082496e+17,
936
+ "train_loss": 0.5704158216394404,
937
+ "train_runtime": 536.3007,
938
+ "train_samples_per_second": 6.992,
939
+ "train_steps_per_second": 1.734
940
+ }
941
+ ],
942
+ "logging_steps": 10,
943
+ "max_steps": 930,
944
+ "num_input_tokens_seen": 0,
945
+ "num_train_epochs": 30,
946
+ "save_steps": 500,
947
+ "stateful_callbacks": {
948
+ "TrainerControl": {
949
+ "args": {
950
+ "should_epoch_stop": false,
951
+ "should_evaluate": false,
952
+ "should_log": false,
953
+ "should_save": true,
954
+ "should_training_stop": true
955
+ },
956
+ "attributes": {}
957
+ }
958
+ },
959
+ "total_flos": 3.8132430847082496e+17,
960
+ "train_batch_size": 1,
961
+ "trial_name": null,
962
+ "trial_params": null
963
+ }