nrshoudi commited on
Commit
08eb3e7
·
verified ·
1 Parent(s): 3dd7bb7

End of training

Browse files
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.2137
20
 
21
  ## Model description
22
 
@@ -49,16 +49,16 @@ The following hyperparameters were used during training:
49
 
50
  | Training Loss | Epoch | Step | Validation Loss |
51
  |:-------------:|:-----:|:----:|:---------------:|
52
- | 0.0681 | 1.0 | 546 | 0.1955 |
53
- | 0.0367 | 2.0 | 1092 | 0.1992 |
54
- | 0.0382 | 3.0 | 1638 | 0.1857 |
55
- | 0.0189 | 4.0 | 2184 | 0.1970 |
56
- | 0.0274 | 5.0 | 2730 | 0.1894 |
57
- | 0.02 | 6.0 | 3276 | 0.1877 |
58
- | 0.0087 | 7.0 | 3822 | 0.1908 |
59
- | 0.0066 | 8.0 | 4368 | 0.2085 |
60
- | 0.0055 | 9.0 | 4914 | 0.2100 |
61
- | 0.0013 | 10.0 | 5460 | 0.2137 |
62
 
63
 
64
  ### Framework versions
 
16
 
17
  This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.2212
20
 
21
  ## Model description
22
 
 
49
 
50
  | Training Loss | Epoch | Step | Validation Loss |
51
  |:-------------:|:-----:|:----:|:---------------:|
52
+ | 0.0726 | 1.0 | 546 | 0.2210 |
53
+ | 0.0419 | 2.0 | 1092 | 0.2139 |
54
+ | 0.0322 | 3.0 | 1638 | 0.1935 |
55
+ | 0.0175 | 4.0 | 2184 | 0.1896 |
56
+ | 0.0266 | 5.0 | 2730 | 0.1927 |
57
+ | 0.0178 | 6.0 | 3276 | 0.2013 |
58
+ | 0.0081 | 7.0 | 3822 | 0.1979 |
59
+ | 0.0081 | 8.0 | 4368 | 0.2113 |
60
+ | 0.0018 | 9.0 | 4914 | 0.2146 |
61
+ | 0.0015 | 10.0 | 5460 | 0.2212 |
62
 
63
 
64
  ### Framework versions
adapter_1/adapter_config.json CHANGED
@@ -23,8 +23,8 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "q_proj",
27
- "v_proj"
28
  ],
29
  "task_type": null,
30
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "v_proj",
27
+ "q_proj"
28
  ],
29
  "task_type": null,
30
  "use_dora": false,
adapter_1/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abb0ebdc47891ae516deaeab040b653abd88b2dceb9990155159ce05013d93b9
3
  size 14176064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28887bcb66a49c5a999c6eb3d3de767b7cce7fddc9b4c82e33787479d628aa8
3
  size 14176064
adapter_config.json CHANGED
@@ -23,8 +23,8 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "q_proj",
27
- "v_proj"
28
  ],
29
  "task_type": null,
30
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "v_proj",
27
+ "q_proj"
28
  ],
29
  "task_type": null,
30
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de7195721032862164c68a850217a7bbc5a0df6dd26266eb5ee8c195bfb57721
3
  size 14176064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c76afb1d27a22a98d2438cd165915fee99c1936be55a353c091f790c722bdd2
3
  size 14176064
trainer_state.json CHANGED
@@ -10,1617 +10,1617 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
- "grad_norm": Infinity,
14
- "learning_rate": 0.00044,
15
- "loss": 4.1397,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.09,
20
- "grad_norm": 0.8320172429084778,
21
- "learning_rate": 0.00094,
22
- "loss": 1.0739,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.14,
27
- "grad_norm": 1.5708420276641846,
28
- "learning_rate": 0.0009959334565619224,
29
- "loss": 0.4997,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.18,
34
- "grad_norm": 0.5893439650535583,
35
- "learning_rate": 0.000991312384473198,
36
- "loss": 0.1246,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.23,
41
- "grad_norm": 2.0901906490325928,
42
- "learning_rate": 0.0009866913123844732,
43
- "loss": 0.1273,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.27,
48
- "grad_norm": 0.9590554237365723,
49
- "learning_rate": 0.0009820702402957486,
50
- "loss": 0.1257,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.32,
55
- "grad_norm": 1.531774878501892,
56
- "learning_rate": 0.000977449168207024,
57
- "loss": 0.1024,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.37,
62
- "grad_norm": 1.2284561395645142,
63
- "learning_rate": 0.0009728280961182994,
64
- "loss": 0.1041,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.41,
69
- "grad_norm": 1.0752886533737183,
70
- "learning_rate": 0.0009682070240295749,
71
- "loss": 0.1225,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.46,
76
- "grad_norm": 0.7119214534759521,
77
- "learning_rate": 0.0009635859519408503,
78
- "loss": 0.0904,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.5,
83
- "grad_norm": 1.0164552927017212,
84
- "learning_rate": 0.0009589648798521257,
85
- "loss": 0.0773,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.55,
90
- "grad_norm": 0.4222384989261627,
91
- "learning_rate": 0.0009543438077634012,
92
- "loss": 0.2081,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.6,
97
- "grad_norm": 0.17551083862781525,
98
- "learning_rate": 0.0009497227356746766,
99
- "loss": 0.0863,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.64,
104
- "grad_norm": 0.41848981380462646,
105
- "learning_rate": 0.000945101663585952,
106
- "loss": 0.0632,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.69,
111
- "grad_norm": 0.7539293766021729,
112
- "learning_rate": 0.0009404805914972274,
113
- "loss": 0.077,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.73,
118
- "grad_norm": 0.3750676214694977,
119
- "learning_rate": 0.0009358595194085028,
120
- "loss": 0.0948,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.78,
125
- "grad_norm": 0.33498436212539673,
126
- "learning_rate": 0.0009312384473197783,
127
- "loss": 0.077,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.82,
132
- "grad_norm": 0.43420735001564026,
133
- "learning_rate": 0.0009266173752310536,
134
- "loss": 0.0729,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.87,
139
- "grad_norm": 1.0590511560440063,
140
  "learning_rate": 0.0009219963031423291,
141
- "loss": 0.0816,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.92,
146
- "grad_norm": 0.34223881363868713,
147
  "learning_rate": 0.0009173752310536044,
148
- "loss": 0.0567,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.96,
153
- "grad_norm": 0.26913997530937195,
154
  "learning_rate": 0.0009127541589648799,
155
- "loss": 0.0681,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 1.0,
160
- "eval_loss": 0.19546650350093842,
161
- "eval_runtime": 175.4508,
162
- "eval_samples_per_second": 4.634,
163
- "eval_steps_per_second": 0.775,
164
  "step": 546
165
  },
166
  {
167
  "epoch": 1.01,
168
- "grad_norm": 0.41807565093040466,
169
  "learning_rate": 0.0009081330868761552,
170
- "loss": 0.0365,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.05,
175
- "grad_norm": 0.8459017872810364,
176
  "learning_rate": 0.0009035120147874307,
177
- "loss": 0.0481,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 1.1,
182
- "grad_norm": 0.21556143462657928,
183
  "learning_rate": 0.000898890942698706,
184
- "loss": 0.0426,
185
  "step": 600
186
  },
187
  {
188
  "epoch": 1.14,
189
- "grad_norm": 0.06540340185165405,
190
  "learning_rate": 0.0008942698706099815,
191
- "loss": 0.0435,
192
  "step": 625
193
  },
194
  {
195
  "epoch": 1.19,
196
- "grad_norm": 0.8853626251220703,
197
  "learning_rate": 0.0008896487985212569,
198
- "loss": 0.0668,
199
  "step": 650
200
  },
201
  {
202
  "epoch": 1.24,
203
- "grad_norm": 0.2227557897567749,
204
  "learning_rate": 0.0008850277264325323,
205
- "loss": 0.0373,
206
  "step": 675
207
  },
208
  {
209
  "epoch": 1.28,
210
- "grad_norm": 0.5395527482032776,
211
  "learning_rate": 0.0008804066543438077,
212
- "loss": 0.0505,
213
  "step": 700
214
  },
215
  {
216
  "epoch": 1.33,
217
- "grad_norm": 0.47810012102127075,
218
  "learning_rate": 0.0008757855822550833,
219
- "loss": 0.0619,
220
  "step": 725
221
  },
222
  {
223
  "epoch": 1.37,
224
- "grad_norm": 0.41824525594711304,
225
  "learning_rate": 0.0008711645101663586,
226
- "loss": 0.0505,
227
  "step": 750
228
  },
229
  {
230
  "epoch": 1.42,
231
- "grad_norm": 0.35845717787742615,
232
  "learning_rate": 0.0008665434380776341,
233
- "loss": 0.0556,
234
  "step": 775
235
  },
236
  {
237
  "epoch": 1.47,
238
- "grad_norm": 0.591626763343811,
239
  "learning_rate": 0.0008619223659889095,
240
- "loss": 0.0452,
241
  "step": 800
242
  },
243
  {
244
  "epoch": 1.51,
245
- "grad_norm": 0.52753746509552,
246
  "learning_rate": 0.0008573012939001849,
247
- "loss": 0.0413,
248
  "step": 825
249
  },
250
  {
251
  "epoch": 1.56,
252
- "grad_norm": 0.17933356761932373,
253
  "learning_rate": 0.0008526802218114603,
254
- "loss": 0.0264,
255
  "step": 850
256
  },
257
  {
258
  "epoch": 1.6,
259
- "grad_norm": 0.4725402891635895,
260
  "learning_rate": 0.0008480591497227357,
261
- "loss": 0.0503,
262
  "step": 875
263
  },
264
  {
265
  "epoch": 1.65,
266
- "grad_norm": 0.43168240785598755,
267
  "learning_rate": 0.0008434380776340112,
268
- "loss": 0.0543,
269
  "step": 900
270
  },
271
  {
272
  "epoch": 1.69,
273
- "grad_norm": 0.15935927629470825,
274
  "learning_rate": 0.0008388170055452865,
275
- "loss": 0.0625,
276
  "step": 925
277
  },
278
  {
279
  "epoch": 1.74,
280
- "grad_norm": 0.1527830809354782,
281
  "learning_rate": 0.000834195933456562,
282
- "loss": 0.0338,
283
  "step": 950
284
  },
285
  {
286
  "epoch": 1.79,
287
- "grad_norm": 0.6140448451042175,
288
  "learning_rate": 0.0008295748613678373,
289
- "loss": 0.0391,
290
  "step": 975
291
  },
292
  {
293
  "epoch": 1.83,
294
- "grad_norm": 0.34989482164382935,
295
  "learning_rate": 0.0008249537892791128,
296
- "loss": 0.0428,
297
  "step": 1000
298
  },
299
  {
300
  "epoch": 1.88,
301
- "grad_norm": 1.364334225654602,
302
  "learning_rate": 0.0008203327171903881,
303
- "loss": 0.0452,
304
  "step": 1025
305
  },
306
  {
307
  "epoch": 1.92,
308
- "grad_norm": 0.4410119354724884,
309
  "learning_rate": 0.0008157116451016636,
310
- "loss": 0.0519,
311
  "step": 1050
312
  },
313
  {
314
  "epoch": 1.97,
315
- "grad_norm": 0.018156496807932854,
316
  "learning_rate": 0.000811090573012939,
317
- "loss": 0.0367,
318
  "step": 1075
319
  },
320
  {
321
  "epoch": 2.0,
322
- "eval_loss": 0.19916145503520966,
323
- "eval_runtime": 174.1191,
324
- "eval_samples_per_second": 4.669,
325
  "eval_steps_per_second": 0.781,
326
  "step": 1092
327
  },
328
  {
329
  "epoch": 2.01,
330
- "grad_norm": 0.2884193956851959,
331
  "learning_rate": 0.0008064695009242144,
332
- "loss": 0.0437,
333
  "step": 1100
334
  },
335
  {
336
  "epoch": 2.06,
337
- "grad_norm": 0.14955410361289978,
338
  "learning_rate": 0.0008018484288354898,
339
- "loss": 0.0332,
340
  "step": 1125
341
  },
342
  {
343
  "epoch": 2.11,
344
- "grad_norm": 0.016795417293906212,
345
  "learning_rate": 0.0007972273567467652,
346
- "loss": 0.0614,
347
  "step": 1150
348
  },
349
  {
350
  "epoch": 2.15,
351
- "grad_norm": 0.272935688495636,
352
  "learning_rate": 0.0007926062846580406,
353
- "loss": 0.0354,
354
  "step": 1175
355
  },
356
  {
357
  "epoch": 2.2,
358
- "grad_norm": 0.181545689702034,
359
  "learning_rate": 0.0007879852125693162,
360
- "loss": 0.0477,
361
  "step": 1200
362
  },
363
  {
364
  "epoch": 2.24,
365
- "grad_norm": 0.14347945153713226,
366
  "learning_rate": 0.0007833641404805915,
367
- "loss": 0.0285,
368
  "step": 1225
369
  },
370
  {
371
  "epoch": 2.29,
372
- "grad_norm": 0.1175965741276741,
373
  "learning_rate": 0.000778743068391867,
374
- "loss": 0.0398,
375
  "step": 1250
376
  },
377
  {
378
  "epoch": 2.34,
379
- "grad_norm": 0.089854396879673,
380
  "learning_rate": 0.0007741219963031424,
381
- "loss": 0.0473,
382
  "step": 1275
383
  },
384
  {
385
  "epoch": 2.38,
386
- "grad_norm": 0.3998129665851593,
387
  "learning_rate": 0.0007695009242144178,
388
- "loss": 0.0425,
389
  "step": 1300
390
  },
391
  {
392
  "epoch": 2.43,
393
- "grad_norm": 0.42039960622787476,
394
  "learning_rate": 0.0007648798521256932,
395
- "loss": 0.0404,
396
  "step": 1325
397
  },
398
  {
399
  "epoch": 2.47,
400
- "grad_norm": 0.3940460979938507,
401
  "learning_rate": 0.0007602587800369686,
402
- "loss": 0.0381,
403
  "step": 1350
404
  },
405
  {
406
  "epoch": 2.52,
407
- "grad_norm": 0.37924668192863464,
408
  "learning_rate": 0.0007556377079482441,
409
- "loss": 0.0397,
410
  "step": 1375
411
  },
412
  {
413
  "epoch": 2.56,
414
- "grad_norm": 0.46505168080329895,
415
  "learning_rate": 0.0007510166358595194,
416
- "loss": 0.0348,
417
  "step": 1400
418
  },
419
  {
420
  "epoch": 2.61,
421
- "grad_norm": 0.2604403495788574,
422
  "learning_rate": 0.0007463955637707949,
423
- "loss": 0.036,
424
  "step": 1425
425
  },
426
  {
427
  "epoch": 2.66,
428
- "grad_norm": 0.4742681384086609,
429
  "learning_rate": 0.0007417744916820702,
430
- "loss": 0.0216,
431
  "step": 1450
432
  },
433
  {
434
  "epoch": 2.7,
435
- "grad_norm": 0.5116605162620544,
436
  "learning_rate": 0.0007371534195933457,
437
- "loss": 0.0281,
438
  "step": 1475
439
  },
440
  {
441
  "epoch": 2.75,
442
- "grad_norm": 0.8539583683013916,
443
  "learning_rate": 0.000732532347504621,
444
- "loss": 0.0347,
445
  "step": 1500
446
  },
447
  {
448
  "epoch": 2.79,
449
- "grad_norm": 1.8664207458496094,
450
  "learning_rate": 0.0007279112754158965,
451
- "loss": 0.045,
452
  "step": 1525
453
  },
454
  {
455
  "epoch": 2.84,
456
- "grad_norm": 0.18790672719478607,
457
  "learning_rate": 0.0007232902033271719,
458
- "loss": 0.0233,
459
  "step": 1550
460
  },
461
  {
462
  "epoch": 2.88,
463
- "grad_norm": 0.31298670172691345,
464
  "learning_rate": 0.0007186691312384473,
465
- "loss": 0.0337,
466
  "step": 1575
467
  },
468
  {
469
  "epoch": 2.93,
470
- "grad_norm": 0.8353794813156128,
471
  "learning_rate": 0.0007140480591497227,
472
- "loss": 0.0387,
473
  "step": 1600
474
  },
475
  {
476
  "epoch": 2.98,
477
- "grad_norm": 0.08966954797506332,
478
  "learning_rate": 0.0007094269870609981,
479
- "loss": 0.0382,
480
  "step": 1625
481
  },
482
  {
483
  "epoch": 3.0,
484
- "eval_loss": 0.18573200702667236,
485
- "eval_runtime": 175.912,
486
- "eval_samples_per_second": 4.622,
487
- "eval_steps_per_second": 0.773,
488
  "step": 1638
489
  },
490
  {
491
  "epoch": 3.02,
492
- "grad_norm": 0.039618588984012604,
493
  "learning_rate": 0.0007048059149722735,
494
- "loss": 0.0353,
495
  "step": 1650
496
  },
497
  {
498
  "epoch": 3.07,
499
- "grad_norm": 0.3156013488769531,
500
  "learning_rate": 0.000700184842883549,
501
- "loss": 0.0318,
502
  "step": 1675
503
  },
504
  {
505
  "epoch": 3.11,
506
- "grad_norm": 1.9033042192459106,
507
  "learning_rate": 0.0006955637707948245,
508
- "loss": 0.0273,
509
  "step": 1700
510
  },
511
  {
512
  "epoch": 3.16,
513
- "grad_norm": 0.32316115498542786,
514
  "learning_rate": 0.0006909426987060999,
515
- "loss": 0.035,
516
  "step": 1725
517
  },
518
  {
519
  "epoch": 3.21,
520
- "grad_norm": 0.5656726956367493,
521
  "learning_rate": 0.0006863216266173753,
522
- "loss": 0.0311,
523
  "step": 1750
524
  },
525
  {
526
  "epoch": 3.25,
527
- "grad_norm": 0.032537270337343216,
528
  "learning_rate": 0.0006817005545286507,
529
- "loss": 0.0274,
530
  "step": 1775
531
  },
532
  {
533
  "epoch": 3.3,
534
- "grad_norm": 0.30572062730789185,
535
  "learning_rate": 0.0006770794824399261,
536
- "loss": 0.0362,
537
  "step": 1800
538
  },
539
  {
540
  "epoch": 3.34,
541
- "grad_norm": 0.3374157249927521,
542
  "learning_rate": 0.0006724584103512015,
543
- "loss": 0.0305,
544
  "step": 1825
545
  },
546
  {
547
  "epoch": 3.39,
548
- "grad_norm": 0.1089138388633728,
549
  "learning_rate": 0.000667837338262477,
550
- "loss": 0.0301,
551
  "step": 1850
552
  },
553
  {
554
  "epoch": 3.43,
555
- "grad_norm": 0.10849720984697342,
556
  "learning_rate": 0.0006632162661737523,
557
- "loss": 0.0241,
558
  "step": 1875
559
  },
560
  {
561
  "epoch": 3.48,
562
- "grad_norm": 0.11349553614854813,
563
  "learning_rate": 0.0006585951940850278,
564
- "loss": 0.0218,
565
  "step": 1900
566
  },
567
  {
568
  "epoch": 3.53,
569
- "grad_norm": 0.3237963616847992,
570
  "learning_rate": 0.0006539741219963031,
571
- "loss": 0.019,
572
  "step": 1925
573
  },
574
  {
575
  "epoch": 3.57,
576
- "grad_norm": 0.5101845860481262,
577
  "learning_rate": 0.0006493530499075786,
578
- "loss": 0.0244,
579
  "step": 1950
580
  },
581
  {
582
  "epoch": 3.62,
583
- "grad_norm": 0.010137775912880898,
584
  "learning_rate": 0.0006447319778188539,
585
- "loss": 0.0304,
586
  "step": 1975
587
  },
588
  {
589
  "epoch": 3.66,
590
- "grad_norm": 0.8954480886459351,
591
  "learning_rate": 0.0006401109057301294,
592
- "loss": 0.0306,
593
  "step": 2000
594
  },
595
  {
596
  "epoch": 3.71,
597
- "grad_norm": 0.014889650978147984,
598
  "learning_rate": 0.0006354898336414048,
599
- "loss": 0.0251,
600
  "step": 2025
601
  },
602
  {
603
  "epoch": 3.75,
604
- "grad_norm": 0.0878029614686966,
605
  "learning_rate": 0.0006308687615526802,
606
- "loss": 0.0341,
607
  "step": 2050
608
  },
609
  {
610
  "epoch": 3.8,
611
- "grad_norm": 0.13351218402385712,
612
  "learning_rate": 0.0006262476894639556,
613
- "loss": 0.0246,
614
  "step": 2075
615
  },
616
  {
617
  "epoch": 3.85,
618
- "grad_norm": 0.3208947479724884,
619
  "learning_rate": 0.000621626617375231,
620
- "loss": 0.0248,
621
  "step": 2100
622
  },
623
  {
624
  "epoch": 3.89,
625
- "grad_norm": 0.42570358514785767,
626
  "learning_rate": 0.0006170055452865064,
627
- "loss": 0.0358,
628
  "step": 2125
629
  },
630
  {
631
  "epoch": 3.94,
632
- "grad_norm": 0.131515234708786,
633
  "learning_rate": 0.000612384473197782,
634
- "loss": 0.0355,
635
  "step": 2150
636
  },
637
  {
638
  "epoch": 3.98,
639
- "grad_norm": 0.12344180792570114,
640
  "learning_rate": 0.0006077634011090574,
641
- "loss": 0.0189,
642
  "step": 2175
643
  },
644
  {
645
  "epoch": 4.0,
646
- "eval_loss": 0.19701677560806274,
647
- "eval_runtime": 176.7126,
648
- "eval_samples_per_second": 4.601,
649
- "eval_steps_per_second": 0.77,
650
  "step": 2184
651
  },
652
  {
653
  "epoch": 4.03,
654
- "grad_norm": 0.10986749082803726,
655
  "learning_rate": 0.0006031423290203328,
656
- "loss": 0.0259,
657
  "step": 2200
658
  },
659
  {
660
  "epoch": 4.08,
661
- "grad_norm": 0.46528518199920654,
662
  "learning_rate": 0.0005985212569316082,
663
- "loss": 0.022,
664
  "step": 2225
665
  },
666
  {
667
  "epoch": 4.12,
668
- "grad_norm": 0.2069913148880005,
669
  "learning_rate": 0.0005939001848428836,
670
- "loss": 0.015,
671
  "step": 2250
672
  },
673
  {
674
  "epoch": 4.17,
675
- "grad_norm": 0.34658578038215637,
676
  "learning_rate": 0.000589279112754159,
677
- "loss": 0.0299,
678
  "step": 2275
679
  },
680
  {
681
  "epoch": 4.21,
682
- "grad_norm": 0.18868118524551392,
683
  "learning_rate": 0.0005846580406654344,
684
- "loss": 0.0174,
685
  "step": 2300
686
  },
687
  {
688
  "epoch": 4.26,
689
- "grad_norm": 0.33069688081741333,
690
  "learning_rate": 0.0005800369685767099,
691
- "loss": 0.0216,
692
  "step": 2325
693
  },
694
  {
695
  "epoch": 4.3,
696
- "grad_norm": 0.7511343955993652,
697
  "learning_rate": 0.0005754158964879852,
698
- "loss": 0.0157,
699
  "step": 2350
700
  },
701
  {
702
  "epoch": 4.35,
703
- "grad_norm": 0.27277225255966187,
704
  "learning_rate": 0.0005707948243992607,
705
- "loss": 0.0198,
706
  "step": 2375
707
  },
708
  {
709
  "epoch": 4.4,
710
- "grad_norm": 2.3098878860473633,
711
  "learning_rate": 0.000566173752310536,
712
- "loss": 0.026,
713
  "step": 2400
714
  },
715
  {
716
  "epoch": 4.44,
717
- "grad_norm": 0.39823707938194275,
718
  "learning_rate": 0.0005615526802218115,
719
- "loss": 0.0118,
720
  "step": 2425
721
  },
722
  {
723
  "epoch": 4.49,
724
- "grad_norm": 0.2773701250553131,
725
  "learning_rate": 0.0005569316081330868,
726
- "loss": 0.0319,
727
  "step": 2450
728
  },
729
  {
730
  "epoch": 4.53,
731
- "grad_norm": 0.2549929916858673,
732
  "learning_rate": 0.0005523105360443623,
733
- "loss": 0.0164,
734
  "step": 2475
735
  },
736
  {
737
  "epoch": 4.58,
738
- "grad_norm": 3.1059272289276123,
739
  "learning_rate": 0.0005476894639556377,
740
- "loss": 0.0231,
741
  "step": 2500
742
  },
743
  {
744
  "epoch": 4.62,
745
- "grad_norm": 0.10516056418418884,
746
  "learning_rate": 0.0005430683918669131,
747
- "loss": 0.0262,
748
  "step": 2525
749
  },
750
  {
751
  "epoch": 4.67,
752
- "grad_norm": 0.046087902039289474,
753
  "learning_rate": 0.0005384473197781885,
754
- "loss": 0.0212,
755
  "step": 2550
756
  },
757
  {
758
  "epoch": 4.72,
759
- "grad_norm": 0.9207663536071777,
760
  "learning_rate": 0.0005338262476894639,
761
- "loss": 0.018,
762
  "step": 2575
763
  },
764
  {
765
  "epoch": 4.76,
766
- "grad_norm": 0.5687919855117798,
767
  "learning_rate": 0.0005292051756007393,
768
- "loss": 0.0255,
769
  "step": 2600
770
  },
771
  {
772
  "epoch": 4.81,
773
- "grad_norm": 0.006184098310768604,
774
  "learning_rate": 0.0005245841035120147,
775
- "loss": 0.0207,
776
  "step": 2625
777
  },
778
  {
779
  "epoch": 4.85,
780
- "grad_norm": 0.5442487597465515,
781
  "learning_rate": 0.0005199630314232903,
782
- "loss": 0.0192,
783
  "step": 2650
784
  },
785
  {
786
  "epoch": 4.9,
787
- "grad_norm": 0.031753990799188614,
788
  "learning_rate": 0.0005153419593345657,
789
- "loss": 0.015,
790
  "step": 2675
791
  },
792
  {
793
  "epoch": 4.95,
794
- "grad_norm": 0.022051149979233742,
795
  "learning_rate": 0.0005107208872458411,
796
- "loss": 0.0252,
797
  "step": 2700
798
  },
799
  {
800
  "epoch": 4.99,
801
- "grad_norm": 0.17456993460655212,
802
  "learning_rate": 0.0005060998151571165,
803
- "loss": 0.0274,
804
  "step": 2725
805
  },
806
  {
807
  "epoch": 5.0,
808
- "eval_loss": 0.18937160074710846,
809
- "eval_runtime": 177.0599,
810
- "eval_samples_per_second": 4.592,
811
- "eval_steps_per_second": 0.768,
812
  "step": 2730
813
  },
814
  {
815
  "epoch": 5.04,
816
- "grad_norm": 0.643435537815094,
817
  "learning_rate": 0.0005014787430683919,
818
- "loss": 0.0211,
819
  "step": 2750
820
  },
821
  {
822
  "epoch": 5.08,
823
- "grad_norm": 0.35862746834754944,
824
  "learning_rate": 0.0004968576709796673,
825
- "loss": 0.0073,
826
  "step": 2775
827
  },
828
  {
829
  "epoch": 5.13,
830
- "grad_norm": 0.5732066035270691,
831
  "learning_rate": 0.0004922365988909427,
832
- "loss": 0.0083,
833
  "step": 2800
834
  },
835
  {
836
  "epoch": 5.17,
837
- "grad_norm": 0.21464449167251587,
838
  "learning_rate": 0.0004876155268022181,
839
- "loss": 0.0104,
840
  "step": 2825
841
  },
842
  {
843
  "epoch": 5.22,
844
- "grad_norm": 0.1674581915140152,
845
  "learning_rate": 0.0004829944547134935,
846
- "loss": 0.0093,
847
  "step": 2850
848
  },
849
  {
850
  "epoch": 5.27,
851
- "grad_norm": 0.03593946248292923,
852
  "learning_rate": 0.000478373382624769,
853
- "loss": 0.0119,
854
  "step": 2875
855
  },
856
  {
857
  "epoch": 5.31,
858
- "grad_norm": 0.18074722588062286,
859
  "learning_rate": 0.0004737523105360444,
860
- "loss": 0.0097,
861
  "step": 2900
862
  },
863
  {
864
  "epoch": 5.36,
865
- "grad_norm": 0.06277300417423248,
866
  "learning_rate": 0.0004691312384473198,
867
- "loss": 0.0137,
868
  "step": 2925
869
  },
870
  {
871
  "epoch": 5.4,
872
- "grad_norm": 0.20016886293888092,
873
  "learning_rate": 0.0004645101663585952,
874
- "loss": 0.0204,
875
  "step": 2950
876
  },
877
  {
878
  "epoch": 5.45,
879
- "grad_norm": 0.1815144419670105,
880
  "learning_rate": 0.0004598890942698706,
881
- "loss": 0.0162,
882
  "step": 2975
883
  },
884
  {
885
  "epoch": 5.49,
886
- "grad_norm": 0.5112192034721375,
887
  "learning_rate": 0.00045526802218114607,
888
- "loss": 0.0131,
889
  "step": 3000
890
  },
891
  {
892
  "epoch": 5.54,
893
- "grad_norm": 0.1796441674232483,
894
  "learning_rate": 0.0004506469500924215,
895
- "loss": 0.0176,
896
  "step": 3025
897
  },
898
  {
899
  "epoch": 5.59,
900
- "grad_norm": 0.4108269214630127,
901
  "learning_rate": 0.0004460258780036969,
902
- "loss": 0.0196,
903
  "step": 3050
904
  },
905
  {
906
  "epoch": 5.63,
907
- "grad_norm": 0.4271663427352905,
908
  "learning_rate": 0.0004414048059149723,
909
- "loss": 0.017,
910
  "step": 3075
911
  },
912
  {
913
  "epoch": 5.68,
914
- "grad_norm": 0.2981961667537689,
915
  "learning_rate": 0.0004367837338262477,
916
- "loss": 0.012,
917
  "step": 3100
918
  },
919
  {
920
  "epoch": 5.72,
921
- "grad_norm": 0.392818808555603,
922
  "learning_rate": 0.0004321626617375231,
923
- "loss": 0.0115,
924
  "step": 3125
925
  },
926
  {
927
  "epoch": 5.77,
928
- "grad_norm": 0.00586000457406044,
929
  "learning_rate": 0.0004275415896487985,
930
- "loss": 0.0115,
931
  "step": 3150
932
  },
933
  {
934
  "epoch": 5.82,
935
- "grad_norm": 0.2224288433790207,
936
  "learning_rate": 0.0004229205175600739,
937
  "loss": 0.0127,
938
  "step": 3175
939
  },
940
  {
941
  "epoch": 5.86,
942
- "grad_norm": 0.28421640396118164,
943
  "learning_rate": 0.00041829944547134933,
944
- "loss": 0.0124,
945
  "step": 3200
946
  },
947
  {
948
  "epoch": 5.91,
949
- "grad_norm": 0.3791782557964325,
950
  "learning_rate": 0.00041367837338262474,
951
- "loss": 0.016,
952
  "step": 3225
953
  },
954
  {
955
  "epoch": 5.95,
956
- "grad_norm": 0.12688513100147247,
957
  "learning_rate": 0.0004090573012939002,
958
- "loss": 0.0212,
959
  "step": 3250
960
  },
961
  {
962
  "epoch": 6.0,
963
- "grad_norm": 0.009004692547023296,
964
  "learning_rate": 0.0004044362292051756,
965
- "loss": 0.02,
966
  "step": 3275
967
  },
968
  {
969
  "epoch": 6.0,
970
- "eval_loss": 0.18766650557518005,
971
- "eval_runtime": 177.5092,
972
- "eval_samples_per_second": 4.58,
973
- "eval_steps_per_second": 0.766,
974
  "step": 3276
975
  },
976
  {
977
  "epoch": 6.04,
978
- "grad_norm": 0.014529082924127579,
979
  "learning_rate": 0.000399815157116451,
980
- "loss": 0.012,
981
  "step": 3300
982
  },
983
  {
984
  "epoch": 6.09,
985
- "grad_norm": 0.16003918647766113,
986
  "learning_rate": 0.0003951940850277264,
987
- "loss": 0.0078,
988
  "step": 3325
989
  },
990
  {
991
  "epoch": 6.14,
992
- "grad_norm": 0.042826466262340546,
993
  "learning_rate": 0.0003905730129390019,
994
- "loss": 0.0116,
995
  "step": 3350
996
  },
997
  {
998
  "epoch": 6.18,
999
- "grad_norm": 0.0034067954402416945,
1000
  "learning_rate": 0.0003859519408502773,
1001
- "loss": 0.006,
1002
  "step": 3375
1003
  },
1004
  {
1005
  "epoch": 6.23,
1006
- "grad_norm": 0.005681981332600117,
1007
  "learning_rate": 0.0003813308687615527,
1008
- "loss": 0.0088,
1009
  "step": 3400
1010
  },
1011
  {
1012
  "epoch": 6.27,
1013
- "grad_norm": 0.05403963476419449,
1014
  "learning_rate": 0.0003767097966728281,
1015
- "loss": 0.0104,
1016
  "step": 3425
1017
  },
1018
  {
1019
  "epoch": 6.32,
1020
- "grad_norm": 0.1421121209859848,
1021
  "learning_rate": 0.0003720887245841035,
1022
- "loss": 0.0066,
1023
  "step": 3450
1024
  },
1025
  {
1026
  "epoch": 6.36,
1027
- "grad_norm": 0.02004937455058098,
1028
  "learning_rate": 0.0003674676524953789,
1029
- "loss": 0.0075,
1030
  "step": 3475
1031
  },
1032
  {
1033
  "epoch": 6.41,
1034
- "grad_norm": 0.009357332251966,
1035
  "learning_rate": 0.0003628465804066544,
1036
- "loss": 0.0065,
1037
  "step": 3500
1038
  },
1039
  {
1040
  "epoch": 6.46,
1041
- "grad_norm": 0.01666351594030857,
1042
  "learning_rate": 0.0003582255083179298,
1043
- "loss": 0.006,
1044
  "step": 3525
1045
  },
1046
  {
1047
  "epoch": 6.5,
1048
- "grad_norm": 0.24134355783462524,
1049
  "learning_rate": 0.0003536044362292052,
1050
- "loss": 0.0088,
1051
  "step": 3550
1052
  },
1053
  {
1054
  "epoch": 6.55,
1055
- "grad_norm": 0.14924415946006775,
1056
  "learning_rate": 0.0003489833641404806,
1057
- "loss": 0.007,
1058
  "step": 3575
1059
  },
1060
  {
1061
  "epoch": 6.59,
1062
- "grad_norm": 0.12202003598213196,
1063
  "learning_rate": 0.000344362292051756,
1064
- "loss": 0.0101,
1065
  "step": 3600
1066
  },
1067
  {
1068
  "epoch": 6.64,
1069
- "grad_norm": 0.0060227783396840096,
1070
  "learning_rate": 0.0003397412199630314,
1071
- "loss": 0.0117,
1072
  "step": 3625
1073
  },
1074
  {
1075
  "epoch": 6.68,
1076
- "grad_norm": 0.3869228959083557,
1077
  "learning_rate": 0.0003351201478743068,
1078
- "loss": 0.0151,
1079
  "step": 3650
1080
  },
1081
  {
1082
  "epoch": 6.73,
1083
- "grad_norm": 0.018938152119517326,
1084
  "learning_rate": 0.00033049907578558223,
1085
- "loss": 0.0076,
1086
  "step": 3675
1087
  },
1088
  {
1089
  "epoch": 6.78,
1090
- "grad_norm": 0.018859045580029488,
1091
  "learning_rate": 0.00032587800369685764,
1092
- "loss": 0.0085,
1093
  "step": 3700
1094
  },
1095
  {
1096
  "epoch": 6.82,
1097
- "grad_norm": 0.08804900199174881,
1098
  "learning_rate": 0.0003212569316081331,
1099
- "loss": 0.0097,
1100
  "step": 3725
1101
  },
1102
  {
1103
  "epoch": 6.87,
1104
- "grad_norm": 0.3045863211154938,
1105
  "learning_rate": 0.0003166358595194085,
1106
- "loss": 0.0132,
1107
  "step": 3750
1108
  },
1109
  {
1110
  "epoch": 6.91,
1111
- "grad_norm": 0.022158470004796982,
1112
  "learning_rate": 0.0003120147874306839,
1113
- "loss": 0.0124,
1114
  "step": 3775
1115
  },
1116
  {
1117
  "epoch": 6.96,
1118
- "grad_norm": 0.15056921541690826,
1119
  "learning_rate": 0.0003073937153419594,
1120
- "loss": 0.0087,
1121
  "step": 3800
1122
  },
1123
  {
1124
  "epoch": 7.0,
1125
- "eval_loss": 0.19078923761844635,
1126
- "eval_runtime": 177.5513,
1127
- "eval_samples_per_second": 4.579,
1128
- "eval_steps_per_second": 0.766,
1129
  "step": 3822
1130
  },
1131
  {
1132
  "epoch": 7.01,
1133
- "grad_norm": 0.04336291924118996,
1134
  "learning_rate": 0.0003027726432532348,
1135
- "loss": 0.0086,
1136
  "step": 3825
1137
  },
1138
  {
1139
  "epoch": 7.05,
1140
- "grad_norm": 0.0327971875667572,
1141
  "learning_rate": 0.0002981515711645102,
1142
- "loss": 0.0084,
1143
  "step": 3850
1144
  },
1145
  {
1146
  "epoch": 7.1,
1147
- "grad_norm": 0.0314444899559021,
1148
  "learning_rate": 0.0002935304990757856,
1149
- "loss": 0.0048,
1150
  "step": 3875
1151
  },
1152
  {
1153
  "epoch": 7.14,
1154
- "grad_norm": 0.17276029288768768,
1155
  "learning_rate": 0.000288909426987061,
1156
- "loss": 0.007,
1157
  "step": 3900
1158
  },
1159
  {
1160
  "epoch": 7.19,
1161
- "grad_norm": 0.18024314939975739,
1162
  "learning_rate": 0.0002842883548983364,
1163
- "loss": 0.0074,
1164
  "step": 3925
1165
  },
1166
  {
1167
  "epoch": 7.23,
1168
- "grad_norm": 0.01734893210232258,
1169
  "learning_rate": 0.0002796672828096118,
1170
- "loss": 0.0071,
1171
  "step": 3950
1172
  },
1173
  {
1174
  "epoch": 7.28,
1175
- "grad_norm": 0.01721636950969696,
1176
  "learning_rate": 0.0002750462107208873,
1177
- "loss": 0.0123,
1178
  "step": 3975
1179
  },
1180
  {
1181
  "epoch": 7.33,
1182
- "grad_norm": 0.03225923702120781,
1183
  "learning_rate": 0.0002704251386321627,
1184
- "loss": 0.0061,
1185
  "step": 4000
1186
  },
1187
  {
1188
  "epoch": 7.37,
1189
- "grad_norm": 0.10785706341266632,
1190
  "learning_rate": 0.0002658040665434381,
1191
- "loss": 0.0071,
1192
  "step": 4025
1193
  },
1194
  {
1195
  "epoch": 7.42,
1196
- "grad_norm": 0.02195531316101551,
1197
  "learning_rate": 0.0002611829944547135,
1198
- "loss": 0.0067,
1199
  "step": 4050
1200
  },
1201
  {
1202
  "epoch": 7.46,
1203
- "grad_norm": 0.025887854397296906,
1204
  "learning_rate": 0.0002565619223659889,
1205
- "loss": 0.0072,
1206
  "step": 4075
1207
  },
1208
  {
1209
  "epoch": 7.51,
1210
- "grad_norm": 1.8573029041290283,
1211
  "learning_rate": 0.0002519408502772643,
1212
- "loss": 0.0044,
1213
  "step": 4100
1214
  },
1215
  {
1216
  "epoch": 7.55,
1217
- "grad_norm": 0.41556769609451294,
1218
  "learning_rate": 0.0002473197781885397,
1219
- "loss": 0.0076,
1220
  "step": 4125
1221
  },
1222
  {
1223
  "epoch": 7.6,
1224
- "grad_norm": 0.0036406666040420532,
1225
  "learning_rate": 0.0002426987060998152,
1226
- "loss": 0.0054,
1227
  "step": 4150
1228
  },
1229
  {
1230
  "epoch": 7.65,
1231
- "grad_norm": 0.1950559765100479,
1232
  "learning_rate": 0.0002380776340110906,
1233
- "loss": 0.0052,
1234
  "step": 4175
1235
  },
1236
  {
1237
  "epoch": 7.69,
1238
- "grad_norm": 0.01785474270582199,
1239
  "learning_rate": 0.000233456561922366,
1240
- "loss": 0.007,
1241
  "step": 4200
1242
  },
1243
  {
1244
  "epoch": 7.74,
1245
- "grad_norm": 0.26933544874191284,
1246
  "learning_rate": 0.0002288354898336414,
1247
- "loss": 0.0048,
1248
  "step": 4225
1249
  },
1250
  {
1251
  "epoch": 7.78,
1252
- "grad_norm": 0.19295917451381683,
1253
  "learning_rate": 0.00022421441774491682,
1254
- "loss": 0.0035,
1255
  "step": 4250
1256
  },
1257
  {
1258
  "epoch": 7.83,
1259
- "grad_norm": 0.008535887114703655,
1260
  "learning_rate": 0.00021959334565619225,
1261
- "loss": 0.0063,
1262
  "step": 4275
1263
  },
1264
  {
1265
  "epoch": 7.88,
1266
- "grad_norm": 0.16601914167404175,
1267
  "learning_rate": 0.00021497227356746766,
1268
- "loss": 0.0049,
1269
  "step": 4300
1270
  },
1271
  {
1272
  "epoch": 7.92,
1273
- "grad_norm": 0.25450438261032104,
1274
  "learning_rate": 0.00021035120147874306,
1275
- "loss": 0.0069,
1276
  "step": 4325
1277
  },
1278
  {
1279
  "epoch": 7.97,
1280
- "grad_norm": 0.049375709146261215,
1281
  "learning_rate": 0.00020573012939001847,
1282
- "loss": 0.0066,
1283
  "step": 4350
1284
  },
1285
  {
1286
  "epoch": 8.0,
1287
- "eval_loss": 0.2085200548171997,
1288
- "eval_runtime": 177.9722,
1289
- "eval_samples_per_second": 4.568,
1290
- "eval_steps_per_second": 0.764,
1291
  "step": 4368
1292
  },
1293
  {
1294
  "epoch": 8.01,
1295
- "grad_norm": 0.06922808289527893,
1296
  "learning_rate": 0.00020110905730129388,
1297
- "loss": 0.0042,
1298
  "step": 4375
1299
  },
1300
  {
1301
  "epoch": 8.06,
1302
- "grad_norm": 0.04170389473438263,
1303
  "learning_rate": 0.00019648798521256934,
1304
- "loss": 0.0037,
1305
  "step": 4400
1306
  },
1307
  {
1308
  "epoch": 8.1,
1309
- "grad_norm": 0.010052547790110111,
1310
  "learning_rate": 0.00019186691312384475,
1311
- "loss": 0.0029,
1312
  "step": 4425
1313
  },
1314
  {
1315
  "epoch": 8.15,
1316
- "grad_norm": 0.25184884667396545,
1317
  "learning_rate": 0.00018724584103512016,
1318
- "loss": 0.0039,
1319
  "step": 4450
1320
  },
1321
  {
1322
  "epoch": 8.2,
1323
- "grad_norm": 0.07106045633554459,
1324
  "learning_rate": 0.00018262476894639556,
1325
- "loss": 0.0039,
1326
  "step": 4475
1327
  },
1328
  {
1329
  "epoch": 8.24,
1330
- "grad_norm": 0.002000249456614256,
1331
  "learning_rate": 0.00017800369685767097,
1332
- "loss": 0.0056,
1333
  "step": 4500
1334
  },
1335
  {
1336
  "epoch": 8.29,
1337
- "grad_norm": 0.025201383978128433,
1338
  "learning_rate": 0.0001733826247689464,
1339
- "loss": 0.0031,
1340
  "step": 4525
1341
  },
1342
  {
1343
  "epoch": 8.33,
1344
- "grad_norm": 0.0007307173800654709,
1345
  "learning_rate": 0.0001687615526802218,
1346
- "loss": 0.0031,
1347
  "step": 4550
1348
  },
1349
  {
1350
  "epoch": 8.38,
1351
- "grad_norm": 0.010259617120027542,
1352
  "learning_rate": 0.00016414048059149722,
1353
- "loss": 0.0036,
1354
  "step": 4575
1355
  },
1356
  {
1357
  "epoch": 8.42,
1358
- "grad_norm": 0.004237270914018154,
1359
  "learning_rate": 0.00015951940850277263,
1360
- "loss": 0.0038,
1361
  "step": 4600
1362
  },
1363
  {
1364
  "epoch": 8.47,
1365
- "grad_norm": 0.02443511225283146,
1366
  "learning_rate": 0.0001548983364140481,
1367
- "loss": 0.0044,
1368
  "step": 4625
1369
  },
1370
  {
1371
  "epoch": 8.52,
1372
- "grad_norm": 0.039590246975421906,
1373
  "learning_rate": 0.0001502772643253235,
1374
- "loss": 0.0019,
1375
  "step": 4650
1376
  },
1377
  {
1378
  "epoch": 8.56,
1379
- "grad_norm": 0.30276018381118774,
1380
  "learning_rate": 0.0001456561922365989,
1381
- "loss": 0.0023,
1382
  "step": 4675
1383
  },
1384
  {
1385
  "epoch": 8.61,
1386
- "grad_norm": 0.05218060687184334,
1387
  "learning_rate": 0.0001410351201478743,
1388
- "loss": 0.003,
1389
  "step": 4700
1390
  },
1391
  {
1392
  "epoch": 8.65,
1393
- "grad_norm": 0.02608703263103962,
1394
  "learning_rate": 0.00013641404805914972,
1395
- "loss": 0.0027,
1396
  "step": 4725
1397
  },
1398
  {
1399
  "epoch": 8.7,
1400
- "grad_norm": 0.007796884514391422,
1401
  "learning_rate": 0.00013179297597042515,
1402
- "loss": 0.0038,
1403
  "step": 4750
1404
  },
1405
  {
1406
  "epoch": 8.75,
1407
- "grad_norm": 0.008572472259402275,
1408
  "learning_rate": 0.00012717190388170056,
1409
- "loss": 0.0018,
1410
  "step": 4775
1411
  },
1412
  {
1413
  "epoch": 8.79,
1414
- "grad_norm": 0.0034019711893051863,
1415
  "learning_rate": 0.00012255083179297597,
1416
- "loss": 0.003,
1417
  "step": 4800
1418
  },
1419
  {
1420
  "epoch": 8.84,
1421
- "grad_norm": 0.003986136056482792,
1422
  "learning_rate": 0.00011792975970425139,
1423
- "loss": 0.0047,
1424
  "step": 4825
1425
  },
1426
  {
1427
  "epoch": 8.88,
1428
- "grad_norm": 0.055789873003959656,
1429
  "learning_rate": 0.00011330868761552681,
1430
- "loss": 0.0021,
1431
  "step": 4850
1432
  },
1433
  {
1434
  "epoch": 8.93,
1435
- "grad_norm": 0.07775359600782394,
1436
  "learning_rate": 0.00010868761552680221,
1437
- "loss": 0.0032,
1438
  "step": 4875
1439
  },
1440
  {
1441
  "epoch": 8.97,
1442
- "grad_norm": 0.0017645555781200528,
1443
  "learning_rate": 0.00010406654343807764,
1444
- "loss": 0.0055,
1445
  "step": 4900
1446
  },
1447
  {
1448
  "epoch": 9.0,
1449
- "eval_loss": 0.21004897356033325,
1450
- "eval_runtime": 178.8955,
1451
- "eval_samples_per_second": 4.545,
1452
- "eval_steps_per_second": 0.76,
1453
  "step": 4914
1454
  },
1455
  {
1456
  "epoch": 9.02,
1457
- "grad_norm": 0.22125497460365295,
1458
  "learning_rate": 9.944547134935306e-05,
1459
- "loss": 0.0031,
1460
  "step": 4925
1461
  },
1462
  {
1463
  "epoch": 9.07,
1464
- "grad_norm": 0.003768475726246834,
1465
  "learning_rate": 9.482439926062846e-05,
1466
  "loss": 0.0013,
1467
  "step": 4950
1468
  },
1469
  {
1470
  "epoch": 9.11,
1471
- "grad_norm": 0.013520549982786179,
1472
  "learning_rate": 9.020332717190388e-05,
1473
- "loss": 0.0025,
1474
  "step": 4975
1475
  },
1476
  {
1477
  "epoch": 9.16,
1478
- "grad_norm": 0.009503871202468872,
1479
  "learning_rate": 8.558225508317929e-05,
1480
- "loss": 0.0024,
1481
  "step": 5000
1482
  },
1483
  {
1484
  "epoch": 9.2,
1485
- "grad_norm": 0.0057460549287498,
1486
  "learning_rate": 8.096118299445473e-05,
1487
- "loss": 0.0015,
1488
  "step": 5025
1489
  },
1490
  {
1491
  "epoch": 9.25,
1492
- "grad_norm": 0.06969017535448074,
1493
  "learning_rate": 7.634011090573013e-05,
1494
- "loss": 0.0017,
1495
  "step": 5050
1496
  },
1497
  {
1498
  "epoch": 9.29,
1499
- "grad_norm": 0.1530989110469818,
1500
  "learning_rate": 7.171903881700554e-05,
1501
- "loss": 0.0022,
1502
  "step": 5075
1503
  },
1504
  {
1505
  "epoch": 9.34,
1506
- "grad_norm": 0.1752089112997055,
1507
  "learning_rate": 6.709796672828096e-05,
1508
- "loss": 0.0018,
1509
  "step": 5100
1510
  },
1511
  {
1512
  "epoch": 9.39,
1513
- "grad_norm": 0.023138588294386864,
1514
  "learning_rate": 6.247689463955638e-05,
1515
- "loss": 0.0014,
1516
  "step": 5125
1517
  },
1518
  {
1519
  "epoch": 9.43,
1520
- "grad_norm": 0.005098209250718355,
1521
  "learning_rate": 5.785582255083179e-05,
1522
- "loss": 0.0012,
1523
  "step": 5150
1524
  },
1525
  {
1526
  "epoch": 9.48,
1527
- "grad_norm": 0.007919879630208015,
1528
  "learning_rate": 5.323475046210721e-05,
1529
- "loss": 0.0023,
1530
  "step": 5175
1531
  },
1532
  {
1533
  "epoch": 9.52,
1534
- "grad_norm": 0.0019298276165500283,
1535
  "learning_rate": 4.8613678373382625e-05,
1536
- "loss": 0.0015,
1537
  "step": 5200
1538
  },
1539
  {
1540
  "epoch": 9.57,
1541
- "grad_norm": 0.0023822402581572533,
1542
  "learning_rate": 4.3992606284658045e-05,
1543
- "loss": 0.0011,
1544
  "step": 5225
1545
  },
1546
  {
1547
  "epoch": 9.62,
1548
- "grad_norm": 0.03612617775797844,
1549
  "learning_rate": 3.937153419593346e-05,
1550
  "loss": 0.001,
1551
  "step": 5250
1552
  },
1553
  {
1554
  "epoch": 9.66,
1555
- "grad_norm": 0.03683371841907501,
1556
  "learning_rate": 3.4750462107208874e-05,
1557
- "loss": 0.0016,
1558
  "step": 5275
1559
  },
1560
  {
1561
  "epoch": 9.71,
1562
- "grad_norm": 0.04906224459409714,
1563
  "learning_rate": 3.012939001848429e-05,
1564
- "loss": 0.0022,
1565
  "step": 5300
1566
  },
1567
  {
1568
  "epoch": 9.75,
1569
- "grad_norm": 0.08069704473018646,
1570
  "learning_rate": 2.5508317929759705e-05,
1571
- "loss": 0.0015,
1572
  "step": 5325
1573
  },
1574
  {
1575
  "epoch": 9.8,
1576
- "grad_norm": 0.13353778421878815,
1577
  "learning_rate": 2.088724584103512e-05,
1578
- "loss": 0.0013,
1579
  "step": 5350
1580
  },
1581
  {
1582
  "epoch": 9.84,
1583
- "grad_norm": 0.10152421146631241,
1584
  "learning_rate": 1.6266173752310537e-05,
1585
- "loss": 0.0015,
1586
  "step": 5375
1587
  },
1588
  {
1589
  "epoch": 9.89,
1590
- "grad_norm": 0.010886043310165405,
1591
  "learning_rate": 1.1645101663585952e-05,
1592
- "loss": 0.0017,
1593
  "step": 5400
1594
  },
1595
  {
1596
  "epoch": 9.94,
1597
- "grad_norm": 0.009057571180164814,
1598
  "learning_rate": 7.024029574861368e-06,
1599
- "loss": 0.0016,
1600
  "step": 5425
1601
  },
1602
  {
1603
  "epoch": 9.98,
1604
- "grad_norm": 0.020738158375024796,
1605
  "learning_rate": 2.402957486136784e-06,
1606
- "loss": 0.0013,
1607
  "step": 5450
1608
  },
1609
  {
1610
  "epoch": 10.0,
1611
- "eval_loss": 0.21373072266578674,
1612
- "eval_runtime": 177.6097,
1613
- "eval_samples_per_second": 4.577,
1614
- "eval_steps_per_second": 0.766,
1615
  "step": 5460
1616
  },
1617
  {
1618
  "epoch": 10.0,
1619
  "step": 5460,
1620
  "total_flos": 9.7789895073792e+18,
1621
- "train_loss": 0.05130936206342318,
1622
- "train_runtime": 10728.949,
1623
- "train_samples_per_second": 3.051,
1624
  "train_steps_per_second": 0.509
1625
  }
1626
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05,
13
+ "grad_norm": 0.4075450599193573,
14
+ "learning_rate": 0.0005,
15
+ "loss": 0.0571,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.09,
20
+ "grad_norm": 0.5959680676460266,
21
+ "learning_rate": 0.001,
22
+ "loss": 0.0853,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.14,
27
+ "grad_norm": 1.1371592283248901,
28
+ "learning_rate": 0.0009955637707948243,
29
+ "loss": 0.0983,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.18,
34
+ "grad_norm": 1.7857468128204346,
35
+ "learning_rate": 0.0009911275415896488,
36
+ "loss": 0.0689,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.23,
41
+ "grad_norm": 1.947630763053894,
42
+ "learning_rate": 0.0009865064695009243,
43
+ "loss": 0.0834,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.27,
48
+ "grad_norm": 0.750166654586792,
49
+ "learning_rate": 0.0009818853974121996,
50
+ "loss": 0.0655,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.32,
55
+ "grad_norm": 3.860727548599243,
56
+ "learning_rate": 0.000977264325323475,
57
+ "loss": 0.0554,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.37,
62
+ "grad_norm": 0.3753944933414459,
63
+ "learning_rate": 0.0009726432532347505,
64
+ "loss": 0.0785,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.41,
69
+ "grad_norm": 0.4372863471508026,
70
+ "learning_rate": 0.0009680221811460259,
71
+ "loss": 0.063,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.46,
76
+ "grad_norm": 0.31646546721458435,
77
+ "learning_rate": 0.0009634011090573013,
78
+ "loss": 0.0487,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.5,
83
+ "grad_norm": 0.8565055131912231,
84
+ "learning_rate": 0.0009587800369685768,
85
+ "loss": 0.0586,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.55,
90
+ "grad_norm": 0.5980587601661682,
91
+ "learning_rate": 0.0009541589648798521,
92
+ "loss": 0.0683,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.6,
97
+ "grad_norm": 0.2764056324958801,
98
+ "learning_rate": 0.0009495378927911276,
99
+ "loss": 0.0697,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.64,
104
+ "grad_norm": 1.5360766649246216,
105
+ "learning_rate": 0.0009449168207024029,
106
+ "loss": 0.0639,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.69,
111
+ "grad_norm": 0.38272273540496826,
112
+ "learning_rate": 0.0009402957486136784,
113
+ "loss": 0.0816,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.73,
118
+ "grad_norm": 0.362632155418396,
119
+ "learning_rate": 0.0009356746765249538,
120
+ "loss": 0.0734,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.78,
125
+ "grad_norm": 1.5462536811828613,
126
+ "learning_rate": 0.0009310536044362292,
127
+ "loss": 0.2255,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.82,
132
+ "grad_norm": 0.6713312268257141,
133
+ "learning_rate": 0.0009264325323475047,
134
+ "loss": 0.1296,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.87,
139
+ "grad_norm": 0.8870647549629211,
140
  "learning_rate": 0.0009219963031423291,
141
+ "loss": 0.0904,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.92,
146
+ "grad_norm": 0.4918694496154785,
147
  "learning_rate": 0.0009173752310536044,
148
+ "loss": 0.0689,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.96,
153
+ "grad_norm": 0.3674885630607605,
154
  "learning_rate": 0.0009127541589648799,
155
+ "loss": 0.0726,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 1.0,
160
+ "eval_loss": 0.2210056632757187,
161
+ "eval_runtime": 173.4286,
162
+ "eval_samples_per_second": 4.688,
163
+ "eval_steps_per_second": 0.784,
164
  "step": 546
165
  },
166
  {
167
  "epoch": 1.01,
168
+ "grad_norm": 0.8691617250442505,
169
  "learning_rate": 0.0009081330868761552,
170
+ "loss": 0.0421,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.05,
175
+ "grad_norm": 0.8861550688743591,
176
  "learning_rate": 0.0009035120147874307,
177
+ "loss": 0.058,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 1.1,
182
+ "grad_norm": 0.10453300923109055,
183
  "learning_rate": 0.000898890942698706,
184
+ "loss": 0.0391,
185
  "step": 600
186
  },
187
  {
188
  "epoch": 1.14,
189
+ "grad_norm": 0.11498710513114929,
190
  "learning_rate": 0.0008942698706099815,
191
+ "loss": 0.0413,
192
  "step": 625
193
  },
194
  {
195
  "epoch": 1.19,
196
+ "grad_norm": 0.3542003333568573,
197
  "learning_rate": 0.0008896487985212569,
198
+ "loss": 0.0764,
199
  "step": 650
200
  },
201
  {
202
  "epoch": 1.24,
203
+ "grad_norm": 0.5665566921234131,
204
  "learning_rate": 0.0008850277264325323,
205
+ "loss": 0.0464,
206
  "step": 675
207
  },
208
  {
209
  "epoch": 1.28,
210
+ "grad_norm": 0.37183037400245667,
211
  "learning_rate": 0.0008804066543438077,
212
+ "loss": 0.0557,
213
  "step": 700
214
  },
215
  {
216
  "epoch": 1.33,
217
+ "grad_norm": 2.33689546585083,
218
  "learning_rate": 0.0008757855822550833,
219
+ "loss": 0.0723,
220
  "step": 725
221
  },
222
  {
223
  "epoch": 1.37,
224
+ "grad_norm": 0.47746214270591736,
225
  "learning_rate": 0.0008711645101663586,
226
+ "loss": 0.0613,
227
  "step": 750
228
  },
229
  {
230
  "epoch": 1.42,
231
+ "grad_norm": 0.5573539137840271,
232
  "learning_rate": 0.0008665434380776341,
233
+ "loss": 0.0581,
234
  "step": 775
235
  },
236
  {
237
  "epoch": 1.47,
238
+ "grad_norm": 0.5228638648986816,
239
  "learning_rate": 0.0008619223659889095,
240
+ "loss": 0.0438,
241
  "step": 800
242
  },
243
  {
244
  "epoch": 1.51,
245
+ "grad_norm": 1.0585103034973145,
246
  "learning_rate": 0.0008573012939001849,
247
+ "loss": 0.0357,
248
  "step": 825
249
  },
250
  {
251
  "epoch": 1.56,
252
+ "grad_norm": 0.13868175446987152,
253
  "learning_rate": 0.0008526802218114603,
254
+ "loss": 0.0374,
255
  "step": 850
256
  },
257
  {
258
  "epoch": 1.6,
259
+ "grad_norm": 0.43853959441185,
260
  "learning_rate": 0.0008480591497227357,
261
+ "loss": 0.0482,
262
  "step": 875
263
  },
264
  {
265
  "epoch": 1.65,
266
+ "grad_norm": 0.47208574414253235,
267
  "learning_rate": 0.0008434380776340112,
268
+ "loss": 0.0551,
269
  "step": 900
270
  },
271
  {
272
  "epoch": 1.69,
273
+ "grad_norm": 0.2631681561470032,
274
  "learning_rate": 0.0008388170055452865,
275
+ "loss": 0.0717,
276
  "step": 925
277
  },
278
  {
279
  "epoch": 1.74,
280
+ "grad_norm": 0.23163950443267822,
281
  "learning_rate": 0.000834195933456562,
282
+ "loss": 0.0415,
283
  "step": 950
284
  },
285
  {
286
  "epoch": 1.79,
287
+ "grad_norm": 0.45487725734710693,
288
  "learning_rate": 0.0008295748613678373,
289
+ "loss": 0.0392,
290
  "step": 975
291
  },
292
  {
293
  "epoch": 1.83,
294
+ "grad_norm": 0.40454888343811035,
295
  "learning_rate": 0.0008249537892791128,
296
+ "loss": 0.0342,
297
  "step": 1000
298
  },
299
  {
300
  "epoch": 1.88,
301
+ "grad_norm": 0.10719649493694305,
302
  "learning_rate": 0.0008203327171903881,
303
+ "loss": 0.0499,
304
  "step": 1025
305
  },
306
  {
307
  "epoch": 1.92,
308
+ "grad_norm": 0.5795795917510986,
309
  "learning_rate": 0.0008157116451016636,
310
+ "loss": 0.0553,
311
  "step": 1050
312
  },
313
  {
314
  "epoch": 1.97,
315
+ "grad_norm": 0.2069532871246338,
316
  "learning_rate": 0.000811090573012939,
317
+ "loss": 0.0419,
318
  "step": 1075
319
  },
320
  {
321
  "epoch": 2.0,
322
+ "eval_loss": 0.21386997401714325,
323
+ "eval_runtime": 174.0952,
324
+ "eval_samples_per_second": 4.67,
325
  "eval_steps_per_second": 0.781,
326
  "step": 1092
327
  },
328
  {
329
  "epoch": 2.01,
330
+ "grad_norm": 0.170976459980011,
331
  "learning_rate": 0.0008064695009242144,
332
+ "loss": 0.0373,
333
  "step": 1100
334
  },
335
  {
336
  "epoch": 2.06,
337
+ "grad_norm": 0.10965342819690704,
338
  "learning_rate": 0.0008018484288354898,
339
+ "loss": 0.0286,
340
  "step": 1125
341
  },
342
  {
343
  "epoch": 2.11,
344
+ "grad_norm": 0.02158469147980213,
345
  "learning_rate": 0.0007972273567467652,
346
+ "loss": 0.05,
347
  "step": 1150
348
  },
349
  {
350
  "epoch": 2.15,
351
+ "grad_norm": 1.0225136280059814,
352
  "learning_rate": 0.0007926062846580406,
353
+ "loss": 0.0423,
354
  "step": 1175
355
  },
356
  {
357
  "epoch": 2.2,
358
+ "grad_norm": 0.09866318106651306,
359
  "learning_rate": 0.0007879852125693162,
360
+ "loss": 0.0376,
361
  "step": 1200
362
  },
363
  {
364
  "epoch": 2.24,
365
+ "grad_norm": 0.23199380934238434,
366
  "learning_rate": 0.0007833641404805915,
367
+ "loss": 0.0293,
368
  "step": 1225
369
  },
370
  {
371
  "epoch": 2.29,
372
+ "grad_norm": 0.05752483755350113,
373
  "learning_rate": 0.000778743068391867,
374
+ "loss": 0.0381,
375
  "step": 1250
376
  },
377
  {
378
  "epoch": 2.34,
379
+ "grad_norm": 0.13506996631622314,
380
  "learning_rate": 0.0007741219963031424,
381
+ "loss": 0.0394,
382
  "step": 1275
383
  },
384
  {
385
  "epoch": 2.38,
386
+ "grad_norm": 1.1013309955596924,
387
  "learning_rate": 0.0007695009242144178,
388
+ "loss": 0.0394,
389
  "step": 1300
390
  },
391
  {
392
  "epoch": 2.43,
393
+ "grad_norm": 0.43956679105758667,
394
  "learning_rate": 0.0007648798521256932,
395
+ "loss": 0.0459,
396
  "step": 1325
397
  },
398
  {
399
  "epoch": 2.47,
400
+ "grad_norm": 0.39061295986175537,
401
  "learning_rate": 0.0007602587800369686,
402
+ "loss": 0.037,
403
  "step": 1350
404
  },
405
  {
406
  "epoch": 2.52,
407
+ "grad_norm": 0.2657981216907501,
408
  "learning_rate": 0.0007556377079482441,
409
+ "loss": 0.0327,
410
  "step": 1375
411
  },
412
  {
413
  "epoch": 2.56,
414
+ "grad_norm": 0.4138255715370178,
415
  "learning_rate": 0.0007510166358595194,
416
+ "loss": 0.0307,
417
  "step": 1400
418
  },
419
  {
420
  "epoch": 2.61,
421
+ "grad_norm": 0.32367995381355286,
422
  "learning_rate": 0.0007463955637707949,
423
+ "loss": 0.0335,
424
  "step": 1425
425
  },
426
  {
427
  "epoch": 2.66,
428
+ "grad_norm": 0.5355994701385498,
429
  "learning_rate": 0.0007417744916820702,
430
+ "loss": 0.0262,
431
  "step": 1450
432
  },
433
  {
434
  "epoch": 2.7,
435
+ "grad_norm": 3.182929039001465,
436
  "learning_rate": 0.0007371534195933457,
437
+ "loss": 0.0302,
438
  "step": 1475
439
  },
440
  {
441
  "epoch": 2.75,
442
+ "grad_norm": 0.9068237543106079,
443
  "learning_rate": 0.000732532347504621,
444
+ "loss": 0.0318,
445
  "step": 1500
446
  },
447
  {
448
  "epoch": 2.79,
449
+ "grad_norm": 0.804796576499939,
450
  "learning_rate": 0.0007279112754158965,
451
+ "loss": 0.0462,
452
  "step": 1525
453
  },
454
  {
455
  "epoch": 2.84,
456
+ "grad_norm": 0.40627536177635193,
457
  "learning_rate": 0.0007232902033271719,
458
+ "loss": 0.0226,
459
  "step": 1550
460
  },
461
  {
462
  "epoch": 2.88,
463
+ "grad_norm": 0.2852160632610321,
464
  "learning_rate": 0.0007186691312384473,
465
+ "loss": 0.0327,
466
  "step": 1575
467
  },
468
  {
469
  "epoch": 2.93,
470
+ "grad_norm": 0.5738157629966736,
471
  "learning_rate": 0.0007140480591497227,
472
+ "loss": 0.0317,
473
  "step": 1600
474
  },
475
  {
476
  "epoch": 2.98,
477
+ "grad_norm": 0.2782443165779114,
478
  "learning_rate": 0.0007094269870609981,
479
+ "loss": 0.0322,
480
  "step": 1625
481
  },
482
  {
483
  "epoch": 3.0,
484
+ "eval_loss": 0.1934811770915985,
485
+ "eval_runtime": 175.4238,
486
+ "eval_samples_per_second": 4.634,
487
+ "eval_steps_per_second": 0.775,
488
  "step": 1638
489
  },
490
  {
491
  "epoch": 3.02,
492
+ "grad_norm": 0.027267010882496834,
493
  "learning_rate": 0.0007048059149722735,
494
+ "loss": 0.0248,
495
  "step": 1650
496
  },
497
  {
498
  "epoch": 3.07,
499
+ "grad_norm": 0.23983055353164673,
500
  "learning_rate": 0.000700184842883549,
501
+ "loss": 0.0252,
502
  "step": 1675
503
  },
504
  {
505
  "epoch": 3.11,
506
+ "grad_norm": 0.03389419987797737,
507
  "learning_rate": 0.0006955637707948245,
508
+ "loss": 0.0216,
509
  "step": 1700
510
  },
511
  {
512
  "epoch": 3.16,
513
+ "grad_norm": 2.448323965072632,
514
  "learning_rate": 0.0006909426987060999,
515
+ "loss": 0.0402,
516
  "step": 1725
517
  },
518
  {
519
  "epoch": 3.21,
520
+ "grad_norm": 0.5986452102661133,
521
  "learning_rate": 0.0006863216266173753,
522
+ "loss": 0.0349,
523
  "step": 1750
524
  },
525
  {
526
  "epoch": 3.25,
527
+ "grad_norm": 0.046656377613544464,
528
  "learning_rate": 0.0006817005545286507,
529
+ "loss": 0.0179,
530
  "step": 1775
531
  },
532
  {
533
  "epoch": 3.3,
534
+ "grad_norm": 0.2432301789522171,
535
  "learning_rate": 0.0006770794824399261,
536
+ "loss": 0.0261,
537
  "step": 1800
538
  },
539
  {
540
  "epoch": 3.34,
541
+ "grad_norm": 0.4144662022590637,
542
  "learning_rate": 0.0006724584103512015,
543
+ "loss": 0.0256,
544
  "step": 1825
545
  },
546
  {
547
  "epoch": 3.39,
548
+ "grad_norm": 0.27171510457992554,
549
  "learning_rate": 0.000667837338262477,
550
+ "loss": 0.0322,
551
  "step": 1850
552
  },
553
  {
554
  "epoch": 3.43,
555
+ "grad_norm": 0.1022319346666336,
556
  "learning_rate": 0.0006632162661737523,
557
+ "loss": 0.0293,
558
  "step": 1875
559
  },
560
  {
561
  "epoch": 3.48,
562
+ "grad_norm": 0.16478094458580017,
563
  "learning_rate": 0.0006585951940850278,
564
+ "loss": 0.0178,
565
  "step": 1900
566
  },
567
  {
568
  "epoch": 3.53,
569
+ "grad_norm": 0.1675555408000946,
570
  "learning_rate": 0.0006539741219963031,
571
+ "loss": 0.0174,
572
  "step": 1925
573
  },
574
  {
575
  "epoch": 3.57,
576
+ "grad_norm": 0.39023590087890625,
577
  "learning_rate": 0.0006493530499075786,
578
+ "loss": 0.0149,
579
  "step": 1950
580
  },
581
  {
582
  "epoch": 3.62,
583
+ "grad_norm": 0.025721503421664238,
584
  "learning_rate": 0.0006447319778188539,
585
+ "loss": 0.0231,
586
  "step": 1975
587
  },
588
  {
589
  "epoch": 3.66,
590
+ "grad_norm": 0.3088337182998657,
591
  "learning_rate": 0.0006401109057301294,
592
+ "loss": 0.0283,
593
  "step": 2000
594
  },
595
  {
596
  "epoch": 3.71,
597
+ "grad_norm": 0.06729228049516678,
598
  "learning_rate": 0.0006354898336414048,
599
+ "loss": 0.0204,
600
  "step": 2025
601
  },
602
  {
603
  "epoch": 3.75,
604
+ "grad_norm": 0.18552298843860626,
605
  "learning_rate": 0.0006308687615526802,
606
+ "loss": 0.0274,
607
  "step": 2050
608
  },
609
  {
610
  "epoch": 3.8,
611
+ "grad_norm": 0.08045148104429245,
612
  "learning_rate": 0.0006262476894639556,
613
+ "loss": 0.0218,
614
  "step": 2075
615
  },
616
  {
617
  "epoch": 3.85,
618
+ "grad_norm": 0.6443850994110107,
619
  "learning_rate": 0.000621626617375231,
620
+ "loss": 0.0207,
621
  "step": 2100
622
  },
623
  {
624
  "epoch": 3.89,
625
+ "grad_norm": 0.6463542580604553,
626
  "learning_rate": 0.0006170055452865064,
627
+ "loss": 0.0322,
628
  "step": 2125
629
  },
630
  {
631
  "epoch": 3.94,
632
+ "grad_norm": 0.2903934419155121,
633
  "learning_rate": 0.000612384473197782,
634
+ "loss": 0.031,
635
  "step": 2150
636
  },
637
  {
638
  "epoch": 3.98,
639
+ "grad_norm": 0.1343035101890564,
640
  "learning_rate": 0.0006077634011090574,
641
+ "loss": 0.0175,
642
  "step": 2175
643
  },
644
  {
645
  "epoch": 4.0,
646
+ "eval_loss": 0.1896440088748932,
647
+ "eval_runtime": 176.3159,
648
+ "eval_samples_per_second": 4.611,
649
+ "eval_steps_per_second": 0.771,
650
  "step": 2184
651
  },
652
  {
653
  "epoch": 4.03,
654
+ "grad_norm": 0.10466930270195007,
655
  "learning_rate": 0.0006031423290203328,
656
+ "loss": 0.0215,
657
  "step": 2200
658
  },
659
  {
660
  "epoch": 4.08,
661
+ "grad_norm": 0.35988566279411316,
662
  "learning_rate": 0.0005985212569316082,
663
+ "loss": 0.0193,
664
  "step": 2225
665
  },
666
  {
667
  "epoch": 4.12,
668
+ "grad_norm": 0.16410423815250397,
669
  "learning_rate": 0.0005939001848428836,
670
+ "loss": 0.0143,
671
  "step": 2250
672
  },
673
  {
674
  "epoch": 4.17,
675
+ "grad_norm": 0.2650511562824249,
676
  "learning_rate": 0.000589279112754159,
677
+ "loss": 0.0268,
678
  "step": 2275
679
  },
680
  {
681
  "epoch": 4.21,
682
+ "grad_norm": 0.2793768048286438,
683
  "learning_rate": 0.0005846580406654344,
684
+ "loss": 0.0159,
685
  "step": 2300
686
  },
687
  {
688
  "epoch": 4.26,
689
+ "grad_norm": 2.7625114917755127,
690
  "learning_rate": 0.0005800369685767099,
691
+ "loss": 0.0226,
692
  "step": 2325
693
  },
694
  {
695
  "epoch": 4.3,
696
+ "grad_norm": 0.45461520552635193,
697
  "learning_rate": 0.0005754158964879852,
698
+ "loss": 0.0137,
699
  "step": 2350
700
  },
701
  {
702
  "epoch": 4.35,
703
+ "grad_norm": 0.28511613607406616,
704
  "learning_rate": 0.0005707948243992607,
705
+ "loss": 0.0184,
706
  "step": 2375
707
  },
708
  {
709
  "epoch": 4.4,
710
+ "grad_norm": 0.5333670377731323,
711
  "learning_rate": 0.000566173752310536,
712
+ "loss": 0.0186,
713
  "step": 2400
714
  },
715
  {
716
  "epoch": 4.44,
717
+ "grad_norm": 0.41222718358039856,
718
  "learning_rate": 0.0005615526802218115,
719
+ "loss": 0.011,
720
  "step": 2425
721
  },
722
  {
723
  "epoch": 4.49,
724
+ "grad_norm": 0.27146583795547485,
725
  "learning_rate": 0.0005569316081330868,
726
+ "loss": 0.0165,
727
  "step": 2450
728
  },
729
  {
730
  "epoch": 4.53,
731
+ "grad_norm": 0.29553595185279846,
732
  "learning_rate": 0.0005523105360443623,
733
+ "loss": 0.0138,
734
  "step": 2475
735
  },
736
  {
737
  "epoch": 4.58,
738
+ "grad_norm": 0.13532432913780212,
739
  "learning_rate": 0.0005476894639556377,
740
+ "loss": 0.0167,
741
  "step": 2500
742
  },
743
  {
744
  "epoch": 4.62,
745
+ "grad_norm": 0.10051342844963074,
746
  "learning_rate": 0.0005430683918669131,
747
+ "loss": 0.0152,
748
  "step": 2525
749
  },
750
  {
751
  "epoch": 4.67,
752
+ "grad_norm": 0.023720353841781616,
753
  "learning_rate": 0.0005384473197781885,
754
+ "loss": 0.0155,
755
  "step": 2550
756
  },
757
  {
758
  "epoch": 4.72,
759
+ "grad_norm": 0.2686695456504822,
760
  "learning_rate": 0.0005338262476894639,
761
+ "loss": 0.0125,
762
  "step": 2575
763
  },
764
  {
765
  "epoch": 4.76,
766
+ "grad_norm": 0.33857473731040955,
767
  "learning_rate": 0.0005292051756007393,
768
+ "loss": 0.0332,
769
  "step": 2600
770
  },
771
  {
772
  "epoch": 4.81,
773
+ "grad_norm": 0.0131806880235672,
774
  "learning_rate": 0.0005245841035120147,
775
+ "loss": 0.014,
776
  "step": 2625
777
  },
778
  {
779
  "epoch": 4.85,
780
+ "grad_norm": 0.4342842698097229,
781
  "learning_rate": 0.0005199630314232903,
782
+ "loss": 0.016,
783
  "step": 2650
784
  },
785
  {
786
  "epoch": 4.9,
787
+ "grad_norm": 0.005540889222174883,
788
  "learning_rate": 0.0005153419593345657,
789
+ "loss": 0.0134,
790
  "step": 2675
791
  },
792
  {
793
  "epoch": 4.95,
794
+ "grad_norm": 0.004122666083276272,
795
  "learning_rate": 0.0005107208872458411,
796
+ "loss": 0.0223,
797
  "step": 2700
798
  },
799
  {
800
  "epoch": 4.99,
801
+ "grad_norm": 0.14384405314922333,
802
  "learning_rate": 0.0005060998151571165,
803
+ "loss": 0.0266,
804
  "step": 2725
805
  },
806
  {
807
  "epoch": 5.0,
808
+ "eval_loss": 0.19267761707305908,
809
+ "eval_runtime": 179.8301,
810
+ "eval_samples_per_second": 4.521,
811
+ "eval_steps_per_second": 0.756,
812
  "step": 2730
813
  },
814
  {
815
  "epoch": 5.04,
816
+ "grad_norm": 0.3819844126701355,
817
  "learning_rate": 0.0005014787430683919,
818
+ "loss": 0.0166,
819
  "step": 2750
820
  },
821
  {
822
  "epoch": 5.08,
823
+ "grad_norm": 0.27138832211494446,
824
  "learning_rate": 0.0004968576709796673,
825
+ "loss": 0.0056,
826
  "step": 2775
827
  },
828
  {
829
  "epoch": 5.13,
830
+ "grad_norm": 0.36033156514167786,
831
  "learning_rate": 0.0004922365988909427,
832
+ "loss": 0.0084,
833
  "step": 2800
834
  },
835
  {
836
  "epoch": 5.17,
837
+ "grad_norm": 0.3422500789165497,
838
  "learning_rate": 0.0004876155268022181,
839
+ "loss": 0.0089,
840
  "step": 2825
841
  },
842
  {
843
  "epoch": 5.22,
844
+ "grad_norm": 0.12272176891565323,
845
  "learning_rate": 0.0004829944547134935,
846
+ "loss": 0.0079,
847
  "step": 2850
848
  },
849
  {
850
  "epoch": 5.27,
851
+ "grad_norm": 0.03446231782436371,
852
  "learning_rate": 0.000478373382624769,
853
+ "loss": 0.011,
854
  "step": 2875
855
  },
856
  {
857
  "epoch": 5.31,
858
+ "grad_norm": 0.2042599320411682,
859
  "learning_rate": 0.0004737523105360444,
860
+ "loss": 0.0091,
861
  "step": 2900
862
  },
863
  {
864
  "epoch": 5.36,
865
+ "grad_norm": 0.18888217210769653,
866
  "learning_rate": 0.0004691312384473198,
867
+ "loss": 0.0146,
868
  "step": 2925
869
  },
870
  {
871
  "epoch": 5.4,
872
+ "grad_norm": 4.216693878173828,
873
  "learning_rate": 0.0004645101663585952,
874
+ "loss": 0.0162,
875
  "step": 2950
876
  },
877
  {
878
  "epoch": 5.45,
879
+ "grad_norm": 0.20249082148075104,
880
  "learning_rate": 0.0004598890942698706,
881
+ "loss": 0.0193,
882
  "step": 2975
883
  },
884
  {
885
  "epoch": 5.49,
886
+ "grad_norm": 0.37886273860931396,
887
  "learning_rate": 0.00045526802218114607,
888
+ "loss": 0.0163,
889
  "step": 3000
890
  },
891
  {
892
  "epoch": 5.54,
893
+ "grad_norm": 0.24141408503055573,
894
  "learning_rate": 0.0004506469500924215,
895
+ "loss": 0.0147,
896
  "step": 3025
897
  },
898
  {
899
  "epoch": 5.59,
900
+ "grad_norm": 0.23406554758548737,
901
  "learning_rate": 0.0004460258780036969,
902
+ "loss": 0.0145,
903
  "step": 3050
904
  },
905
  {
906
  "epoch": 5.63,
907
+ "grad_norm": 0.355023056268692,
908
  "learning_rate": 0.0004414048059149723,
909
+ "loss": 0.0144,
910
  "step": 3075
911
  },
912
  {
913
  "epoch": 5.68,
914
+ "grad_norm": 0.18628603219985962,
915
  "learning_rate": 0.0004367837338262477,
916
+ "loss": 0.0105,
917
  "step": 3100
918
  },
919
  {
920
  "epoch": 5.72,
921
+ "grad_norm": 0.328931987285614,
922
  "learning_rate": 0.0004321626617375231,
923
+ "loss": 0.0107,
924
  "step": 3125
925
  },
926
  {
927
  "epoch": 5.77,
928
+ "grad_norm": 0.004133810754865408,
929
  "learning_rate": 0.0004275415896487985,
930
+ "loss": 0.01,
931
  "step": 3150
932
  },
933
  {
934
  "epoch": 5.82,
935
+ "grad_norm": 0.036314379423856735,
936
  "learning_rate": 0.0004229205175600739,
937
  "loss": 0.0127,
938
  "step": 3175
939
  },
940
  {
941
  "epoch": 5.86,
942
+ "grad_norm": 0.27704620361328125,
943
  "learning_rate": 0.00041829944547134933,
944
+ "loss": 0.0111,
945
  "step": 3200
946
  },
947
  {
948
  "epoch": 5.91,
949
+ "grad_norm": 0.5109962821006775,
950
  "learning_rate": 0.00041367837338262474,
951
+ "loss": 0.0157,
952
  "step": 3225
953
  },
954
  {
955
  "epoch": 5.95,
956
+ "grad_norm": 0.09048620611429214,
957
  "learning_rate": 0.0004090573012939002,
958
+ "loss": 0.0184,
959
  "step": 3250
960
  },
961
  {
962
  "epoch": 6.0,
963
+ "grad_norm": 0.010707640089094639,
964
  "learning_rate": 0.0004044362292051756,
965
+ "loss": 0.0178,
966
  "step": 3275
967
  },
968
  {
969
  "epoch": 6.0,
970
+ "eval_loss": 0.20126062631607056,
971
+ "eval_runtime": 179.6758,
972
+ "eval_samples_per_second": 4.525,
973
+ "eval_steps_per_second": 0.757,
974
  "step": 3276
975
  },
976
  {
977
  "epoch": 6.04,
978
+ "grad_norm": 0.013095181435346603,
979
  "learning_rate": 0.000399815157116451,
980
+ "loss": 0.0106,
981
  "step": 3300
982
  },
983
  {
984
  "epoch": 6.09,
985
+ "grad_norm": 0.15969859063625336,
986
  "learning_rate": 0.0003951940850277264,
987
+ "loss": 0.0098,
988
  "step": 3325
989
  },
990
  {
991
  "epoch": 6.14,
992
+ "grad_norm": 0.09395785629749298,
993
  "learning_rate": 0.0003905730129390019,
994
+ "loss": 0.012,
995
  "step": 3350
996
  },
997
  {
998
  "epoch": 6.18,
999
+ "grad_norm": 0.010071701370179653,
1000
  "learning_rate": 0.0003859519408502773,
1001
+ "loss": 0.0071,
1002
  "step": 3375
1003
  },
1004
  {
1005
  "epoch": 6.23,
1006
+ "grad_norm": 0.005003762431442738,
1007
  "learning_rate": 0.0003813308687615527,
1008
+ "loss": 0.0093,
1009
  "step": 3400
1010
  },
1011
  {
1012
  "epoch": 6.27,
1013
+ "grad_norm": 0.01751079224050045,
1014
  "learning_rate": 0.0003767097966728281,
1015
+ "loss": 0.0093,
1016
  "step": 3425
1017
  },
1018
  {
1019
  "epoch": 6.32,
1020
+ "grad_norm": 0.048858534544706345,
1021
  "learning_rate": 0.0003720887245841035,
1022
+ "loss": 0.0092,
1023
  "step": 3450
1024
  },
1025
  {
1026
  "epoch": 6.36,
1027
+ "grad_norm": 0.05492233484983444,
1028
  "learning_rate": 0.0003674676524953789,
1029
+ "loss": 0.0137,
1030
  "step": 3475
1031
  },
1032
  {
1033
  "epoch": 6.41,
1034
+ "grad_norm": 0.011647823266685009,
1035
  "learning_rate": 0.0003628465804066544,
1036
+ "loss": 0.008,
1037
  "step": 3500
1038
  },
1039
  {
1040
  "epoch": 6.46,
1041
+ "grad_norm": 0.02023889683187008,
1042
  "learning_rate": 0.0003582255083179298,
1043
+ "loss": 0.0064,
1044
  "step": 3525
1045
  },
1046
  {
1047
  "epoch": 6.5,
1048
+ "grad_norm": 0.1093795895576477,
1049
  "learning_rate": 0.0003536044362292052,
1050
+ "loss": 0.0087,
1051
  "step": 3550
1052
  },
1053
  {
1054
  "epoch": 6.55,
1055
+ "grad_norm": 0.33639025688171387,
1056
  "learning_rate": 0.0003489833641404806,
1057
+ "loss": 0.0127,
1058
  "step": 3575
1059
  },
1060
  {
1061
  "epoch": 6.59,
1062
+ "grad_norm": 0.08823797106742859,
1063
  "learning_rate": 0.000344362292051756,
1064
+ "loss": 0.0112,
1065
  "step": 3600
1066
  },
1067
  {
1068
  "epoch": 6.64,
1069
+ "grad_norm": 0.052434779703617096,
1070
  "learning_rate": 0.0003397412199630314,
1071
+ "loss": 0.0116,
1072
  "step": 3625
1073
  },
1074
  {
1075
  "epoch": 6.68,
1076
+ "grad_norm": 0.1535090208053589,
1077
  "learning_rate": 0.0003351201478743068,
1078
+ "loss": 0.011,
1079
  "step": 3650
1080
  },
1081
  {
1082
  "epoch": 6.73,
1083
+ "grad_norm": 0.2711283564567566,
1084
  "learning_rate": 0.00033049907578558223,
1085
+ "loss": 0.007,
1086
  "step": 3675
1087
  },
1088
  {
1089
  "epoch": 6.78,
1090
+ "grad_norm": 0.006919647566974163,
1091
  "learning_rate": 0.00032587800369685764,
1092
+ "loss": 0.0098,
1093
  "step": 3700
1094
  },
1095
  {
1096
  "epoch": 6.82,
1097
+ "grad_norm": 0.03872460126876831,
1098
  "learning_rate": 0.0003212569316081331,
1099
+ "loss": 0.0092,
1100
  "step": 3725
1101
  },
1102
  {
1103
  "epoch": 6.87,
1104
+ "grad_norm": 0.0396348237991333,
1105
  "learning_rate": 0.0003166358595194085,
1106
+ "loss": 0.0126,
1107
  "step": 3750
1108
  },
1109
  {
1110
  "epoch": 6.91,
1111
+ "grad_norm": 0.008865280076861382,
1112
  "learning_rate": 0.0003120147874306839,
1113
+ "loss": 0.0097,
1114
  "step": 3775
1115
  },
1116
  {
1117
  "epoch": 6.96,
1118
+ "grad_norm": 0.2857593894004822,
1119
  "learning_rate": 0.0003073937153419594,
1120
+ "loss": 0.0081,
1121
  "step": 3800
1122
  },
1123
  {
1124
  "epoch": 7.0,
1125
+ "eval_loss": 0.19787272810935974,
1126
+ "eval_runtime": 178.7624,
1127
+ "eval_samples_per_second": 4.548,
1128
+ "eval_steps_per_second": 0.761,
1129
  "step": 3822
1130
  },
1131
  {
1132
  "epoch": 7.01,
1133
+ "grad_norm": 0.162245973944664,
1134
  "learning_rate": 0.0003027726432532348,
1135
+ "loss": 0.0091,
1136
  "step": 3825
1137
  },
1138
  {
1139
  "epoch": 7.05,
1140
+ "grad_norm": 0.08275479078292847,
1141
  "learning_rate": 0.0002981515711645102,
1142
+ "loss": 0.0064,
1143
  "step": 3850
1144
  },
1145
  {
1146
  "epoch": 7.1,
1147
+ "grad_norm": 0.04956310614943504,
1148
  "learning_rate": 0.0002935304990757856,
1149
+ "loss": 0.0033,
1150
  "step": 3875
1151
  },
1152
  {
1153
  "epoch": 7.14,
1154
+ "grad_norm": 0.2950696647167206,
1155
  "learning_rate": 0.000288909426987061,
1156
+ "loss": 0.0059,
1157
  "step": 3900
1158
  },
1159
  {
1160
  "epoch": 7.19,
1161
+ "grad_norm": 0.16667646169662476,
1162
  "learning_rate": 0.0002842883548983364,
1163
+ "loss": 0.0065,
1164
  "step": 3925
1165
  },
1166
  {
1167
  "epoch": 7.23,
1168
+ "grad_norm": 0.018928788602352142,
1169
  "learning_rate": 0.0002796672828096118,
1170
+ "loss": 0.0053,
1171
  "step": 3950
1172
  },
1173
  {
1174
  "epoch": 7.28,
1175
+ "grad_norm": 0.01914687640964985,
1176
  "learning_rate": 0.0002750462107208873,
1177
+ "loss": 0.0058,
1178
  "step": 3975
1179
  },
1180
  {
1181
  "epoch": 7.33,
1182
+ "grad_norm": 0.009565665386617184,
1183
  "learning_rate": 0.0002704251386321627,
1184
+ "loss": 0.0042,
1185
  "step": 4000
1186
  },
1187
  {
1188
  "epoch": 7.37,
1189
+ "grad_norm": 0.10117679834365845,
1190
  "learning_rate": 0.0002658040665434381,
1191
+ "loss": 0.0081,
1192
  "step": 4025
1193
  },
1194
  {
1195
  "epoch": 7.42,
1196
+ "grad_norm": 0.10825569927692413,
1197
  "learning_rate": 0.0002611829944547135,
1198
+ "loss": 0.0088,
1199
  "step": 4050
1200
  },
1201
  {
1202
  "epoch": 7.46,
1203
+ "grad_norm": 0.008808852173388004,
1204
  "learning_rate": 0.0002565619223659889,
1205
+ "loss": 0.0052,
1206
  "step": 4075
1207
  },
1208
  {
1209
  "epoch": 7.51,
1210
+ "grad_norm": 0.0186983160674572,
1211
  "learning_rate": 0.0002519408502772643,
1212
+ "loss": 0.0051,
1213
  "step": 4100
1214
  },
1215
  {
1216
  "epoch": 7.55,
1217
+ "grad_norm": 0.07354945689439774,
1218
  "learning_rate": 0.0002473197781885397,
1219
+ "loss": 0.0055,
1220
  "step": 4125
1221
  },
1222
  {
1223
  "epoch": 7.6,
1224
+ "grad_norm": 0.0021155644208192825,
1225
  "learning_rate": 0.0002426987060998152,
1226
+ "loss": 0.0044,
1227
  "step": 4150
1228
  },
1229
  {
1230
  "epoch": 7.65,
1231
+ "grad_norm": 0.08616074174642563,
1232
  "learning_rate": 0.0002380776340110906,
1233
+ "loss": 0.0037,
1234
  "step": 4175
1235
  },
1236
  {
1237
  "epoch": 7.69,
1238
+ "grad_norm": 0.009911403059959412,
1239
  "learning_rate": 0.000233456561922366,
1240
+ "loss": 0.0073,
1241
  "step": 4200
1242
  },
1243
  {
1244
  "epoch": 7.74,
1245
+ "grad_norm": 0.36762863397598267,
1246
  "learning_rate": 0.0002288354898336414,
1247
+ "loss": 0.004,
1248
  "step": 4225
1249
  },
1250
  {
1251
  "epoch": 7.78,
1252
+ "grad_norm": 0.0590713806450367,
1253
  "learning_rate": 0.00022421441774491682,
1254
+ "loss": 0.0034,
1255
  "step": 4250
1256
  },
1257
  {
1258
  "epoch": 7.83,
1259
+ "grad_norm": 0.0876949205994606,
1260
  "learning_rate": 0.00021959334565619225,
1261
+ "loss": 0.0061,
1262
  "step": 4275
1263
  },
1264
  {
1265
  "epoch": 7.88,
1266
+ "grad_norm": 0.2488565295934677,
1267
  "learning_rate": 0.00021497227356746766,
1268
+ "loss": 0.0047,
1269
  "step": 4300
1270
  },
1271
  {
1272
  "epoch": 7.92,
1273
+ "grad_norm": 0.16184526681900024,
1274
  "learning_rate": 0.00021035120147874306,
1275
+ "loss": 0.0064,
1276
  "step": 4325
1277
  },
1278
  {
1279
  "epoch": 7.97,
1280
+ "grad_norm": 0.025223182514309883,
1281
  "learning_rate": 0.00020573012939001847,
1282
+ "loss": 0.0081,
1283
  "step": 4350
1284
  },
1285
  {
1286
  "epoch": 8.0,
1287
+ "eval_loss": 0.21132159233093262,
1288
+ "eval_runtime": 178.6799,
1289
+ "eval_samples_per_second": 4.55,
1290
+ "eval_steps_per_second": 0.761,
1291
  "step": 4368
1292
  },
1293
  {
1294
  "epoch": 8.01,
1295
+ "grad_norm": 0.04916756972670555,
1296
  "learning_rate": 0.00020110905730129388,
1297
+ "loss": 0.0049,
1298
  "step": 4375
1299
  },
1300
  {
1301
  "epoch": 8.06,
1302
+ "grad_norm": 0.010703769512474537,
1303
  "learning_rate": 0.00019648798521256934,
1304
+ "loss": 0.0034,
1305
  "step": 4400
1306
  },
1307
  {
1308
  "epoch": 8.1,
1309
+ "grad_norm": 0.004313566256314516,
1310
  "learning_rate": 0.00019186691312384475,
1311
+ "loss": 0.003,
1312
  "step": 4425
1313
  },
1314
  {
1315
  "epoch": 8.15,
1316
+ "grad_norm": 0.18936963379383087,
1317
  "learning_rate": 0.00018724584103512016,
1318
+ "loss": 0.004,
1319
  "step": 4450
1320
  },
1321
  {
1322
  "epoch": 8.2,
1323
+ "grad_norm": 0.0596047043800354,
1324
  "learning_rate": 0.00018262476894639556,
1325
+ "loss": 0.0027,
1326
  "step": 4475
1327
  },
1328
  {
1329
  "epoch": 8.24,
1330
+ "grad_norm": 0.0016723590670153499,
1331
  "learning_rate": 0.00017800369685767097,
1332
+ "loss": 0.0037,
1333
  "step": 4500
1334
  },
1335
  {
1336
  "epoch": 8.29,
1337
+ "grad_norm": 0.026407798752188683,
1338
  "learning_rate": 0.0001733826247689464,
1339
+ "loss": 0.0026,
1340
  "step": 4525
1341
  },
1342
  {
1343
  "epoch": 8.33,
1344
+ "grad_norm": 0.004466090817004442,
1345
  "learning_rate": 0.0001687615526802218,
1346
+ "loss": 0.0044,
1347
  "step": 4550
1348
  },
1349
  {
1350
  "epoch": 8.38,
1351
+ "grad_norm": 0.013297215104103088,
1352
  "learning_rate": 0.00016414048059149722,
1353
+ "loss": 0.0044,
1354
  "step": 4575
1355
  },
1356
  {
1357
  "epoch": 8.42,
1358
+ "grad_norm": 0.013365192338824272,
1359
  "learning_rate": 0.00015951940850277263,
1360
+ "loss": 0.0033,
1361
  "step": 4600
1362
  },
1363
  {
1364
  "epoch": 8.47,
1365
+ "grad_norm": 0.32592836022377014,
1366
  "learning_rate": 0.0001548983364140481,
1367
+ "loss": 0.0056,
1368
  "step": 4625
1369
  },
1370
  {
1371
  "epoch": 8.52,
1372
+ "grad_norm": 0.023310931399464607,
1373
  "learning_rate": 0.0001502772643253235,
1374
+ "loss": 0.0017,
1375
  "step": 4650
1376
  },
1377
  {
1378
  "epoch": 8.56,
1379
+ "grad_norm": 0.0938984677195549,
1380
  "learning_rate": 0.0001456561922365989,
1381
+ "loss": 0.0028,
1382
  "step": 4675
1383
  },
1384
  {
1385
  "epoch": 8.61,
1386
+ "grad_norm": 0.006782053969800472,
1387
  "learning_rate": 0.0001410351201478743,
1388
+ "loss": 0.0019,
1389
  "step": 4700
1390
  },
1391
  {
1392
  "epoch": 8.65,
1393
+ "grad_norm": 0.08395280689001083,
1394
  "learning_rate": 0.00013641404805914972,
1395
+ "loss": 0.0024,
1396
  "step": 4725
1397
  },
1398
  {
1399
  "epoch": 8.7,
1400
+ "grad_norm": 0.04261644929647446,
1401
  "learning_rate": 0.00013179297597042515,
1402
+ "loss": 0.0029,
1403
  "step": 4750
1404
  },
1405
  {
1406
  "epoch": 8.75,
1407
+ "grad_norm": 0.020602483302354813,
1408
  "learning_rate": 0.00012717190388170056,
1409
+ "loss": 0.0025,
1410
  "step": 4775
1411
  },
1412
  {
1413
  "epoch": 8.79,
1414
+ "grad_norm": 0.0013005019864067435,
1415
  "learning_rate": 0.00012255083179297597,
1416
+ "loss": 0.0024,
1417
  "step": 4800
1418
  },
1419
  {
1420
  "epoch": 8.84,
1421
+ "grad_norm": 0.0019000261090695858,
1422
  "learning_rate": 0.00011792975970425139,
1423
+ "loss": 0.004,
1424
  "step": 4825
1425
  },
1426
  {
1427
  "epoch": 8.88,
1428
+ "grad_norm": 0.02021609991788864,
1429
  "learning_rate": 0.00011330868761552681,
1430
+ "loss": 0.0023,
1431
  "step": 4850
1432
  },
1433
  {
1434
  "epoch": 8.93,
1435
+ "grad_norm": 0.012654704973101616,
1436
  "learning_rate": 0.00010868761552680221,
1437
+ "loss": 0.0036,
1438
  "step": 4875
1439
  },
1440
  {
1441
  "epoch": 8.97,
1442
+ "grad_norm": 0.009410886093974113,
1443
  "learning_rate": 0.00010406654343807764,
1444
+ "loss": 0.0018,
1445
  "step": 4900
1446
  },
1447
  {
1448
  "epoch": 9.0,
1449
+ "eval_loss": 0.2146490514278412,
1450
+ "eval_runtime": 176.7137,
1451
+ "eval_samples_per_second": 4.601,
1452
+ "eval_steps_per_second": 0.77,
1453
  "step": 4914
1454
  },
1455
  {
1456
  "epoch": 9.02,
1457
+ "grad_norm": 0.00876565556973219,
1458
  "learning_rate": 9.944547134935306e-05,
1459
+ "loss": 0.0024,
1460
  "step": 4925
1461
  },
1462
  {
1463
  "epoch": 9.07,
1464
+ "grad_norm": 0.001896819332614541,
1465
  "learning_rate": 9.482439926062846e-05,
1466
  "loss": 0.0013,
1467
  "step": 4950
1468
  },
1469
  {
1470
  "epoch": 9.11,
1471
+ "grad_norm": 0.007586441468447447,
1472
  "learning_rate": 9.020332717190388e-05,
1473
+ "loss": 0.0017,
1474
  "step": 4975
1475
  },
1476
  {
1477
  "epoch": 9.16,
1478
+ "grad_norm": 0.006564935203641653,
1479
  "learning_rate": 8.558225508317929e-05,
1480
+ "loss": 0.003,
1481
  "step": 5000
1482
  },
1483
  {
1484
  "epoch": 9.2,
1485
+ "grad_norm": 0.005424303933978081,
1486
  "learning_rate": 8.096118299445473e-05,
1487
+ "loss": 0.0014,
1488
  "step": 5025
1489
  },
1490
  {
1491
  "epoch": 9.25,
1492
+ "grad_norm": 0.0165091622620821,
1493
  "learning_rate": 7.634011090573013e-05,
1494
+ "loss": 0.0027,
1495
  "step": 5050
1496
  },
1497
  {
1498
  "epoch": 9.29,
1499
+ "grad_norm": 0.09231999516487122,
1500
  "learning_rate": 7.171903881700554e-05,
1501
+ "loss": 0.0018,
1502
  "step": 5075
1503
  },
1504
  {
1505
  "epoch": 9.34,
1506
+ "grad_norm": 0.16238878667354584,
1507
  "learning_rate": 6.709796672828096e-05,
1508
+ "loss": 0.0015,
1509
  "step": 5100
1510
  },
1511
  {
1512
  "epoch": 9.39,
1513
+ "grad_norm": 0.04476441815495491,
1514
  "learning_rate": 6.247689463955638e-05,
1515
+ "loss": 0.0011,
1516
  "step": 5125
1517
  },
1518
  {
1519
  "epoch": 9.43,
1520
+ "grad_norm": 0.00874653086066246,
1521
  "learning_rate": 5.785582255083179e-05,
1522
+ "loss": 0.0008,
1523
  "step": 5150
1524
  },
1525
  {
1526
  "epoch": 9.48,
1527
+ "grad_norm": 0.010477906093001366,
1528
  "learning_rate": 5.323475046210721e-05,
1529
+ "loss": 0.0021,
1530
  "step": 5175
1531
  },
1532
  {
1533
  "epoch": 9.52,
1534
+ "grad_norm": 0.00953985471278429,
1535
  "learning_rate": 4.8613678373382625e-05,
1536
+ "loss": 0.0017,
1537
  "step": 5200
1538
  },
1539
  {
1540
  "epoch": 9.57,
1541
+ "grad_norm": 0.0022518846672028303,
1542
  "learning_rate": 4.3992606284658045e-05,
1543
+ "loss": 0.0019,
1544
  "step": 5225
1545
  },
1546
  {
1547
  "epoch": 9.62,
1548
+ "grad_norm": 0.037685129791498184,
1549
  "learning_rate": 3.937153419593346e-05,
1550
  "loss": 0.001,
1551
  "step": 5250
1552
  },
1553
  {
1554
  "epoch": 9.66,
1555
+ "grad_norm": 0.08190955966711044,
1556
  "learning_rate": 3.4750462107208874e-05,
1557
+ "loss": 0.0017,
1558
  "step": 5275
1559
  },
1560
  {
1561
  "epoch": 9.71,
1562
+ "grad_norm": 0.017375241965055466,
1563
  "learning_rate": 3.012939001848429e-05,
1564
+ "loss": 0.0016,
1565
  "step": 5300
1566
  },
1567
  {
1568
  "epoch": 9.75,
1569
+ "grad_norm": 0.03486447408795357,
1570
  "learning_rate": 2.5508317929759705e-05,
1571
+ "loss": 0.0012,
1572
  "step": 5325
1573
  },
1574
  {
1575
  "epoch": 9.8,
1576
+ "grad_norm": 0.0786125510931015,
1577
  "learning_rate": 2.088724584103512e-05,
1578
+ "loss": 0.0012,
1579
  "step": 5350
1580
  },
1581
  {
1582
  "epoch": 9.84,
1583
+ "grad_norm": 0.09049534052610397,
1584
  "learning_rate": 1.6266173752310537e-05,
1585
+ "loss": 0.0012,
1586
  "step": 5375
1587
  },
1588
  {
1589
  "epoch": 9.89,
1590
+ "grad_norm": 0.012832165695726871,
1591
  "learning_rate": 1.1645101663585952e-05,
1592
+ "loss": 0.0014,
1593
  "step": 5400
1594
  },
1595
  {
1596
  "epoch": 9.94,
1597
+ "grad_norm": 0.006516186986118555,
1598
  "learning_rate": 7.024029574861368e-06,
1599
+ "loss": 0.0015,
1600
  "step": 5425
1601
  },
1602
  {
1603
  "epoch": 9.98,
1604
+ "grad_norm": 0.02494051493704319,
1605
  "learning_rate": 2.402957486136784e-06,
1606
+ "loss": 0.0015,
1607
  "step": 5450
1608
  },
1609
  {
1610
  "epoch": 10.0,
1611
+ "eval_loss": 0.221242755651474,
1612
+ "eval_runtime": 176.7742,
1613
+ "eval_samples_per_second": 4.599,
1614
+ "eval_steps_per_second": 0.769,
1615
  "step": 5460
1616
  },
1617
  {
1618
  "epoch": 10.0,
1619
  "step": 5460,
1620
  "total_flos": 9.7789895073792e+18,
1621
+ "train_loss": 0.024085146698745945,
1622
+ "train_runtime": 10729.5864,
1623
+ "train_samples_per_second": 3.05,
1624
  "train_steps_per_second": 0.509
1625
  }
1626
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d450ec50cab4188af1d2f839c282d646159565242704493acf7a0046664a3f1
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bd466f2e7cc2734735d5e5c8279377c63a93a1f9a6913853ad505a1a2013446
3
  size 5112