somaia02 commited on
Commit
24eb3aa
·
1 Parent(s): 65e80ca

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -12,12 +12,12 @@
12
  "lora_dropout": 0.05,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
- "r": 8,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "q_proj",
20
  "k_proj",
 
21
  "v_proj"
22
  ],
23
  "task_type": "CAUSAL_LM"
 
12
  "lora_dropout": 0.05,
13
  "modules_to_save": null,
14
  "peft_type": "LORA",
15
+ "r": 16,
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "k_proj",
20
+ "q_proj",
21
  "v_proj"
22
  ],
23
  "task_type": "CAUSAL_LM"
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd42a6a3b012049dbfcf0d4ad62b630ef8dca1676e140ca7ca180a24e13c910d
3
- size 2669168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8a809056b973f3df74a8e0401d2f6b3f48855ca88f92c074a282b2fc872456
3
+ size 5323528
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:628d3ad5e22c61b9e139f66dac9c70cf34279bc9e3690968e4e15d122f68c3fb
3
- size 5399290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cace089f6f62f3f0c39575883a3ef27ad996c274acf20a6fd9d2d6ee12a92918
3
+ size 10707706
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acaefe44adfe5a8938c73ae689d02d9d4a52614411ab8ab1eb80188c16c1a919
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e4a99a8f1604eb88b6b6efd9bc6ced6e284090d6037a758d0561d10a951f004
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a4dd1d74816502c8ecbc715add6bae4e99a2b4e50b653b0b20cfecda567b3eb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c82bedf0a611f290596df2fde142fbda2afa059d93dc846b92ee4f876380a79
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.4207456707954407,
3
- "best_model_checkpoint": "bart_lora_outputs\\checkpoint-6000",
4
- "epoch": 9.787928221859707,
5
  "eval_steps": 100,
6
- "global_step": 6000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,4089 +11,349 @@
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 2e-05,
14
- "loss": 2.9197,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
  "learning_rate": 4e-05,
20
- "loss": 2.872,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.05,
25
  "learning_rate": 6e-05,
26
- "loss": 2.6183,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.07,
31
  "learning_rate": 8e-05,
32
- "loss": 2.4094,
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.08,
37
  "learning_rate": 0.0001,
38
- "loss": 2.15,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.1,
43
  "learning_rate": 0.00012,
44
- "loss": 1.7088,
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.11,
49
  "learning_rate": 0.00014000000000000001,
50
- "loss": 1.413,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.13,
55
  "learning_rate": 0.00016,
56
- "loss": 1.2326,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.15,
61
  "learning_rate": 0.00017999999999999998,
62
- "loss": 1.1108,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.16,
67
  "learning_rate": 0.0002,
68
- "loss": 1.0331,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.16,
73
- "eval_loss": 0.8141101598739624,
74
- "eval_runtime": 5.616,
75
- "eval_samples_per_second": 208.511,
76
- "eval_steps_per_second": 13.177,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.18,
81
  "learning_rate": 0.00022,
82
- "loss": 0.9807,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 0.2,
87
  "learning_rate": 0.00024,
88
- "loss": 0.9853,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 0.21,
93
  "learning_rate": 0.00026000000000000003,
94
- "loss": 0.9411,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 0.23,
99
  "learning_rate": 0.00028000000000000003,
100
- "loss": 0.8733,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 0.24,
105
  "learning_rate": 0.0003,
106
- "loss": 0.8347,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 0.26,
111
  "learning_rate": 0.00032,
112
- "loss": 0.8786,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.28,
117
  "learning_rate": 0.00034,
118
- "loss": 0.7676,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 0.29,
123
  "learning_rate": 0.00035999999999999997,
124
- "loss": 0.8399,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 0.31,
129
  "learning_rate": 0.00038,
130
- "loss": 0.838,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 0.33,
135
  "learning_rate": 0.0004,
136
- "loss": 0.7888,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 0.33,
141
- "eval_loss": 0.6890668869018555,
142
- "eval_runtime": 5.748,
143
- "eval_samples_per_second": 203.722,
144
- "eval_steps_per_second": 12.874,
145
  "step": 200
146
  },
147
  {
148
  "epoch": 0.34,
149
  "learning_rate": 0.00042,
150
- "loss": 0.7596,
151
  "step": 210
152
  },
153
  {
154
  "epoch": 0.36,
155
  "learning_rate": 0.00044,
156
- "loss": 0.7959,
157
  "step": 220
158
  },
159
  {
160
  "epoch": 0.38,
161
  "learning_rate": 0.00046,
162
- "loss": 0.7637,
163
  "step": 230
164
  },
165
  {
166
  "epoch": 0.39,
167
  "learning_rate": 0.00048,
168
- "loss": 0.7413,
169
  "step": 240
170
  },
171
  {
172
  "epoch": 0.41,
173
  "learning_rate": 0.0005,
174
- "loss": 0.7559,
175
  "step": 250
176
  },
177
  {
178
  "epoch": 0.42,
179
  "learning_rate": 0.0005200000000000001,
180
- "loss": 0.7465,
181
  "step": 260
182
  },
183
  {
184
  "epoch": 0.44,
185
  "learning_rate": 0.00054,
186
- "loss": 0.7311,
187
  "step": 270
188
  },
189
  {
190
  "epoch": 0.46,
191
  "learning_rate": 0.0005600000000000001,
192
- "loss": 0.7031,
193
  "step": 280
194
  },
195
  {
196
  "epoch": 0.47,
197
  "learning_rate": 0.00058,
198
- "loss": 0.7565,
199
  "step": 290
200
  },
201
  {
202
  "epoch": 0.49,
203
  "learning_rate": 0.0006,
204
- "loss": 0.6811,
205
  "step": 300
206
  },
207
  {
208
  "epoch": 0.49,
209
- "eval_loss": 0.6422508955001831,
210
- "eval_runtime": 5.623,
211
- "eval_samples_per_second": 208.251,
212
- "eval_steps_per_second": 13.16,
213
  "step": 300
214
  },
215
  {
216
  "epoch": 0.51,
217
  "learning_rate": 0.00062,
218
- "loss": 0.7063,
219
  "step": 310
220
  },
221
  {
222
  "epoch": 0.52,
223
  "learning_rate": 0.00064,
224
- "loss": 0.727,
225
  "step": 320
226
  },
227
  {
228
  "epoch": 0.54,
229
  "learning_rate": 0.00066,
230
- "loss": 0.6678,
231
  "step": 330
232
  },
233
  {
234
  "epoch": 0.55,
235
  "learning_rate": 0.00068,
236
- "loss": 0.6465,
237
  "step": 340
238
  },
239
  {
240
  "epoch": 0.57,
241
  "learning_rate": 0.0007,
242
- "loss": 0.6647,
243
  "step": 350
244
  },
245
  {
246
  "epoch": 0.59,
247
  "learning_rate": 0.0007199999999999999,
248
- "loss": 0.6754,
249
  "step": 360
250
  },
251
  {
252
  "epoch": 0.6,
253
  "learning_rate": 0.00074,
254
- "loss": 0.6419,
255
  "step": 370
256
  },
257
  {
258
  "epoch": 0.62,
259
  "learning_rate": 0.00076,
260
- "loss": 0.6746,
261
  "step": 380
262
  },
263
  {
264
  "epoch": 0.64,
265
  "learning_rate": 0.0007800000000000001,
266
- "loss": 0.6279,
267
  "step": 390
268
  },
269
  {
270
  "epoch": 0.65,
271
  "learning_rate": 0.0008,
272
- "loss": 0.7124,
273
  "step": 400
274
  },
275
  {
276
  "epoch": 0.65,
277
- "eval_loss": 0.5565645098686218,
278
- "eval_runtime": 5.756,
279
- "eval_samples_per_second": 203.439,
280
- "eval_steps_per_second": 12.856,
281
  "step": 400
282
  },
283
  {
284
  "epoch": 0.67,
285
  "learning_rate": 0.00082,
286
- "loss": 0.6656,
287
  "step": 410
288
  },
289
  {
290
  "epoch": 0.69,
291
  "learning_rate": 0.00084,
292
- "loss": 0.6298,
293
  "step": 420
294
  },
295
  {
296
  "epoch": 0.7,
297
  "learning_rate": 0.00086,
298
- "loss": 0.6795,
299
  "step": 430
300
  },
301
  {
302
  "epoch": 0.72,
303
  "learning_rate": 0.00088,
304
- "loss": 0.6279,
305
  "step": 440
306
  },
307
  {
308
  "epoch": 0.73,
309
  "learning_rate": 0.0009000000000000001,
310
- "loss": 0.6663,
311
  "step": 450
312
  },
313
  {
314
  "epoch": 0.75,
315
  "learning_rate": 0.00092,
316
- "loss": 0.662,
317
  "step": 460
318
  },
319
  {
320
  "epoch": 0.77,
321
  "learning_rate": 0.00094,
322
- "loss": 0.6339,
323
  "step": 470
324
  },
325
  {
326
  "epoch": 0.78,
327
  "learning_rate": 0.00096,
328
- "loss": 0.6099,
329
  "step": 480
330
  },
331
  {
332
  "epoch": 0.8,
333
  "learning_rate": 0.00098,
334
- "loss": 0.6465,
335
  "step": 490
336
  },
337
  {
338
  "epoch": 0.82,
339
  "learning_rate": 0.001,
340
- "loss": 0.615,
341
  "step": 500
342
  },
343
  {
344
  "epoch": 0.82,
345
- "eval_loss": 0.5440758466720581,
346
- "eval_runtime": 5.541,
347
- "eval_samples_per_second": 211.333,
348
- "eval_steps_per_second": 13.355,
349
  "step": 500
350
- },
351
- {
352
- "epoch": 0.83,
353
- "learning_rate": 0.0009982238010657195,
354
- "loss": 0.6562,
355
- "step": 510
356
- },
357
- {
358
- "epoch": 0.85,
359
- "learning_rate": 0.0009964476021314388,
360
- "loss": 0.6576,
361
- "step": 520
362
- },
363
- {
364
- "epoch": 0.86,
365
- "learning_rate": 0.000994671403197158,
366
- "loss": 0.6182,
367
- "step": 530
368
- },
369
- {
370
- "epoch": 0.88,
371
- "learning_rate": 0.0009928952042628776,
372
- "loss": 0.6067,
373
- "step": 540
374
- },
375
- {
376
- "epoch": 0.9,
377
- "learning_rate": 0.0009911190053285969,
378
- "loss": 0.5993,
379
- "step": 550
380
- },
381
- {
382
- "epoch": 0.91,
383
- "learning_rate": 0.0009893428063943162,
384
- "loss": 0.6218,
385
- "step": 560
386
- },
387
- {
388
- "epoch": 0.93,
389
- "learning_rate": 0.0009875666074600357,
390
- "loss": 0.6154,
391
- "step": 570
392
- },
393
- {
394
- "epoch": 0.95,
395
- "learning_rate": 0.000985790408525755,
396
- "loss": 0.612,
397
- "step": 580
398
- },
399
- {
400
- "epoch": 0.96,
401
- "learning_rate": 0.0009840142095914742,
402
- "loss": 0.6217,
403
- "step": 590
404
- },
405
- {
406
- "epoch": 0.98,
407
- "learning_rate": 0.0009822380106571937,
408
- "loss": 0.6161,
409
- "step": 600
410
- },
411
- {
412
- "epoch": 0.98,
413
- "eval_loss": 0.5353884100914001,
414
- "eval_runtime": 5.655,
415
- "eval_samples_per_second": 207.073,
416
- "eval_steps_per_second": 13.086,
417
- "step": 600
418
- },
419
- {
420
- "epoch": 1.0,
421
- "learning_rate": 0.000980461811722913,
422
- "loss": 0.6054,
423
- "step": 610
424
- },
425
- {
426
- "epoch": 1.01,
427
- "learning_rate": 0.0009786856127886323,
428
- "loss": 0.5417,
429
- "step": 620
430
- },
431
- {
432
- "epoch": 1.03,
433
- "learning_rate": 0.0009769094138543518,
434
- "loss": 0.5902,
435
- "step": 630
436
- },
437
- {
438
- "epoch": 1.04,
439
- "learning_rate": 0.0009751332149200711,
440
- "loss": 0.6063,
441
- "step": 640
442
- },
443
- {
444
- "epoch": 1.06,
445
- "learning_rate": 0.0009733570159857904,
446
- "loss": 0.543,
447
- "step": 650
448
- },
449
- {
450
- "epoch": 1.08,
451
- "learning_rate": 0.0009715808170515098,
452
- "loss": 0.5668,
453
- "step": 660
454
- },
455
- {
456
- "epoch": 1.09,
457
- "learning_rate": 0.0009698046181172292,
458
- "loss": 0.5594,
459
- "step": 670
460
- },
461
- {
462
- "epoch": 1.11,
463
- "learning_rate": 0.0009680284191829485,
464
- "loss": 0.5758,
465
- "step": 680
466
- },
467
- {
468
- "epoch": 1.13,
469
- "learning_rate": 0.0009662522202486678,
470
- "loss": 0.5845,
471
- "step": 690
472
- },
473
- {
474
- "epoch": 1.14,
475
- "learning_rate": 0.0009644760213143872,
476
- "loss": 0.574,
477
- "step": 700
478
- },
479
- {
480
- "epoch": 1.14,
481
- "eval_loss": 0.5269472002983093,
482
- "eval_runtime": 5.818,
483
- "eval_samples_per_second": 201.271,
484
- "eval_steps_per_second": 12.719,
485
- "step": 700
486
- },
487
- {
488
- "epoch": 1.16,
489
- "learning_rate": 0.0009626998223801065,
490
- "loss": 0.5216,
491
- "step": 710
492
- },
493
- {
494
- "epoch": 1.17,
495
- "learning_rate": 0.0009609236234458259,
496
- "loss": 0.6191,
497
- "step": 720
498
- },
499
- {
500
- "epoch": 1.19,
501
- "learning_rate": 0.0009591474245115453,
502
- "loss": 0.5832,
503
- "step": 730
504
- },
505
- {
506
- "epoch": 1.21,
507
- "learning_rate": 0.0009573712255772646,
508
- "loss": 0.5622,
509
- "step": 740
510
- },
511
- {
512
- "epoch": 1.22,
513
- "learning_rate": 0.000955595026642984,
514
- "loss": 0.599,
515
- "step": 750
516
- },
517
- {
518
- "epoch": 1.24,
519
- "learning_rate": 0.0009538188277087034,
520
- "loss": 0.564,
521
- "step": 760
522
- },
523
- {
524
- "epoch": 1.26,
525
- "learning_rate": 0.0009520426287744227,
526
- "loss": 0.5996,
527
- "step": 770
528
- },
529
- {
530
- "epoch": 1.27,
531
- "learning_rate": 0.0009502664298401421,
532
- "loss": 0.5337,
533
- "step": 780
534
- },
535
- {
536
- "epoch": 1.29,
537
- "learning_rate": 0.0009484902309058615,
538
- "loss": 0.6051,
539
- "step": 790
540
- },
541
- {
542
- "epoch": 1.31,
543
- "learning_rate": 0.0009467140319715807,
544
- "loss": 0.6023,
545
- "step": 800
546
- },
547
- {
548
- "epoch": 1.31,
549
- "eval_loss": 0.5132582187652588,
550
- "eval_runtime": 5.778,
551
- "eval_samples_per_second": 202.665,
552
- "eval_steps_per_second": 12.807,
553
- "step": 800
554
- },
555
- {
556
- "epoch": 1.32,
557
- "learning_rate": 0.0009449378330373001,
558
- "loss": 0.5531,
559
- "step": 810
560
- },
561
- {
562
- "epoch": 1.34,
563
- "learning_rate": 0.0009431616341030196,
564
- "loss": 0.5623,
565
- "step": 820
566
- },
567
- {
568
- "epoch": 1.35,
569
- "learning_rate": 0.0009413854351687389,
570
- "loss": 0.5389,
571
- "step": 830
572
- },
573
- {
574
- "epoch": 1.37,
575
- "learning_rate": 0.0009396092362344583,
576
- "loss": 0.5446,
577
- "step": 840
578
- },
579
- {
580
- "epoch": 1.39,
581
- "learning_rate": 0.0009378330373001777,
582
- "loss": 0.5512,
583
- "step": 850
584
- },
585
- {
586
- "epoch": 1.4,
587
- "learning_rate": 0.000936056838365897,
588
- "loss": 0.571,
589
- "step": 860
590
- },
591
- {
592
- "epoch": 1.42,
593
- "learning_rate": 0.0009342806394316164,
594
- "loss": 0.5415,
595
- "step": 870
596
- },
597
- {
598
- "epoch": 1.44,
599
- "learning_rate": 0.0009325044404973358,
600
- "loss": 0.5807,
601
- "step": 880
602
- },
603
- {
604
- "epoch": 1.45,
605
- "learning_rate": 0.0009307282415630552,
606
- "loss": 0.5304,
607
- "step": 890
608
- },
609
- {
610
- "epoch": 1.47,
611
- "learning_rate": 0.0009289520426287745,
612
- "loss": 0.5468,
613
- "step": 900
614
- },
615
- {
616
- "epoch": 1.47,
617
- "eval_loss": 0.4914248585700989,
618
- "eval_runtime": 5.762,
619
- "eval_samples_per_second": 203.227,
620
- "eval_steps_per_second": 12.843,
621
- "step": 900
622
- },
623
- {
624
- "epoch": 1.48,
625
- "learning_rate": 0.0009271758436944939,
626
- "loss": 0.6005,
627
- "step": 910
628
- },
629
- {
630
- "epoch": 1.5,
631
- "learning_rate": 0.0009253996447602132,
632
- "loss": 0.59,
633
- "step": 920
634
- },
635
- {
636
- "epoch": 1.52,
637
- "learning_rate": 0.0009236234458259325,
638
- "loss": 0.5487,
639
- "step": 930
640
- },
641
- {
642
- "epoch": 1.53,
643
- "learning_rate": 0.0009218472468916519,
644
- "loss": 0.5783,
645
- "step": 940
646
- },
647
- {
648
- "epoch": 1.55,
649
- "learning_rate": 0.0009200710479573713,
650
- "loss": 0.5579,
651
- "step": 950
652
- },
653
- {
654
- "epoch": 1.57,
655
- "learning_rate": 0.0009182948490230906,
656
- "loss": 0.5618,
657
- "step": 960
658
- },
659
- {
660
- "epoch": 1.58,
661
- "learning_rate": 0.00091651865008881,
662
- "loss": 0.5434,
663
- "step": 970
664
- },
665
- {
666
- "epoch": 1.6,
667
- "learning_rate": 0.0009147424511545294,
668
- "loss": 0.5715,
669
- "step": 980
670
- },
671
- {
672
- "epoch": 1.62,
673
- "learning_rate": 0.0009129662522202487,
674
- "loss": 0.5257,
675
- "step": 990
676
- },
677
- {
678
- "epoch": 1.63,
679
- "learning_rate": 0.0009111900532859681,
680
- "loss": 0.5391,
681
- "step": 1000
682
- },
683
- {
684
- "epoch": 1.63,
685
- "eval_loss": 0.4895668029785156,
686
- "eval_runtime": 5.802,
687
- "eval_samples_per_second": 201.826,
688
- "eval_steps_per_second": 12.754,
689
- "step": 1000
690
- },
691
- {
692
- "epoch": 1.65,
693
- "learning_rate": 0.0009094138543516875,
694
- "loss": 0.5178,
695
- "step": 1010
696
- },
697
- {
698
- "epoch": 1.66,
699
- "learning_rate": 0.0009076376554174067,
700
- "loss": 0.5655,
701
- "step": 1020
702
- },
703
- {
704
- "epoch": 1.68,
705
- "learning_rate": 0.0009058614564831261,
706
- "loss": 0.5352,
707
- "step": 1030
708
- },
709
- {
710
- "epoch": 1.7,
711
- "learning_rate": 0.0009040852575488455,
712
- "loss": 0.5355,
713
- "step": 1040
714
- },
715
- {
716
- "epoch": 1.71,
717
- "learning_rate": 0.0009023090586145648,
718
- "loss": 0.5322,
719
- "step": 1050
720
- },
721
- {
722
- "epoch": 1.73,
723
- "learning_rate": 0.0009005328596802842,
724
- "loss": 0.5427,
725
- "step": 1060
726
- },
727
- {
728
- "epoch": 1.75,
729
- "learning_rate": 0.0008987566607460036,
730
- "loss": 0.5531,
731
- "step": 1070
732
- },
733
- {
734
- "epoch": 1.76,
735
- "learning_rate": 0.0008969804618117229,
736
- "loss": 0.5485,
737
- "step": 1080
738
- },
739
- {
740
- "epoch": 1.78,
741
- "learning_rate": 0.0008952042628774423,
742
- "loss": 0.5366,
743
- "step": 1090
744
- },
745
- {
746
- "epoch": 1.79,
747
- "learning_rate": 0.0008934280639431617,
748
- "loss": 0.5455,
749
- "step": 1100
750
- },
751
- {
752
- "epoch": 1.79,
753
- "eval_loss": 0.4829932451248169,
754
- "eval_runtime": 5.871,
755
- "eval_samples_per_second": 199.454,
756
- "eval_steps_per_second": 12.604,
757
- "step": 1100
758
- },
759
- {
760
- "epoch": 1.81,
761
- "learning_rate": 0.000891651865008881,
762
- "loss": 0.5378,
763
- "step": 1110
764
- },
765
- {
766
- "epoch": 1.83,
767
- "learning_rate": 0.0008898756660746004,
768
- "loss": 0.5516,
769
- "step": 1120
770
- },
771
- {
772
- "epoch": 1.84,
773
- "learning_rate": 0.0008880994671403197,
774
- "loss": 0.541,
775
- "step": 1130
776
- },
777
- {
778
- "epoch": 1.86,
779
- "learning_rate": 0.0008863232682060391,
780
- "loss": 0.55,
781
- "step": 1140
782
- },
783
- {
784
- "epoch": 1.88,
785
- "learning_rate": 0.0008845470692717584,
786
- "loss": 0.5451,
787
- "step": 1150
788
- },
789
- {
790
- "epoch": 1.89,
791
- "learning_rate": 0.0008827708703374778,
792
- "loss": 0.5508,
793
- "step": 1160
794
- },
795
- {
796
- "epoch": 1.91,
797
- "learning_rate": 0.0008809946714031972,
798
- "loss": 0.4816,
799
- "step": 1170
800
- },
801
- {
802
- "epoch": 1.92,
803
- "learning_rate": 0.0008792184724689165,
804
- "loss": 0.5529,
805
- "step": 1180
806
- },
807
- {
808
- "epoch": 1.94,
809
- "learning_rate": 0.0008774422735346359,
810
- "loss": 0.5119,
811
- "step": 1190
812
- },
813
- {
814
- "epoch": 1.96,
815
- "learning_rate": 0.0008756660746003553,
816
- "loss": 0.5431,
817
- "step": 1200
818
- },
819
- {
820
- "epoch": 1.96,
821
- "eval_loss": 0.4757298529148102,
822
- "eval_runtime": 5.601,
823
- "eval_samples_per_second": 209.069,
824
- "eval_steps_per_second": 13.212,
825
- "step": 1200
826
- },
827
- {
828
- "epoch": 1.97,
829
- "learning_rate": 0.0008738898756660746,
830
- "loss": 0.5315,
831
- "step": 1210
832
- },
833
- {
834
- "epoch": 1.99,
835
- "learning_rate": 0.000872113676731794,
836
- "loss": 0.5455,
837
- "step": 1220
838
- },
839
- {
840
- "epoch": 2.01,
841
- "learning_rate": 0.0008703374777975134,
842
- "loss": 0.5274,
843
- "step": 1230
844
- },
845
- {
846
- "epoch": 2.02,
847
- "learning_rate": 0.0008685612788632326,
848
- "loss": 0.5383,
849
- "step": 1240
850
- },
851
- {
852
- "epoch": 2.04,
853
- "learning_rate": 0.000866785079928952,
854
- "loss": 0.5185,
855
- "step": 1250
856
- },
857
- {
858
- "epoch": 2.06,
859
- "learning_rate": 0.0008650088809946714,
860
- "loss": 0.5093,
861
- "step": 1260
862
- },
863
- {
864
- "epoch": 2.07,
865
- "learning_rate": 0.0008632326820603907,
866
- "loss": 0.5237,
867
- "step": 1270
868
- },
869
- {
870
- "epoch": 2.09,
871
- "learning_rate": 0.0008614564831261101,
872
- "loss": 0.4849,
873
- "step": 1280
874
- },
875
- {
876
- "epoch": 2.1,
877
- "learning_rate": 0.0008596802841918295,
878
- "loss": 0.5198,
879
- "step": 1290
880
- },
881
- {
882
- "epoch": 2.12,
883
- "learning_rate": 0.0008579040852575488,
884
- "loss": 0.4962,
885
- "step": 1300
886
- },
887
- {
888
- "epoch": 2.12,
889
- "eval_loss": 0.4822671711444855,
890
- "eval_runtime": 5.607,
891
- "eval_samples_per_second": 208.845,
892
- "eval_steps_per_second": 13.198,
893
- "step": 1300
894
- },
895
- {
896
- "epoch": 2.14,
897
- "learning_rate": 0.0008561278863232682,
898
- "loss": 0.5036,
899
- "step": 1310
900
- },
901
- {
902
- "epoch": 2.15,
903
- "learning_rate": 0.0008543516873889876,
904
- "loss": 0.498,
905
- "step": 1320
906
- },
907
- {
908
- "epoch": 2.17,
909
- "learning_rate": 0.0008525754884547069,
910
- "loss": 0.494,
911
- "step": 1330
912
- },
913
- {
914
- "epoch": 2.19,
915
- "learning_rate": 0.0008507992895204263,
916
- "loss": 0.5209,
917
- "step": 1340
918
- },
919
- {
920
- "epoch": 2.2,
921
- "learning_rate": 0.0008490230905861456,
922
- "loss": 0.485,
923
- "step": 1350
924
- },
925
- {
926
- "epoch": 2.22,
927
- "learning_rate": 0.000847246891651865,
928
- "loss": 0.5423,
929
- "step": 1360
930
- },
931
- {
932
- "epoch": 2.23,
933
- "learning_rate": 0.0008454706927175843,
934
- "loss": 0.4856,
935
- "step": 1370
936
- },
937
- {
938
- "epoch": 2.25,
939
- "learning_rate": 0.0008436944937833037,
940
- "loss": 0.4983,
941
- "step": 1380
942
- },
943
- {
944
- "epoch": 2.27,
945
- "learning_rate": 0.0008419182948490231,
946
- "loss": 0.5286,
947
- "step": 1390
948
- },
949
- {
950
- "epoch": 2.28,
951
- "learning_rate": 0.0008401420959147424,
952
- "loss": 0.5206,
953
- "step": 1400
954
- },
955
- {
956
- "epoch": 2.28,
957
- "eval_loss": 0.47338804602622986,
958
- "eval_runtime": 5.569,
959
- "eval_samples_per_second": 210.27,
960
- "eval_steps_per_second": 13.288,
961
- "step": 1400
962
- },
963
- {
964
- "epoch": 2.3,
965
- "learning_rate": 0.0008383658969804618,
966
- "loss": 0.5231,
967
- "step": 1410
968
- },
969
- {
970
- "epoch": 2.32,
971
- "learning_rate": 0.0008365896980461812,
972
- "loss": 0.5336,
973
- "step": 1420
974
- },
975
- {
976
- "epoch": 2.33,
977
- "learning_rate": 0.0008348134991119005,
978
- "loss": 0.4807,
979
- "step": 1430
980
- },
981
- {
982
- "epoch": 2.35,
983
- "learning_rate": 0.00083303730017762,
984
- "loss": 0.5065,
985
- "step": 1440
986
- },
987
- {
988
- "epoch": 2.37,
989
- "learning_rate": 0.0008312611012433394,
990
- "loss": 0.5215,
991
- "step": 1450
992
- },
993
- {
994
- "epoch": 2.38,
995
- "learning_rate": 0.0008294849023090586,
996
- "loss": 0.502,
997
- "step": 1460
998
- },
999
- {
1000
- "epoch": 2.4,
1001
- "learning_rate": 0.000827708703374778,
1002
- "loss": 0.51,
1003
- "step": 1470
1004
- },
1005
- {
1006
- "epoch": 2.41,
1007
- "learning_rate": 0.0008259325044404974,
1008
- "loss": 0.5132,
1009
- "step": 1480
1010
- },
1011
- {
1012
- "epoch": 2.43,
1013
- "learning_rate": 0.0008241563055062167,
1014
- "loss": 0.4988,
1015
- "step": 1490
1016
- },
1017
- {
1018
- "epoch": 2.45,
1019
- "learning_rate": 0.0008223801065719361,
1020
- "loss": 0.5115,
1021
- "step": 1500
1022
- },
1023
- {
1024
- "epoch": 2.45,
1025
- "eval_loss": 0.4651834964752197,
1026
- "eval_runtime": 5.5836,
1027
- "eval_samples_per_second": 209.72,
1028
- "eval_steps_per_second": 13.253,
1029
- "step": 1500
1030
- },
1031
- {
1032
- "epoch": 2.46,
1033
- "learning_rate": 0.0008206039076376555,
1034
- "loss": 0.4981,
1035
- "step": 1510
1036
- },
1037
- {
1038
- "epoch": 2.48,
1039
- "learning_rate": 0.0008188277087033748,
1040
- "loss": 0.5488,
1041
- "step": 1520
1042
- },
1043
- {
1044
- "epoch": 2.5,
1045
- "learning_rate": 0.0008170515097690942,
1046
- "loss": 0.4906,
1047
- "step": 1530
1048
- },
1049
- {
1050
- "epoch": 2.51,
1051
- "learning_rate": 0.0008152753108348136,
1052
- "loss": 0.4722,
1053
- "step": 1540
1054
- },
1055
- {
1056
- "epoch": 2.53,
1057
- "learning_rate": 0.0008134991119005329,
1058
- "loss": 0.448,
1059
- "step": 1550
1060
- },
1061
- {
1062
- "epoch": 2.54,
1063
- "learning_rate": 0.0008117229129662523,
1064
- "loss": 0.5048,
1065
- "step": 1560
1066
- },
1067
- {
1068
- "epoch": 2.56,
1069
- "learning_rate": 0.0008099467140319717,
1070
- "loss": 0.4938,
1071
- "step": 1570
1072
- },
1073
- {
1074
- "epoch": 2.58,
1075
- "learning_rate": 0.0008081705150976909,
1076
- "loss": 0.4949,
1077
- "step": 1580
1078
- },
1079
- {
1080
- "epoch": 2.59,
1081
- "learning_rate": 0.0008063943161634103,
1082
- "loss": 0.5185,
1083
- "step": 1590
1084
- },
1085
- {
1086
- "epoch": 2.61,
1087
- "learning_rate": 0.0008046181172291297,
1088
- "loss": 0.4809,
1089
- "step": 1600
1090
- },
1091
- {
1092
- "epoch": 2.61,
1093
- "eval_loss": 0.4656440317630768,
1094
- "eval_runtime": 5.645,
1095
- "eval_samples_per_second": 207.439,
1096
- "eval_steps_per_second": 13.109,
1097
- "step": 1600
1098
- },
1099
- {
1100
- "epoch": 2.63,
1101
- "learning_rate": 0.0008028419182948491,
1102
- "loss": 0.4819,
1103
- "step": 1610
1104
- },
1105
- {
1106
- "epoch": 2.64,
1107
- "learning_rate": 0.0008010657193605684,
1108
- "loss": 0.513,
1109
- "step": 1620
1110
- },
1111
- {
1112
- "epoch": 2.66,
1113
- "learning_rate": 0.0007992895204262878,
1114
- "loss": 0.5312,
1115
- "step": 1630
1116
- },
1117
- {
1118
- "epoch": 2.68,
1119
- "learning_rate": 0.0007975133214920072,
1120
- "loss": 0.5252,
1121
- "step": 1640
1122
- },
1123
- {
1124
- "epoch": 2.69,
1125
- "learning_rate": 0.0007957371225577265,
1126
- "loss": 0.4977,
1127
- "step": 1650
1128
- },
1129
- {
1130
- "epoch": 2.71,
1131
- "learning_rate": 0.0007939609236234459,
1132
- "loss": 0.4707,
1133
- "step": 1660
1134
- },
1135
- {
1136
- "epoch": 2.72,
1137
- "learning_rate": 0.0007921847246891653,
1138
- "loss": 0.5138,
1139
- "step": 1670
1140
- },
1141
- {
1142
- "epoch": 2.74,
1143
- "learning_rate": 0.0007904085257548845,
1144
- "loss": 0.5096,
1145
- "step": 1680
1146
- },
1147
- {
1148
- "epoch": 2.76,
1149
- "learning_rate": 0.0007886323268206039,
1150
- "loss": 0.5299,
1151
- "step": 1690
1152
- },
1153
- {
1154
- "epoch": 2.77,
1155
- "learning_rate": 0.0007868561278863233,
1156
- "loss": 0.5147,
1157
- "step": 1700
1158
- },
1159
- {
1160
- "epoch": 2.77,
1161
- "eval_loss": 0.47206491231918335,
1162
- "eval_runtime": 5.616,
1163
- "eval_samples_per_second": 208.511,
1164
- "eval_steps_per_second": 13.177,
1165
- "step": 1700
1166
- },
1167
- {
1168
- "epoch": 2.79,
1169
- "learning_rate": 0.0007850799289520426,
1170
- "loss": 0.5177,
1171
- "step": 1710
1172
- },
1173
- {
1174
- "epoch": 2.81,
1175
- "learning_rate": 0.000783303730017762,
1176
- "loss": 0.4924,
1177
- "step": 1720
1178
- },
1179
- {
1180
- "epoch": 2.82,
1181
- "learning_rate": 0.0007815275310834814,
1182
- "loss": 0.5463,
1183
- "step": 1730
1184
- },
1185
- {
1186
- "epoch": 2.84,
1187
- "learning_rate": 0.0007797513321492007,
1188
- "loss": 0.5046,
1189
- "step": 1740
1190
- },
1191
- {
1192
- "epoch": 2.85,
1193
- "learning_rate": 0.0007779751332149201,
1194
- "loss": 0.4933,
1195
- "step": 1750
1196
- },
1197
- {
1198
- "epoch": 2.87,
1199
- "learning_rate": 0.0007761989342806395,
1200
- "loss": 0.5742,
1201
- "step": 1760
1202
- },
1203
- {
1204
- "epoch": 2.89,
1205
- "learning_rate": 0.0007744227353463588,
1206
- "loss": 0.4831,
1207
- "step": 1770
1208
- },
1209
- {
1210
- "epoch": 2.9,
1211
- "learning_rate": 0.0007726465364120782,
1212
- "loss": 0.5179,
1213
- "step": 1780
1214
- },
1215
- {
1216
- "epoch": 2.92,
1217
- "learning_rate": 0.0007708703374777975,
1218
- "loss": 0.52,
1219
- "step": 1790
1220
- },
1221
- {
1222
- "epoch": 2.94,
1223
- "learning_rate": 0.0007690941385435168,
1224
- "loss": 0.4878,
1225
- "step": 1800
1226
- },
1227
- {
1228
- "epoch": 2.94,
1229
- "eval_loss": 0.45261794328689575,
1230
- "eval_runtime": 5.761,
1231
- "eval_samples_per_second": 203.263,
1232
- "eval_steps_per_second": 12.845,
1233
- "step": 1800
1234
- },
1235
- {
1236
- "epoch": 2.95,
1237
- "learning_rate": 0.0007673179396092362,
1238
- "loss": 0.465,
1239
- "step": 1810
1240
- },
1241
- {
1242
- "epoch": 2.97,
1243
- "learning_rate": 0.0007655417406749556,
1244
- "loss": 0.5103,
1245
- "step": 1820
1246
- },
1247
- {
1248
- "epoch": 2.99,
1249
- "learning_rate": 0.0007637655417406749,
1250
- "loss": 0.4947,
1251
- "step": 1830
1252
- },
1253
- {
1254
- "epoch": 3.0,
1255
- "learning_rate": 0.0007619893428063943,
1256
- "loss": 0.4861,
1257
- "step": 1840
1258
- },
1259
- {
1260
- "epoch": 3.02,
1261
- "learning_rate": 0.0007602131438721137,
1262
- "loss": 0.4779,
1263
- "step": 1850
1264
- },
1265
- {
1266
- "epoch": 3.03,
1267
- "learning_rate": 0.0007584369449378331,
1268
- "loss": 0.483,
1269
- "step": 1860
1270
- },
1271
- {
1272
- "epoch": 3.05,
1273
- "learning_rate": 0.0007566607460035524,
1274
- "loss": 0.5363,
1275
- "step": 1870
1276
- },
1277
- {
1278
- "epoch": 3.07,
1279
- "learning_rate": 0.0007548845470692718,
1280
- "loss": 0.5,
1281
- "step": 1880
1282
- },
1283
- {
1284
- "epoch": 3.08,
1285
- "learning_rate": 0.0007531083481349912,
1286
- "loss": 0.492,
1287
- "step": 1890
1288
- },
1289
- {
1290
- "epoch": 3.1,
1291
- "learning_rate": 0.0007513321492007104,
1292
- "loss": 0.4791,
1293
- "step": 1900
1294
- },
1295
- {
1296
- "epoch": 3.1,
1297
- "eval_loss": 0.45457181334495544,
1298
- "eval_runtime": 5.836,
1299
- "eval_samples_per_second": 200.65,
1300
- "eval_steps_per_second": 12.68,
1301
- "step": 1900
1302
- },
1303
- {
1304
- "epoch": 3.12,
1305
- "learning_rate": 0.0007495559502664298,
1306
- "loss": 0.5009,
1307
- "step": 1910
1308
- },
1309
- {
1310
- "epoch": 3.13,
1311
- "learning_rate": 0.0007477797513321492,
1312
- "loss": 0.4826,
1313
- "step": 1920
1314
- },
1315
- {
1316
- "epoch": 3.15,
1317
- "learning_rate": 0.0007460035523978685,
1318
- "loss": 0.4623,
1319
- "step": 1930
1320
- },
1321
- {
1322
- "epoch": 3.16,
1323
- "learning_rate": 0.0007442273534635879,
1324
- "loss": 0.4975,
1325
- "step": 1940
1326
- },
1327
- {
1328
- "epoch": 3.18,
1329
- "learning_rate": 0.0007424511545293073,
1330
- "loss": 0.4911,
1331
- "step": 1950
1332
- },
1333
- {
1334
- "epoch": 3.2,
1335
- "learning_rate": 0.0007406749555950266,
1336
- "loss": 0.4896,
1337
- "step": 1960
1338
- },
1339
- {
1340
- "epoch": 3.21,
1341
- "learning_rate": 0.000738898756660746,
1342
- "loss": 0.5118,
1343
- "step": 1970
1344
- },
1345
- {
1346
- "epoch": 3.23,
1347
- "learning_rate": 0.0007371225577264654,
1348
- "loss": 0.4827,
1349
- "step": 1980
1350
- },
1351
- {
1352
- "epoch": 3.25,
1353
- "learning_rate": 0.0007353463587921847,
1354
- "loss": 0.4929,
1355
- "step": 1990
1356
- },
1357
- {
1358
- "epoch": 3.26,
1359
- "learning_rate": 0.000733570159857904,
1360
- "loss": 0.462,
1361
- "step": 2000
1362
- },
1363
- {
1364
- "epoch": 3.26,
1365
- "eval_loss": 0.4662368595600128,
1366
- "eval_runtime": 5.657,
1367
- "eval_samples_per_second": 206.999,
1368
- "eval_steps_per_second": 13.081,
1369
- "step": 2000
1370
- },
1371
- {
1372
- "epoch": 3.28,
1373
- "learning_rate": 0.0007317939609236234,
1374
- "loss": 0.505,
1375
- "step": 2010
1376
- },
1377
- {
1378
- "epoch": 3.3,
1379
- "learning_rate": 0.0007300177619893427,
1380
- "loss": 0.4871,
1381
- "step": 2020
1382
- },
1383
- {
1384
- "epoch": 3.31,
1385
- "learning_rate": 0.0007282415630550621,
1386
- "loss": 0.4841,
1387
- "step": 2030
1388
- },
1389
- {
1390
- "epoch": 3.33,
1391
- "learning_rate": 0.0007264653641207815,
1392
- "loss": 0.4918,
1393
- "step": 2040
1394
- },
1395
- {
1396
- "epoch": 3.34,
1397
- "learning_rate": 0.0007246891651865008,
1398
- "loss": 0.4559,
1399
- "step": 2050
1400
- },
1401
- {
1402
- "epoch": 3.36,
1403
- "learning_rate": 0.0007229129662522202,
1404
- "loss": 0.4672,
1405
- "step": 2060
1406
- },
1407
- {
1408
- "epoch": 3.38,
1409
- "learning_rate": 0.0007211367673179397,
1410
- "loss": 0.4624,
1411
- "step": 2070
1412
- },
1413
- {
1414
- "epoch": 3.39,
1415
- "learning_rate": 0.000719360568383659,
1416
- "loss": 0.4405,
1417
- "step": 2080
1418
- },
1419
- {
1420
- "epoch": 3.41,
1421
- "learning_rate": 0.0007175843694493784,
1422
- "loss": 0.4842,
1423
- "step": 2090
1424
- },
1425
- {
1426
- "epoch": 3.43,
1427
- "learning_rate": 0.0007158081705150978,
1428
- "loss": 0.4959,
1429
- "step": 2100
1430
- },
1431
- {
1432
- "epoch": 3.43,
1433
- "eval_loss": 0.4539988338947296,
1434
- "eval_runtime": 5.63,
1435
- "eval_samples_per_second": 207.992,
1436
- "eval_steps_per_second": 13.144,
1437
- "step": 2100
1438
- },
1439
- {
1440
- "epoch": 3.44,
1441
- "learning_rate": 0.0007140319715808172,
1442
- "loss": 0.4884,
1443
- "step": 2110
1444
- },
1445
- {
1446
- "epoch": 3.46,
1447
- "learning_rate": 0.0007122557726465364,
1448
- "loss": 0.4808,
1449
- "step": 2120
1450
- },
1451
- {
1452
- "epoch": 3.47,
1453
- "learning_rate": 0.0007104795737122558,
1454
- "loss": 0.4494,
1455
- "step": 2130
1456
- },
1457
- {
1458
- "epoch": 3.49,
1459
- "learning_rate": 0.0007087033747779752,
1460
- "loss": 0.4789,
1461
- "step": 2140
1462
- },
1463
- {
1464
- "epoch": 3.51,
1465
- "learning_rate": 0.0007069271758436945,
1466
- "loss": 0.472,
1467
- "step": 2150
1468
- },
1469
- {
1470
- "epoch": 3.52,
1471
- "learning_rate": 0.0007051509769094139,
1472
- "loss": 0.4731,
1473
- "step": 2160
1474
- },
1475
- {
1476
- "epoch": 3.54,
1477
- "learning_rate": 0.0007033747779751333,
1478
- "loss": 0.4861,
1479
- "step": 2170
1480
- },
1481
- {
1482
- "epoch": 3.56,
1483
- "learning_rate": 0.0007015985790408526,
1484
- "loss": 0.4876,
1485
- "step": 2180
1486
- },
1487
- {
1488
- "epoch": 3.57,
1489
- "learning_rate": 0.000699822380106572,
1490
- "loss": 0.5136,
1491
- "step": 2190
1492
- },
1493
- {
1494
- "epoch": 3.59,
1495
- "learning_rate": 0.0006980461811722914,
1496
- "loss": 0.4671,
1497
- "step": 2200
1498
- },
1499
- {
1500
- "epoch": 3.59,
1501
- "eval_loss": 0.4472385346889496,
1502
- "eval_runtime": 5.617,
1503
- "eval_samples_per_second": 208.473,
1504
- "eval_steps_per_second": 13.174,
1505
- "step": 2200
1506
- },
1507
- {
1508
- "epoch": 3.61,
1509
- "learning_rate": 0.0006962699822380107,
1510
- "loss": 0.4671,
1511
- "step": 2210
1512
- },
1513
- {
1514
- "epoch": 3.62,
1515
- "learning_rate": 0.0006944937833037301,
1516
- "loss": 0.4896,
1517
- "step": 2220
1518
- },
1519
- {
1520
- "epoch": 3.64,
1521
- "learning_rate": 0.0006927175843694495,
1522
- "loss": 0.4768,
1523
- "step": 2230
1524
- },
1525
- {
1526
- "epoch": 3.65,
1527
- "learning_rate": 0.0006909413854351687,
1528
- "loss": 0.5263,
1529
- "step": 2240
1530
- },
1531
- {
1532
- "epoch": 3.67,
1533
- "learning_rate": 0.0006891651865008881,
1534
- "loss": 0.4718,
1535
- "step": 2250
1536
- },
1537
- {
1538
- "epoch": 3.69,
1539
- "learning_rate": 0.0006873889875666075,
1540
- "loss": 0.4854,
1541
- "step": 2260
1542
- },
1543
- {
1544
- "epoch": 3.7,
1545
- "learning_rate": 0.0006856127886323268,
1546
- "loss": 0.4614,
1547
- "step": 2270
1548
- },
1549
- {
1550
- "epoch": 3.72,
1551
- "learning_rate": 0.0006838365896980462,
1552
- "loss": 0.467,
1553
- "step": 2280
1554
- },
1555
- {
1556
- "epoch": 3.74,
1557
- "learning_rate": 0.0006820603907637656,
1558
- "loss": 0.4824,
1559
- "step": 2290
1560
- },
1561
- {
1562
- "epoch": 3.75,
1563
- "learning_rate": 0.0006802841918294849,
1564
- "loss": 0.479,
1565
- "step": 2300
1566
- },
1567
- {
1568
- "epoch": 3.75,
1569
- "eval_loss": 0.45660024881362915,
1570
- "eval_runtime": 5.623,
1571
- "eval_samples_per_second": 208.251,
1572
- "eval_steps_per_second": 13.16,
1573
- "step": 2300
1574
- },
1575
- {
1576
- "epoch": 3.77,
1577
- "learning_rate": 0.0006785079928952043,
1578
- "loss": 0.4761,
1579
- "step": 2310
1580
- },
1581
- {
1582
- "epoch": 3.78,
1583
- "learning_rate": 0.0006767317939609237,
1584
- "loss": 0.4789,
1585
- "step": 2320
1586
- },
1587
- {
1588
- "epoch": 3.8,
1589
- "learning_rate": 0.000674955595026643,
1590
- "loss": 0.4888,
1591
- "step": 2330
1592
- },
1593
- {
1594
- "epoch": 3.82,
1595
- "learning_rate": 0.0006731793960923623,
1596
- "loss": 0.4774,
1597
- "step": 2340
1598
- },
1599
- {
1600
- "epoch": 3.83,
1601
- "learning_rate": 0.0006714031971580817,
1602
- "loss": 0.4621,
1603
- "step": 2350
1604
- },
1605
- {
1606
- "epoch": 3.85,
1607
- "learning_rate": 0.0006696269982238011,
1608
- "loss": 0.4804,
1609
- "step": 2360
1610
- },
1611
- {
1612
- "epoch": 3.87,
1613
- "learning_rate": 0.0006678507992895204,
1614
- "loss": 0.466,
1615
- "step": 2370
1616
- },
1617
- {
1618
- "epoch": 3.88,
1619
- "learning_rate": 0.0006660746003552398,
1620
- "loss": 0.4846,
1621
- "step": 2380
1622
- },
1623
- {
1624
- "epoch": 3.9,
1625
- "learning_rate": 0.0006642984014209592,
1626
- "loss": 0.4631,
1627
- "step": 2390
1628
- },
1629
- {
1630
- "epoch": 3.92,
1631
- "learning_rate": 0.0006625222024866785,
1632
- "loss": 0.4568,
1633
- "step": 2400
1634
- },
1635
- {
1636
- "epoch": 3.92,
1637
- "eval_loss": 0.4491855502128601,
1638
- "eval_runtime": 5.621,
1639
- "eval_samples_per_second": 208.325,
1640
- "eval_steps_per_second": 13.165,
1641
- "step": 2400
1642
- },
1643
- {
1644
- "epoch": 3.93,
1645
- "learning_rate": 0.0006607460035523979,
1646
- "loss": 0.5022,
1647
- "step": 2410
1648
- },
1649
- {
1650
- "epoch": 3.95,
1651
- "learning_rate": 0.0006589698046181173,
1652
- "loss": 0.4546,
1653
- "step": 2420
1654
- },
1655
- {
1656
- "epoch": 3.96,
1657
- "learning_rate": 0.0006571936056838366,
1658
- "loss": 0.4666,
1659
- "step": 2430
1660
- },
1661
- {
1662
- "epoch": 3.98,
1663
- "learning_rate": 0.000655417406749556,
1664
- "loss": 0.4915,
1665
- "step": 2440
1666
- },
1667
- {
1668
- "epoch": 4.0,
1669
- "learning_rate": 0.0006536412078152753,
1670
- "loss": 0.4763,
1671
- "step": 2450
1672
- },
1673
- {
1674
- "epoch": 4.01,
1675
- "learning_rate": 0.0006518650088809946,
1676
- "loss": 0.4625,
1677
- "step": 2460
1678
- },
1679
- {
1680
- "epoch": 4.03,
1681
- "learning_rate": 0.000650088809946714,
1682
- "loss": 0.4447,
1683
- "step": 2470
1684
- },
1685
- {
1686
- "epoch": 4.05,
1687
- "learning_rate": 0.0006483126110124334,
1688
- "loss": 0.4559,
1689
- "step": 2480
1690
- },
1691
- {
1692
- "epoch": 4.06,
1693
- "learning_rate": 0.0006465364120781527,
1694
- "loss": 0.4881,
1695
- "step": 2490
1696
- },
1697
- {
1698
- "epoch": 4.08,
1699
- "learning_rate": 0.0006447602131438721,
1700
- "loss": 0.472,
1701
- "step": 2500
1702
- },
1703
- {
1704
- "epoch": 4.08,
1705
- "eval_loss": 0.45457056164741516,
1706
- "eval_runtime": 5.638,
1707
- "eval_samples_per_second": 207.697,
1708
- "eval_steps_per_second": 13.125,
1709
- "step": 2500
1710
- },
1711
- {
1712
- "epoch": 4.09,
1713
- "learning_rate": 0.0006429840142095915,
1714
- "loss": 0.4606,
1715
- "step": 2510
1716
- },
1717
- {
1718
- "epoch": 4.11,
1719
- "learning_rate": 0.0006412078152753108,
1720
- "loss": 0.4599,
1721
- "step": 2520
1722
- },
1723
- {
1724
- "epoch": 4.13,
1725
- "learning_rate": 0.0006394316163410302,
1726
- "loss": 0.4846,
1727
- "step": 2530
1728
- },
1729
- {
1730
- "epoch": 4.14,
1731
- "learning_rate": 0.0006376554174067496,
1732
- "loss": 0.4575,
1733
- "step": 2540
1734
- },
1735
- {
1736
- "epoch": 4.16,
1737
- "learning_rate": 0.0006358792184724689,
1738
- "loss": 0.4758,
1739
- "step": 2550
1740
- },
1741
- {
1742
- "epoch": 4.18,
1743
- "learning_rate": 0.0006341030195381882,
1744
- "loss": 0.4963,
1745
- "step": 2560
1746
- },
1747
- {
1748
- "epoch": 4.19,
1749
- "learning_rate": 0.0006323268206039076,
1750
- "loss": 0.4315,
1751
- "step": 2570
1752
- },
1753
- {
1754
- "epoch": 4.21,
1755
- "learning_rate": 0.0006305506216696269,
1756
- "loss": 0.4759,
1757
- "step": 2580
1758
- },
1759
- {
1760
- "epoch": 4.23,
1761
- "learning_rate": 0.0006287744227353463,
1762
- "loss": 0.4627,
1763
- "step": 2590
1764
- },
1765
- {
1766
- "epoch": 4.24,
1767
- "learning_rate": 0.0006269982238010657,
1768
- "loss": 0.4411,
1769
- "step": 2600
1770
- },
1771
- {
1772
- "epoch": 4.24,
1773
- "eval_loss": 0.4529522657394409,
1774
- "eval_runtime": 5.693,
1775
- "eval_samples_per_second": 205.69,
1776
- "eval_steps_per_second": 12.998,
1777
- "step": 2600
1778
- },
1779
- {
1780
- "epoch": 4.26,
1781
- "learning_rate": 0.0006252220248667851,
1782
- "loss": 0.4747,
1783
- "step": 2610
1784
- },
1785
- {
1786
- "epoch": 4.27,
1787
- "learning_rate": 0.0006234458259325044,
1788
- "loss": 0.4977,
1789
- "step": 2620
1790
- },
1791
- {
1792
- "epoch": 4.29,
1793
- "learning_rate": 0.0006216696269982238,
1794
- "loss": 0.4606,
1795
- "step": 2630
1796
- },
1797
- {
1798
- "epoch": 4.31,
1799
- "learning_rate": 0.0006198934280639432,
1800
- "loss": 0.4678,
1801
- "step": 2640
1802
- },
1803
- {
1804
- "epoch": 4.32,
1805
- "learning_rate": 0.0006181172291296625,
1806
- "loss": 0.4665,
1807
- "step": 2650
1808
- },
1809
- {
1810
- "epoch": 4.34,
1811
- "learning_rate": 0.0006163410301953819,
1812
- "loss": 0.4571,
1813
- "step": 2660
1814
- },
1815
- {
1816
- "epoch": 4.36,
1817
- "learning_rate": 0.0006145648312611012,
1818
- "loss": 0.4412,
1819
- "step": 2670
1820
- },
1821
- {
1822
- "epoch": 4.37,
1823
- "learning_rate": 0.0006127886323268205,
1824
- "loss": 0.4521,
1825
- "step": 2680
1826
- },
1827
- {
1828
- "epoch": 4.39,
1829
- "learning_rate": 0.00061101243339254,
1830
- "loss": 0.4387,
1831
- "step": 2690
1832
- },
1833
- {
1834
- "epoch": 4.4,
1835
- "learning_rate": 0.0006092362344582594,
1836
- "loss": 0.4636,
1837
- "step": 2700
1838
- },
1839
- {
1840
- "epoch": 4.4,
1841
- "eval_loss": 0.44287770986557007,
1842
- "eval_runtime": 5.817,
1843
- "eval_samples_per_second": 201.306,
1844
- "eval_steps_per_second": 12.721,
1845
- "step": 2700
1846
- },
1847
- {
1848
- "epoch": 4.42,
1849
- "learning_rate": 0.0006074600355239787,
1850
- "loss": 0.4726,
1851
- "step": 2710
1852
- },
1853
- {
1854
- "epoch": 4.44,
1855
- "learning_rate": 0.0006056838365896981,
1856
- "loss": 0.4297,
1857
- "step": 2720
1858
- },
1859
- {
1860
- "epoch": 4.45,
1861
- "learning_rate": 0.0006039076376554175,
1862
- "loss": 0.4889,
1863
- "step": 2730
1864
- },
1865
- {
1866
- "epoch": 4.47,
1867
- "learning_rate": 0.0006021314387211368,
1868
- "loss": 0.439,
1869
- "step": 2740
1870
- },
1871
- {
1872
- "epoch": 4.49,
1873
- "learning_rate": 0.0006003552397868562,
1874
- "loss": 0.4527,
1875
- "step": 2750
1876
- },
1877
- {
1878
- "epoch": 4.5,
1879
- "learning_rate": 0.0005985790408525756,
1880
- "loss": 0.5186,
1881
- "step": 2760
1882
- },
1883
- {
1884
- "epoch": 4.52,
1885
- "learning_rate": 0.0005968028419182949,
1886
- "loss": 0.4882,
1887
- "step": 2770
1888
- },
1889
- {
1890
- "epoch": 4.54,
1891
- "learning_rate": 0.0005950266429840142,
1892
- "loss": 0.4759,
1893
- "step": 2780
1894
- },
1895
- {
1896
- "epoch": 4.55,
1897
- "learning_rate": 0.0005932504440497336,
1898
- "loss": 0.4744,
1899
- "step": 2790
1900
- },
1901
- {
1902
- "epoch": 4.57,
1903
- "learning_rate": 0.0005914742451154529,
1904
- "loss": 0.4512,
1905
- "step": 2800
1906
- },
1907
- {
1908
- "epoch": 4.57,
1909
- "eval_loss": 0.4442508816719055,
1910
- "eval_runtime": 5.775,
1911
- "eval_samples_per_second": 202.77,
1912
- "eval_steps_per_second": 12.814,
1913
- "step": 2800
1914
- },
1915
- {
1916
- "epoch": 4.58,
1917
- "learning_rate": 0.0005896980461811723,
1918
- "loss": 0.4589,
1919
- "step": 2810
1920
- },
1921
- {
1922
- "epoch": 4.6,
1923
- "learning_rate": 0.0005879218472468917,
1924
- "loss": 0.4497,
1925
- "step": 2820
1926
- },
1927
- {
1928
- "epoch": 4.62,
1929
- "learning_rate": 0.0005861456483126111,
1930
- "loss": 0.4618,
1931
- "step": 2830
1932
- },
1933
- {
1934
- "epoch": 4.63,
1935
- "learning_rate": 0.0005843694493783304,
1936
- "loss": 0.4897,
1937
- "step": 2840
1938
- },
1939
- {
1940
- "epoch": 4.65,
1941
- "learning_rate": 0.0005825932504440498,
1942
- "loss": 0.4577,
1943
- "step": 2850
1944
- },
1945
- {
1946
- "epoch": 4.67,
1947
- "learning_rate": 0.0005808170515097692,
1948
- "loss": 0.474,
1949
- "step": 2860
1950
- },
1951
- {
1952
- "epoch": 4.68,
1953
- "learning_rate": 0.0005790408525754885,
1954
- "loss": 0.4409,
1955
- "step": 2870
1956
- },
1957
- {
1958
- "epoch": 4.7,
1959
- "learning_rate": 0.0005772646536412079,
1960
- "loss": 0.4716,
1961
- "step": 2880
1962
- },
1963
- {
1964
- "epoch": 4.71,
1965
- "learning_rate": 0.0005754884547069273,
1966
- "loss": 0.4833,
1967
- "step": 2890
1968
- },
1969
- {
1970
- "epoch": 4.73,
1971
- "learning_rate": 0.0005737122557726465,
1972
- "loss": 0.4487,
1973
- "step": 2900
1974
- },
1975
- {
1976
- "epoch": 4.73,
1977
- "eval_loss": 0.439519464969635,
1978
- "eval_runtime": 5.776,
1979
- "eval_samples_per_second": 202.735,
1980
- "eval_steps_per_second": 12.812,
1981
- "step": 2900
1982
- },
1983
- {
1984
- "epoch": 4.75,
1985
- "learning_rate": 0.0005719360568383659,
1986
- "loss": 0.4426,
1987
- "step": 2910
1988
- },
1989
- {
1990
- "epoch": 4.76,
1991
- "learning_rate": 0.0005701598579040853,
1992
- "loss": 0.4588,
1993
- "step": 2920
1994
- },
1995
- {
1996
- "epoch": 4.78,
1997
- "learning_rate": 0.0005683836589698046,
1998
- "loss": 0.4676,
1999
- "step": 2930
2000
- },
2001
- {
2002
- "epoch": 4.8,
2003
- "learning_rate": 0.000566607460035524,
2004
- "loss": 0.5016,
2005
- "step": 2940
2006
- },
2007
- {
2008
- "epoch": 4.81,
2009
- "learning_rate": 0.0005648312611012434,
2010
- "loss": 0.4464,
2011
- "step": 2950
2012
- },
2013
- {
2014
- "epoch": 4.83,
2015
- "learning_rate": 0.0005630550621669627,
2016
- "loss": 0.4589,
2017
- "step": 2960
2018
- },
2019
- {
2020
- "epoch": 4.85,
2021
- "learning_rate": 0.0005612788632326821,
2022
- "loss": 0.4499,
2023
- "step": 2970
2024
- },
2025
- {
2026
- "epoch": 4.86,
2027
- "learning_rate": 0.0005595026642984015,
2028
- "loss": 0.4361,
2029
- "step": 2980
2030
- },
2031
- {
2032
- "epoch": 4.88,
2033
- "learning_rate": 0.0005577264653641208,
2034
- "loss": 0.4437,
2035
- "step": 2990
2036
- },
2037
- {
2038
- "epoch": 4.89,
2039
- "learning_rate": 0.0005559502664298401,
2040
- "loss": 0.4724,
2041
- "step": 3000
2042
- },
2043
- {
2044
- "epoch": 4.89,
2045
- "eval_loss": 0.43853452801704407,
2046
- "eval_runtime": 5.773,
2047
- "eval_samples_per_second": 202.84,
2048
- "eval_steps_per_second": 12.818,
2049
- "step": 3000
2050
- },
2051
- {
2052
- "epoch": 4.91,
2053
- "learning_rate": 0.0005541740674955595,
2054
- "loss": 0.4927,
2055
- "step": 3010
2056
- },
2057
- {
2058
- "epoch": 4.93,
2059
- "learning_rate": 0.0005523978685612788,
2060
- "loss": 0.4705,
2061
- "step": 3020
2062
- },
2063
- {
2064
- "epoch": 4.94,
2065
- "learning_rate": 0.0005506216696269982,
2066
- "loss": 0.4356,
2067
- "step": 3030
2068
- },
2069
- {
2070
- "epoch": 4.96,
2071
- "learning_rate": 0.0005488454706927176,
2072
- "loss": 0.5064,
2073
- "step": 3040
2074
- },
2075
- {
2076
- "epoch": 4.98,
2077
- "learning_rate": 0.0005470692717584369,
2078
- "loss": 0.4611,
2079
- "step": 3050
2080
- },
2081
- {
2082
- "epoch": 4.99,
2083
- "learning_rate": 0.0005452930728241563,
2084
- "loss": 0.4797,
2085
- "step": 3060
2086
- },
2087
- {
2088
- "epoch": 5.01,
2089
- "learning_rate": 0.0005435168738898757,
2090
- "loss": 0.432,
2091
- "step": 3070
2092
- },
2093
- {
2094
- "epoch": 5.02,
2095
- "learning_rate": 0.0005417406749555951,
2096
- "loss": 0.4598,
2097
- "step": 3080
2098
- },
2099
- {
2100
- "epoch": 5.04,
2101
- "learning_rate": 0.0005399644760213144,
2102
- "loss": 0.4524,
2103
- "step": 3090
2104
- },
2105
- {
2106
- "epoch": 5.06,
2107
- "learning_rate": 0.0005381882770870338,
2108
- "loss": 0.4553,
2109
- "step": 3100
2110
- },
2111
- {
2112
- "epoch": 5.06,
2113
- "eval_loss": 0.43742743134498596,
2114
- "eval_runtime": 5.776,
2115
- "eval_samples_per_second": 202.735,
2116
- "eval_steps_per_second": 12.812,
2117
- "step": 3100
2118
- },
2119
- {
2120
- "epoch": 5.07,
2121
- "learning_rate": 0.0005364120781527531,
2122
- "loss": 0.4454,
2123
- "step": 3110
2124
- },
2125
- {
2126
- "epoch": 5.09,
2127
- "learning_rate": 0.0005346358792184724,
2128
- "loss": 0.4436,
2129
- "step": 3120
2130
- },
2131
- {
2132
- "epoch": 5.11,
2133
- "learning_rate": 0.0005328596802841918,
2134
- "loss": 0.4414,
2135
- "step": 3130
2136
- },
2137
- {
2138
- "epoch": 5.12,
2139
- "learning_rate": 0.0005310834813499112,
2140
- "loss": 0.4395,
2141
- "step": 3140
2142
- },
2143
- {
2144
- "epoch": 5.14,
2145
- "learning_rate": 0.0005293072824156305,
2146
- "loss": 0.4259,
2147
- "step": 3150
2148
- },
2149
- {
2150
- "epoch": 5.15,
2151
- "learning_rate": 0.0005275310834813499,
2152
- "loss": 0.4486,
2153
- "step": 3160
2154
- },
2155
- {
2156
- "epoch": 5.17,
2157
- "learning_rate": 0.0005257548845470693,
2158
- "loss": 0.4612,
2159
- "step": 3170
2160
- },
2161
- {
2162
- "epoch": 5.19,
2163
- "learning_rate": 0.0005239786856127886,
2164
- "loss": 0.4726,
2165
- "step": 3180
2166
- },
2167
- {
2168
- "epoch": 5.2,
2169
- "learning_rate": 0.000522202486678508,
2170
- "loss": 0.4725,
2171
- "step": 3190
2172
- },
2173
- {
2174
- "epoch": 5.22,
2175
- "learning_rate": 0.0005204262877442274,
2176
- "loss": 0.4798,
2177
- "step": 3200
2178
- },
2179
- {
2180
- "epoch": 5.22,
2181
- "eval_loss": 0.4378213882446289,
2182
- "eval_runtime": 5.571,
2183
- "eval_samples_per_second": 210.195,
2184
- "eval_steps_per_second": 13.283,
2185
- "step": 3200
2186
- },
2187
- {
2188
- "epoch": 5.24,
2189
- "learning_rate": 0.0005186500888099467,
2190
- "loss": 0.446,
2191
- "step": 3210
2192
- },
2193
- {
2194
- "epoch": 5.25,
2195
- "learning_rate": 0.000516873889875666,
2196
- "loss": 0.4919,
2197
- "step": 3220
2198
- },
2199
- {
2200
- "epoch": 5.27,
2201
- "learning_rate": 0.0005150976909413854,
2202
- "loss": 0.4294,
2203
- "step": 3230
2204
- },
2205
- {
2206
- "epoch": 5.29,
2207
- "learning_rate": 0.0005133214920071047,
2208
- "loss": 0.474,
2209
- "step": 3240
2210
- },
2211
- {
2212
- "epoch": 5.3,
2213
- "learning_rate": 0.0005115452930728241,
2214
- "loss": 0.4951,
2215
- "step": 3250
2216
- },
2217
- {
2218
- "epoch": 5.32,
2219
- "learning_rate": 0.0005097690941385435,
2220
- "loss": 0.4588,
2221
- "step": 3260
2222
- },
2223
- {
2224
- "epoch": 5.33,
2225
- "learning_rate": 0.0005079928952042628,
2226
- "loss": 0.4543,
2227
- "step": 3270
2228
- },
2229
- {
2230
- "epoch": 5.35,
2231
- "learning_rate": 0.0005062166962699822,
2232
- "loss": 0.4488,
2233
- "step": 3280
2234
- },
2235
- {
2236
- "epoch": 5.37,
2237
- "learning_rate": 0.0005044404973357016,
2238
- "loss": 0.4653,
2239
- "step": 3290
2240
- },
2241
- {
2242
- "epoch": 5.38,
2243
- "learning_rate": 0.0005026642984014209,
2244
- "loss": 0.4735,
2245
- "step": 3300
2246
- },
2247
- {
2248
- "epoch": 5.38,
2249
- "eval_loss": 0.43299299478530884,
2250
- "eval_runtime": 5.567,
2251
- "eval_samples_per_second": 210.346,
2252
- "eval_steps_per_second": 13.293,
2253
- "step": 3300
2254
- },
2255
- {
2256
- "epoch": 5.4,
2257
- "learning_rate": 0.0005008880994671403,
2258
- "loss": 0.4339,
2259
- "step": 3310
2260
- },
2261
- {
2262
- "epoch": 5.42,
2263
- "learning_rate": 0.0004991119005328598,
2264
- "loss": 0.4268,
2265
- "step": 3320
2266
- },
2267
- {
2268
- "epoch": 5.43,
2269
- "learning_rate": 0.000497335701598579,
2270
- "loss": 0.4174,
2271
- "step": 3330
2272
- },
2273
- {
2274
- "epoch": 5.45,
2275
- "learning_rate": 0.0004955595026642984,
2276
- "loss": 0.4805,
2277
- "step": 3340
2278
- },
2279
- {
2280
- "epoch": 5.46,
2281
- "learning_rate": 0.0004937833037300178,
2282
- "loss": 0.4537,
2283
- "step": 3350
2284
- },
2285
- {
2286
- "epoch": 5.48,
2287
- "learning_rate": 0.0004920071047957371,
2288
- "loss": 0.4559,
2289
- "step": 3360
2290
- },
2291
- {
2292
- "epoch": 5.5,
2293
- "learning_rate": 0.0004902309058614565,
2294
- "loss": 0.4401,
2295
- "step": 3370
2296
- },
2297
- {
2298
- "epoch": 5.51,
2299
- "learning_rate": 0.0004884547069271759,
2300
- "loss": 0.4571,
2301
- "step": 3380
2302
- },
2303
- {
2304
- "epoch": 5.53,
2305
- "learning_rate": 0.0004866785079928952,
2306
- "loss": 0.4396,
2307
- "step": 3390
2308
- },
2309
- {
2310
- "epoch": 5.55,
2311
- "learning_rate": 0.0004849023090586146,
2312
- "loss": 0.4203,
2313
- "step": 3400
2314
- },
2315
- {
2316
- "epoch": 5.55,
2317
- "eval_loss": 0.4426453411579132,
2318
- "eval_runtime": 5.572,
2319
- "eval_samples_per_second": 210.157,
2320
- "eval_steps_per_second": 13.281,
2321
- "step": 3400
2322
- },
2323
- {
2324
- "epoch": 5.56,
2325
- "learning_rate": 0.0004831261101243339,
2326
- "loss": 0.4444,
2327
- "step": 3410
2328
- },
2329
- {
2330
- "epoch": 5.58,
2331
- "learning_rate": 0.00048134991119005326,
2332
- "loss": 0.4433,
2333
- "step": 3420
2334
- },
2335
- {
2336
- "epoch": 5.6,
2337
- "learning_rate": 0.00047957371225577266,
2338
- "loss": 0.4328,
2339
- "step": 3430
2340
- },
2341
- {
2342
- "epoch": 5.61,
2343
- "learning_rate": 0.000477797513321492,
2344
- "loss": 0.4257,
2345
- "step": 3440
2346
- },
2347
- {
2348
- "epoch": 5.63,
2349
- "learning_rate": 0.00047602131438721133,
2350
- "loss": 0.4804,
2351
- "step": 3450
2352
- },
2353
- {
2354
- "epoch": 5.64,
2355
- "learning_rate": 0.00047424511545293073,
2356
- "loss": 0.4396,
2357
- "step": 3460
2358
- },
2359
- {
2360
- "epoch": 5.66,
2361
- "learning_rate": 0.00047246891651865007,
2362
- "loss": 0.4498,
2363
- "step": 3470
2364
- },
2365
- {
2366
- "epoch": 5.68,
2367
- "learning_rate": 0.00047069271758436946,
2368
- "loss": 0.4799,
2369
- "step": 3480
2370
- },
2371
- {
2372
- "epoch": 5.69,
2373
- "learning_rate": 0.00046891651865008885,
2374
- "loss": 0.4736,
2375
- "step": 3490
2376
- },
2377
- {
2378
- "epoch": 5.71,
2379
- "learning_rate": 0.0004671403197158082,
2380
- "loss": 0.4252,
2381
- "step": 3500
2382
- },
2383
- {
2384
- "epoch": 5.71,
2385
- "eval_loss": 0.441283255815506,
2386
- "eval_runtime": 5.574,
2387
- "eval_samples_per_second": 210.082,
2388
- "eval_steps_per_second": 13.276,
2389
- "step": 3500
2390
- },
2391
- {
2392
- "epoch": 5.73,
2393
- "learning_rate": 0.0004653641207815276,
2394
- "loss": 0.4661,
2395
- "step": 3510
2396
- },
2397
- {
2398
- "epoch": 5.74,
2399
- "learning_rate": 0.0004635879218472469,
2400
- "loss": 0.4545,
2401
- "step": 3520
2402
- },
2403
- {
2404
- "epoch": 5.76,
2405
- "learning_rate": 0.00046181172291296627,
2406
- "loss": 0.4573,
2407
- "step": 3530
2408
- },
2409
- {
2410
- "epoch": 5.77,
2411
- "learning_rate": 0.00046003552397868566,
2412
- "loss": 0.4464,
2413
- "step": 3540
2414
- },
2415
- {
2416
- "epoch": 5.79,
2417
- "learning_rate": 0.000458259325044405,
2418
- "loss": 0.4089,
2419
- "step": 3550
2420
- },
2421
- {
2422
- "epoch": 5.81,
2423
- "learning_rate": 0.00045648312611012434,
2424
- "loss": 0.4622,
2425
- "step": 3560
2426
- },
2427
- {
2428
- "epoch": 5.82,
2429
- "learning_rate": 0.00045470692717584373,
2430
- "loss": 0.4611,
2431
- "step": 3570
2432
- },
2433
- {
2434
- "epoch": 5.84,
2435
- "learning_rate": 0.00045293072824156307,
2436
- "loss": 0.4524,
2437
- "step": 3580
2438
- },
2439
- {
2440
- "epoch": 5.86,
2441
- "learning_rate": 0.0004511545293072824,
2442
- "loss": 0.439,
2443
- "step": 3590
2444
- },
2445
- {
2446
- "epoch": 5.87,
2447
- "learning_rate": 0.0004493783303730018,
2448
- "loss": 0.4246,
2449
- "step": 3600
2450
- },
2451
- {
2452
- "epoch": 5.87,
2453
- "eval_loss": 0.4371810257434845,
2454
- "eval_runtime": 5.588,
2455
- "eval_samples_per_second": 209.555,
2456
- "eval_steps_per_second": 13.243,
2457
- "step": 3600
2458
- },
2459
- {
2460
- "epoch": 5.89,
2461
- "learning_rate": 0.00044760213143872114,
2462
- "loss": 0.4645,
2463
- "step": 3610
2464
- },
2465
- {
2466
- "epoch": 5.91,
2467
- "learning_rate": 0.0004458259325044405,
2468
- "loss": 0.454,
2469
- "step": 3620
2470
- },
2471
- {
2472
- "epoch": 5.92,
2473
- "learning_rate": 0.0004440497335701599,
2474
- "loss": 0.45,
2475
- "step": 3630
2476
- },
2477
- {
2478
- "epoch": 5.94,
2479
- "learning_rate": 0.0004422735346358792,
2480
- "loss": 0.439,
2481
- "step": 3640
2482
- },
2483
- {
2484
- "epoch": 5.95,
2485
- "learning_rate": 0.0004404973357015986,
2486
- "loss": 0.4276,
2487
- "step": 3650
2488
- },
2489
- {
2490
- "epoch": 5.97,
2491
- "learning_rate": 0.00043872113676731795,
2492
- "loss": 0.4537,
2493
- "step": 3660
2494
- },
2495
- {
2496
- "epoch": 5.99,
2497
- "learning_rate": 0.0004369449378330373,
2498
- "loss": 0.4647,
2499
- "step": 3670
2500
- },
2501
- {
2502
- "epoch": 6.0,
2503
- "learning_rate": 0.0004351687388987567,
2504
- "loss": 0.4192,
2505
- "step": 3680
2506
- },
2507
- {
2508
- "epoch": 6.02,
2509
- "learning_rate": 0.000433392539964476,
2510
- "loss": 0.4178,
2511
- "step": 3690
2512
- },
2513
- {
2514
- "epoch": 6.04,
2515
- "learning_rate": 0.00043161634103019536,
2516
- "loss": 0.4158,
2517
- "step": 3700
2518
- },
2519
- {
2520
- "epoch": 6.04,
2521
- "eval_loss": 0.43516233563423157,
2522
- "eval_runtime": 5.574,
2523
- "eval_samples_per_second": 210.082,
2524
- "eval_steps_per_second": 13.276,
2525
- "step": 3700
2526
- },
2527
- {
2528
- "epoch": 6.05,
2529
- "learning_rate": 0.00042984014209591475,
2530
- "loss": 0.4581,
2531
- "step": 3710
2532
- },
2533
- {
2534
- "epoch": 6.07,
2535
- "learning_rate": 0.0004280639431616341,
2536
- "loss": 0.4422,
2537
- "step": 3720
2538
- },
2539
- {
2540
- "epoch": 6.08,
2541
- "learning_rate": 0.00042628774422735343,
2542
- "loss": 0.4191,
2543
- "step": 3730
2544
- },
2545
- {
2546
- "epoch": 6.1,
2547
- "learning_rate": 0.0004245115452930728,
2548
- "loss": 0.434,
2549
- "step": 3740
2550
- },
2551
- {
2552
- "epoch": 6.12,
2553
- "learning_rate": 0.00042273534635879216,
2554
- "loss": 0.416,
2555
- "step": 3750
2556
- },
2557
- {
2558
- "epoch": 6.13,
2559
- "learning_rate": 0.00042095914742451156,
2560
- "loss": 0.4116,
2561
- "step": 3760
2562
- },
2563
- {
2564
- "epoch": 6.15,
2565
- "learning_rate": 0.0004191829484902309,
2566
- "loss": 0.4545,
2567
- "step": 3770
2568
- },
2569
- {
2570
- "epoch": 6.17,
2571
- "learning_rate": 0.00041740674955595023,
2572
- "loss": 0.4507,
2573
- "step": 3780
2574
- },
2575
- {
2576
- "epoch": 6.18,
2577
- "learning_rate": 0.0004156305506216697,
2578
- "loss": 0.4345,
2579
- "step": 3790
2580
- },
2581
- {
2582
- "epoch": 6.2,
2583
- "learning_rate": 0.000413854351687389,
2584
- "loss": 0.4603,
2585
- "step": 3800
2586
- },
2587
- {
2588
- "epoch": 6.2,
2589
- "eval_loss": 0.43294015526771545,
2590
- "eval_runtime": 5.595,
2591
- "eval_samples_per_second": 209.293,
2592
- "eval_steps_per_second": 13.226,
2593
- "step": 3800
2594
- },
2595
- {
2596
- "epoch": 6.22,
2597
- "learning_rate": 0.00041207815275310836,
2598
- "loss": 0.4586,
2599
- "step": 3810
2600
- },
2601
- {
2602
- "epoch": 6.23,
2603
- "learning_rate": 0.00041030195381882775,
2604
- "loss": 0.4205,
2605
- "step": 3820
2606
- },
2607
- {
2608
- "epoch": 6.25,
2609
- "learning_rate": 0.0004085257548845471,
2610
- "loss": 0.4412,
2611
- "step": 3830
2612
- },
2613
- {
2614
- "epoch": 6.26,
2615
- "learning_rate": 0.00040674955595026643,
2616
- "loss": 0.4632,
2617
- "step": 3840
2618
- },
2619
- {
2620
- "epoch": 6.28,
2621
- "learning_rate": 0.0004049733570159858,
2622
- "loss": 0.4498,
2623
- "step": 3850
2624
- },
2625
- {
2626
- "epoch": 6.3,
2627
- "learning_rate": 0.00040319715808170517,
2628
- "loss": 0.4457,
2629
- "step": 3860
2630
- },
2631
- {
2632
- "epoch": 6.31,
2633
- "learning_rate": 0.00040142095914742456,
2634
- "loss": 0.4461,
2635
- "step": 3870
2636
- },
2637
- {
2638
- "epoch": 6.33,
2639
- "learning_rate": 0.0003996447602131439,
2640
- "loss": 0.4239,
2641
- "step": 3880
2642
- },
2643
- {
2644
- "epoch": 6.35,
2645
- "learning_rate": 0.00039786856127886324,
2646
- "loss": 0.4439,
2647
- "step": 3890
2648
- },
2649
- {
2650
- "epoch": 6.36,
2651
- "learning_rate": 0.00039609236234458263,
2652
- "loss": 0.4095,
2653
- "step": 3900
2654
- },
2655
- {
2656
- "epoch": 6.36,
2657
- "eval_loss": 0.4296843111515045,
2658
- "eval_runtime": 5.575,
2659
- "eval_samples_per_second": 210.044,
2660
- "eval_steps_per_second": 13.273,
2661
- "step": 3900
2662
- },
2663
- {
2664
- "epoch": 6.38,
2665
- "learning_rate": 0.00039431616341030197,
2666
- "loss": 0.4338,
2667
- "step": 3910
2668
- },
2669
- {
2670
- "epoch": 6.39,
2671
- "learning_rate": 0.0003925399644760213,
2672
- "loss": 0.4279,
2673
- "step": 3920
2674
- },
2675
- {
2676
- "epoch": 6.41,
2677
- "learning_rate": 0.0003907637655417407,
2678
- "loss": 0.4214,
2679
- "step": 3930
2680
- },
2681
- {
2682
- "epoch": 6.43,
2683
- "learning_rate": 0.00038898756660746004,
2684
- "loss": 0.4151,
2685
- "step": 3940
2686
- },
2687
- {
2688
- "epoch": 6.44,
2689
- "learning_rate": 0.0003872113676731794,
2690
- "loss": 0.4393,
2691
- "step": 3950
2692
- },
2693
- {
2694
- "epoch": 6.46,
2695
- "learning_rate": 0.0003854351687388988,
2696
- "loss": 0.4259,
2697
- "step": 3960
2698
- },
2699
- {
2700
- "epoch": 6.48,
2701
- "learning_rate": 0.0003836589698046181,
2702
- "loss": 0.4354,
2703
- "step": 3970
2704
- },
2705
- {
2706
- "epoch": 6.49,
2707
- "learning_rate": 0.00038188277087033745,
2708
- "loss": 0.4135,
2709
- "step": 3980
2710
- },
2711
- {
2712
- "epoch": 6.51,
2713
- "learning_rate": 0.00038010657193605685,
2714
- "loss": 0.4317,
2715
- "step": 3990
2716
- },
2717
- {
2718
- "epoch": 6.53,
2719
- "learning_rate": 0.0003783303730017762,
2720
- "loss": 0.453,
2721
- "step": 4000
2722
- },
2723
- {
2724
- "epoch": 6.53,
2725
- "eval_loss": 0.43606194853782654,
2726
- "eval_runtime": 5.568,
2727
- "eval_samples_per_second": 210.308,
2728
- "eval_steps_per_second": 13.29,
2729
- "step": 4000
2730
- },
2731
- {
2732
- "epoch": 6.54,
2733
- "learning_rate": 0.0003765541740674956,
2734
- "loss": 0.4468,
2735
- "step": 4010
2736
- },
2737
- {
2738
- "epoch": 6.56,
2739
- "learning_rate": 0.0003747779751332149,
2740
- "loss": 0.4104,
2741
- "step": 4020
2742
- },
2743
- {
2744
- "epoch": 6.57,
2745
- "learning_rate": 0.00037300177619893426,
2746
- "loss": 0.4593,
2747
- "step": 4030
2748
- },
2749
- {
2750
- "epoch": 6.59,
2751
- "learning_rate": 0.00037122557726465365,
2752
- "loss": 0.4376,
2753
- "step": 4040
2754
- },
2755
- {
2756
- "epoch": 6.61,
2757
- "learning_rate": 0.000369449378330373,
2758
- "loss": 0.4585,
2759
- "step": 4050
2760
- },
2761
- {
2762
- "epoch": 6.62,
2763
- "learning_rate": 0.00036767317939609233,
2764
- "loss": 0.4177,
2765
- "step": 4060
2766
- },
2767
- {
2768
- "epoch": 6.64,
2769
- "learning_rate": 0.0003658969804618117,
2770
- "loss": 0.4852,
2771
- "step": 4070
2772
- },
2773
- {
2774
- "epoch": 6.66,
2775
- "learning_rate": 0.00036412078152753106,
2776
- "loss": 0.4167,
2777
- "step": 4080
2778
- },
2779
- {
2780
- "epoch": 6.67,
2781
- "learning_rate": 0.0003623445825932504,
2782
- "loss": 0.4477,
2783
- "step": 4090
2784
- },
2785
- {
2786
- "epoch": 6.69,
2787
- "learning_rate": 0.00036056838365896985,
2788
- "loss": 0.4219,
2789
- "step": 4100
2790
- },
2791
- {
2792
- "epoch": 6.69,
2793
- "eval_loss": 0.4315338730812073,
2794
- "eval_runtime": 5.58,
2795
- "eval_samples_per_second": 209.856,
2796
- "eval_steps_per_second": 13.262,
2797
- "step": 4100
2798
- },
2799
- {
2800
- "epoch": 6.7,
2801
- "learning_rate": 0.0003587921847246892,
2802
- "loss": 0.4345,
2803
- "step": 4110
2804
- },
2805
- {
2806
- "epoch": 6.72,
2807
- "learning_rate": 0.0003570159857904086,
2808
- "loss": 0.4706,
2809
- "step": 4120
2810
- },
2811
- {
2812
- "epoch": 6.74,
2813
- "learning_rate": 0.0003552397868561279,
2814
- "loss": 0.4254,
2815
- "step": 4130
2816
- },
2817
- {
2818
- "epoch": 6.75,
2819
- "learning_rate": 0.00035346358792184726,
2820
- "loss": 0.4316,
2821
- "step": 4140
2822
- },
2823
- {
2824
- "epoch": 6.77,
2825
- "learning_rate": 0.00035168738898756665,
2826
- "loss": 0.4257,
2827
- "step": 4150
2828
- },
2829
- {
2830
- "epoch": 6.79,
2831
- "learning_rate": 0.000349911190053286,
2832
- "loss": 0.4332,
2833
- "step": 4160
2834
- },
2835
- {
2836
- "epoch": 6.8,
2837
- "learning_rate": 0.00034813499111900533,
2838
- "loss": 0.4652,
2839
- "step": 4170
2840
- },
2841
- {
2842
- "epoch": 6.82,
2843
- "learning_rate": 0.0003463587921847247,
2844
- "loss": 0.4515,
2845
- "step": 4180
2846
- },
2847
- {
2848
- "epoch": 6.84,
2849
- "learning_rate": 0.00034458259325044407,
2850
- "loss": 0.4579,
2851
- "step": 4190
2852
- },
2853
- {
2854
- "epoch": 6.85,
2855
- "learning_rate": 0.0003428063943161634,
2856
- "loss": 0.4269,
2857
- "step": 4200
2858
- },
2859
- {
2860
- "epoch": 6.85,
2861
- "eval_loss": 0.4239509403705597,
2862
- "eval_runtime": 5.576,
2863
- "eval_samples_per_second": 210.006,
2864
- "eval_steps_per_second": 13.271,
2865
- "step": 4200
2866
- },
2867
- {
2868
- "epoch": 6.87,
2869
- "learning_rate": 0.0003410301953818828,
2870
- "loss": 0.448,
2871
- "step": 4210
2872
- },
2873
- {
2874
- "epoch": 6.88,
2875
- "learning_rate": 0.00033925399644760214,
2876
- "loss": 0.4353,
2877
- "step": 4220
2878
- },
2879
- {
2880
- "epoch": 6.9,
2881
- "learning_rate": 0.0003374777975133215,
2882
- "loss": 0.439,
2883
- "step": 4230
2884
- },
2885
- {
2886
- "epoch": 6.92,
2887
- "learning_rate": 0.00033570159857904087,
2888
- "loss": 0.4548,
2889
- "step": 4240
2890
- },
2891
- {
2892
- "epoch": 6.93,
2893
- "learning_rate": 0.0003339253996447602,
2894
- "loss": 0.4305,
2895
- "step": 4250
2896
- },
2897
- {
2898
- "epoch": 6.95,
2899
- "learning_rate": 0.0003321492007104796,
2900
- "loss": 0.4385,
2901
- "step": 4260
2902
- },
2903
- {
2904
- "epoch": 6.97,
2905
- "learning_rate": 0.00033037300177619894,
2906
- "loss": 0.4737,
2907
- "step": 4270
2908
- },
2909
- {
2910
- "epoch": 6.98,
2911
- "learning_rate": 0.0003285968028419183,
2912
- "loss": 0.4615,
2913
- "step": 4280
2914
- },
2915
- {
2916
- "epoch": 7.0,
2917
- "learning_rate": 0.0003268206039076377,
2918
- "loss": 0.4474,
2919
- "step": 4290
2920
- },
2921
- {
2922
- "epoch": 7.01,
2923
- "learning_rate": 0.000325044404973357,
2924
- "loss": 0.4098,
2925
- "step": 4300
2926
- },
2927
- {
2928
- "epoch": 7.01,
2929
- "eval_loss": 0.4368573725223541,
2930
- "eval_runtime": 5.581,
2931
- "eval_samples_per_second": 209.818,
2932
- "eval_steps_per_second": 13.259,
2933
- "step": 4300
2934
- },
2935
- {
2936
- "epoch": 7.03,
2937
- "learning_rate": 0.00032326820603907635,
2938
- "loss": 0.4292,
2939
- "step": 4310
2940
- },
2941
- {
2942
- "epoch": 7.05,
2943
- "learning_rate": 0.00032149200710479575,
2944
- "loss": 0.4143,
2945
- "step": 4320
2946
- },
2947
- {
2948
- "epoch": 7.06,
2949
- "learning_rate": 0.0003197158081705151,
2950
- "loss": 0.439,
2951
- "step": 4330
2952
- },
2953
- {
2954
- "epoch": 7.08,
2955
- "learning_rate": 0.0003179396092362344,
2956
- "loss": 0.4534,
2957
- "step": 4340
2958
- },
2959
- {
2960
- "epoch": 7.1,
2961
- "learning_rate": 0.0003161634103019538,
2962
- "loss": 0.4361,
2963
- "step": 4350
2964
- },
2965
- {
2966
- "epoch": 7.11,
2967
- "learning_rate": 0.00031438721136767316,
2968
- "loss": 0.4249,
2969
- "step": 4360
2970
- },
2971
- {
2972
- "epoch": 7.13,
2973
- "learning_rate": 0.00031261101243339255,
2974
- "loss": 0.455,
2975
- "step": 4370
2976
- },
2977
- {
2978
- "epoch": 7.15,
2979
- "learning_rate": 0.0003108348134991119,
2980
- "loss": 0.4154,
2981
- "step": 4380
2982
- },
2983
- {
2984
- "epoch": 7.16,
2985
- "learning_rate": 0.00030905861456483123,
2986
- "loss": 0.4328,
2987
- "step": 4390
2988
- },
2989
- {
2990
- "epoch": 7.18,
2991
- "learning_rate": 0.0003072824156305506,
2992
- "loss": 0.4153,
2993
- "step": 4400
2994
- },
2995
- {
2996
- "epoch": 7.18,
2997
- "eval_loss": 0.43138188123703003,
2998
- "eval_runtime": 5.569,
2999
- "eval_samples_per_second": 210.27,
3000
- "eval_steps_per_second": 13.288,
3001
- "step": 4400
3002
- },
3003
- {
3004
- "epoch": 7.19,
3005
- "learning_rate": 0.00030550621669627,
3006
- "loss": 0.4308,
3007
- "step": 4410
3008
- },
3009
- {
3010
- "epoch": 7.21,
3011
- "learning_rate": 0.00030373001776198936,
3012
- "loss": 0.4271,
3013
- "step": 4420
3014
- },
3015
- {
3016
- "epoch": 7.23,
3017
- "learning_rate": 0.00030195381882770875,
3018
- "loss": 0.4237,
3019
- "step": 4430
3020
- },
3021
- {
3022
- "epoch": 7.24,
3023
- "learning_rate": 0.0003001776198934281,
3024
- "loss": 0.4563,
3025
- "step": 4440
3026
- },
3027
- {
3028
- "epoch": 7.26,
3029
- "learning_rate": 0.00029840142095914743,
3030
- "loss": 0.4374,
3031
- "step": 4450
3032
- },
3033
- {
3034
- "epoch": 7.28,
3035
- "learning_rate": 0.0002966252220248668,
3036
- "loss": 0.4167,
3037
- "step": 4460
3038
- },
3039
- {
3040
- "epoch": 7.29,
3041
- "learning_rate": 0.00029484902309058616,
3042
- "loss": 0.4293,
3043
- "step": 4470
3044
- },
3045
- {
3046
- "epoch": 7.31,
3047
- "learning_rate": 0.00029307282415630555,
3048
- "loss": 0.4065,
3049
- "step": 4480
3050
- },
3051
- {
3052
- "epoch": 7.32,
3053
- "learning_rate": 0.0002912966252220249,
3054
- "loss": 0.434,
3055
- "step": 4490
3056
- },
3057
- {
3058
- "epoch": 7.34,
3059
- "learning_rate": 0.00028952042628774423,
3060
- "loss": 0.4177,
3061
- "step": 4500
3062
- },
3063
- {
3064
- "epoch": 7.34,
3065
- "eval_loss": 0.43316009640693665,
3066
- "eval_runtime": 5.57,
3067
- "eval_samples_per_second": 210.233,
3068
- "eval_steps_per_second": 13.285,
3069
- "step": 4500
3070
- },
3071
- {
3072
- "epoch": 7.36,
3073
- "learning_rate": 0.0002877442273534636,
3074
- "loss": 0.4295,
3075
- "step": 4510
3076
- },
3077
- {
3078
- "epoch": 7.37,
3079
- "learning_rate": 0.00028596802841918297,
3080
- "loss": 0.4235,
3081
- "step": 4520
3082
- },
3083
- {
3084
- "epoch": 7.39,
3085
- "learning_rate": 0.0002841918294849023,
3086
- "loss": 0.4207,
3087
- "step": 4530
3088
- },
3089
- {
3090
- "epoch": 7.41,
3091
- "learning_rate": 0.0002824156305506217,
3092
- "loss": 0.4188,
3093
- "step": 4540
3094
- },
3095
- {
3096
- "epoch": 7.42,
3097
- "learning_rate": 0.00028063943161634104,
3098
- "loss": 0.4304,
3099
- "step": 4550
3100
- },
3101
- {
3102
- "epoch": 7.44,
3103
- "learning_rate": 0.0002788632326820604,
3104
- "loss": 0.4522,
3105
- "step": 4560
3106
- },
3107
- {
3108
- "epoch": 7.46,
3109
- "learning_rate": 0.00027708703374777977,
3110
- "loss": 0.4179,
3111
- "step": 4570
3112
- },
3113
- {
3114
- "epoch": 7.47,
3115
- "learning_rate": 0.0002753108348134991,
3116
- "loss": 0.4339,
3117
- "step": 4580
3118
- },
3119
- {
3120
- "epoch": 7.49,
3121
- "learning_rate": 0.00027353463587921845,
3122
- "loss": 0.4207,
3123
- "step": 4590
3124
- },
3125
- {
3126
- "epoch": 7.5,
3127
- "learning_rate": 0.00027175843694493784,
3128
- "loss": 0.4131,
3129
- "step": 4600
3130
- },
3131
- {
3132
- "epoch": 7.5,
3133
- "eval_loss": 0.4320705831050873,
3134
- "eval_runtime": 5.5625,
3135
- "eval_samples_per_second": 210.516,
3136
- "eval_steps_per_second": 13.303,
3137
- "step": 4600
3138
- },
3139
- {
3140
- "epoch": 7.52,
3141
- "learning_rate": 0.0002699822380106572,
3142
- "loss": 0.42,
3143
- "step": 4610
3144
- },
3145
- {
3146
- "epoch": 7.54,
3147
- "learning_rate": 0.0002682060390763766,
3148
- "loss": 0.4311,
3149
- "step": 4620
3150
- },
3151
- {
3152
- "epoch": 7.55,
3153
- "learning_rate": 0.0002664298401420959,
3154
- "loss": 0.413,
3155
- "step": 4630
3156
- },
3157
- {
3158
- "epoch": 7.57,
3159
- "learning_rate": 0.00026465364120781525,
3160
- "loss": 0.4403,
3161
- "step": 4640
3162
- },
3163
- {
3164
- "epoch": 7.59,
3165
- "learning_rate": 0.00026287744227353465,
3166
- "loss": 0.4359,
3167
- "step": 4650
3168
- },
3169
- {
3170
- "epoch": 7.6,
3171
- "learning_rate": 0.000261101243339254,
3172
- "loss": 0.4614,
3173
- "step": 4660
3174
- },
3175
- {
3176
- "epoch": 7.62,
3177
- "learning_rate": 0.0002593250444049733,
3178
- "loss": 0.4462,
3179
- "step": 4670
3180
- },
3181
- {
3182
- "epoch": 7.63,
3183
- "learning_rate": 0.0002575488454706927,
3184
- "loss": 0.4333,
3185
- "step": 4680
3186
- },
3187
- {
3188
- "epoch": 7.65,
3189
- "learning_rate": 0.00025577264653641206,
3190
- "loss": 0.4529,
3191
- "step": 4690
3192
- },
3193
- {
3194
- "epoch": 7.67,
3195
- "learning_rate": 0.0002539964476021314,
3196
- "loss": 0.4107,
3197
- "step": 4700
3198
- },
3199
- {
3200
- "epoch": 7.67,
3201
- "eval_loss": 0.43221670389175415,
3202
- "eval_runtime": 5.5625,
3203
- "eval_samples_per_second": 210.516,
3204
- "eval_steps_per_second": 13.303,
3205
- "step": 4700
3206
- },
3207
- {
3208
- "epoch": 7.68,
3209
- "learning_rate": 0.0002522202486678508,
3210
- "loss": 0.4248,
3211
- "step": 4710
3212
- },
3213
- {
3214
- "epoch": 7.7,
3215
- "learning_rate": 0.00025044404973357013,
3216
- "loss": 0.4278,
3217
- "step": 4720
3218
- },
3219
- {
3220
- "epoch": 7.72,
3221
- "learning_rate": 0.0002486678507992895,
3222
- "loss": 0.4369,
3223
- "step": 4730
3224
- },
3225
- {
3226
- "epoch": 7.73,
3227
- "learning_rate": 0.0002468916518650089,
3228
- "loss": 0.4287,
3229
- "step": 4740
3230
- },
3231
- {
3232
- "epoch": 7.75,
3233
- "learning_rate": 0.00024511545293072826,
3234
- "loss": 0.4238,
3235
- "step": 4750
3236
- },
3237
- {
3238
- "epoch": 7.77,
3239
- "learning_rate": 0.0002433392539964476,
3240
- "loss": 0.4434,
3241
- "step": 4760
3242
- },
3243
- {
3244
- "epoch": 7.78,
3245
- "learning_rate": 0.00024156305506216696,
3246
- "loss": 0.428,
3247
- "step": 4770
3248
- },
3249
- {
3250
- "epoch": 7.8,
3251
- "learning_rate": 0.00023978685612788633,
3252
- "loss": 0.4409,
3253
- "step": 4780
3254
- },
3255
- {
3256
- "epoch": 7.81,
3257
- "learning_rate": 0.00023801065719360567,
3258
- "loss": 0.4214,
3259
- "step": 4790
3260
- },
3261
- {
3262
- "epoch": 7.83,
3263
- "learning_rate": 0.00023623445825932503,
3264
- "loss": 0.4228,
3265
- "step": 4800
3266
- },
3267
- {
3268
- "epoch": 7.83,
3269
- "eval_loss": 0.434493750333786,
3270
- "eval_runtime": 5.5781,
3271
- "eval_samples_per_second": 209.927,
3272
- "eval_steps_per_second": 13.266,
3273
- "step": 4800
3274
- },
3275
- {
3276
- "epoch": 7.85,
3277
- "learning_rate": 0.00023445825932504443,
3278
- "loss": 0.442,
3279
- "step": 4810
3280
- },
3281
- {
3282
- "epoch": 7.86,
3283
- "learning_rate": 0.0002326820603907638,
3284
- "loss": 0.4259,
3285
- "step": 4820
3286
- },
3287
- {
3288
- "epoch": 7.88,
3289
- "learning_rate": 0.00023090586145648313,
3290
- "loss": 0.4453,
3291
- "step": 4830
3292
- },
3293
- {
3294
- "epoch": 7.9,
3295
- "learning_rate": 0.0002291296625222025,
3296
- "loss": 0.4205,
3297
- "step": 4840
3298
- },
3299
- {
3300
- "epoch": 7.91,
3301
- "learning_rate": 0.00022735346358792187,
3302
- "loss": 0.445,
3303
- "step": 4850
3304
- },
3305
- {
3306
- "epoch": 7.93,
3307
- "learning_rate": 0.0002255772646536412,
3308
- "loss": 0.4142,
3309
- "step": 4860
3310
- },
3311
- {
3312
- "epoch": 7.94,
3313
- "learning_rate": 0.00022380106571936057,
3314
- "loss": 0.4717,
3315
- "step": 4870
3316
- },
3317
- {
3318
- "epoch": 7.96,
3319
- "learning_rate": 0.00022202486678507994,
3320
- "loss": 0.4014,
3321
- "step": 4880
3322
- },
3323
- {
3324
- "epoch": 7.98,
3325
- "learning_rate": 0.0002202486678507993,
3326
- "loss": 0.3956,
3327
- "step": 4890
3328
- },
3329
- {
3330
- "epoch": 7.99,
3331
- "learning_rate": 0.00021847246891651864,
3332
- "loss": 0.4197,
3333
- "step": 4900
3334
- },
3335
- {
3336
- "epoch": 7.99,
3337
- "eval_loss": 0.42817065119743347,
3338
- "eval_runtime": 5.657,
3339
- "eval_samples_per_second": 206.999,
3340
- "eval_steps_per_second": 13.081,
3341
- "step": 4900
3342
- },
3343
- {
3344
- "epoch": 8.01,
3345
- "learning_rate": 0.000216696269982238,
3346
- "loss": 0.3854,
3347
- "step": 4910
3348
- },
3349
- {
3350
- "epoch": 8.03,
3351
- "learning_rate": 0.00021492007104795738,
3352
- "loss": 0.4118,
3353
- "step": 4920
3354
- },
3355
- {
3356
- "epoch": 8.04,
3357
- "learning_rate": 0.00021314387211367671,
3358
- "loss": 0.4117,
3359
- "step": 4930
3360
- },
3361
- {
3362
- "epoch": 8.06,
3363
- "learning_rate": 0.00021136767317939608,
3364
- "loss": 0.4063,
3365
- "step": 4940
3366
- },
3367
- {
3368
- "epoch": 8.08,
3369
- "learning_rate": 0.00020959147424511545,
3370
- "loss": 0.4431,
3371
- "step": 4950
3372
- },
3373
- {
3374
- "epoch": 8.09,
3375
- "learning_rate": 0.00020781527531083484,
3376
- "loss": 0.4293,
3377
- "step": 4960
3378
- },
3379
- {
3380
- "epoch": 8.11,
3381
- "learning_rate": 0.00020603907637655418,
3382
- "loss": 0.4156,
3383
- "step": 4970
3384
- },
3385
- {
3386
- "epoch": 8.12,
3387
- "learning_rate": 0.00020426287744227355,
3388
- "loss": 0.4639,
3389
- "step": 4980
3390
- },
3391
- {
3392
- "epoch": 8.14,
3393
- "learning_rate": 0.0002024866785079929,
3394
- "loss": 0.4234,
3395
- "step": 4990
3396
- },
3397
- {
3398
- "epoch": 8.16,
3399
- "learning_rate": 0.00020071047957371228,
3400
- "loss": 0.3902,
3401
- "step": 5000
3402
- },
3403
- {
3404
- "epoch": 8.16,
3405
- "eval_loss": 0.42921850085258484,
3406
- "eval_runtime": 5.61,
3407
- "eval_samples_per_second": 208.734,
3408
- "eval_steps_per_second": 13.191,
3409
- "step": 5000
3410
- },
3411
- {
3412
- "epoch": 8.17,
3413
- "learning_rate": 0.00019893428063943162,
3414
- "loss": 0.4156,
3415
- "step": 5010
3416
- },
3417
- {
3418
- "epoch": 8.19,
3419
- "learning_rate": 0.00019715808170515098,
3420
- "loss": 0.4171,
3421
- "step": 5020
3422
- },
3423
- {
3424
- "epoch": 8.21,
3425
- "learning_rate": 0.00019538188277087035,
3426
- "loss": 0.4158,
3427
- "step": 5030
3428
- },
3429
- {
3430
- "epoch": 8.22,
3431
- "learning_rate": 0.0001936056838365897,
3432
- "loss": 0.4195,
3433
- "step": 5040
3434
- },
3435
- {
3436
- "epoch": 8.24,
3437
- "learning_rate": 0.00019182948490230906,
3438
- "loss": 0.4205,
3439
- "step": 5050
3440
- },
3441
- {
3442
- "epoch": 8.25,
3443
- "learning_rate": 0.00019005328596802842,
3444
- "loss": 0.4069,
3445
- "step": 5060
3446
- },
3447
- {
3448
- "epoch": 8.27,
3449
- "learning_rate": 0.0001882770870337478,
3450
- "loss": 0.4316,
3451
- "step": 5070
3452
- },
3453
- {
3454
- "epoch": 8.29,
3455
- "learning_rate": 0.00018650088809946713,
3456
- "loss": 0.4286,
3457
- "step": 5080
3458
- },
3459
- {
3460
- "epoch": 8.3,
3461
- "learning_rate": 0.0001847246891651865,
3462
- "loss": 0.4452,
3463
- "step": 5090
3464
- },
3465
- {
3466
- "epoch": 8.32,
3467
- "learning_rate": 0.00018294849023090586,
3468
- "loss": 0.4166,
3469
- "step": 5100
3470
- },
3471
- {
3472
- "epoch": 8.32,
3473
- "eval_loss": 0.43339869379997253,
3474
- "eval_runtime": 5.806,
3475
- "eval_samples_per_second": 201.687,
3476
- "eval_steps_per_second": 12.745,
3477
- "step": 5100
3478
- },
3479
- {
3480
- "epoch": 8.34,
3481
- "learning_rate": 0.0001811722912966252,
3482
- "loss": 0.448,
3483
- "step": 5110
3484
- },
3485
- {
3486
- "epoch": 8.35,
3487
- "learning_rate": 0.0001793960923623446,
3488
- "loss": 0.4058,
3489
- "step": 5120
3490
- },
3491
- {
3492
- "epoch": 8.37,
3493
- "learning_rate": 0.00017761989342806396,
3494
- "loss": 0.3919,
3495
- "step": 5130
3496
- },
3497
- {
3498
- "epoch": 8.38,
3499
- "learning_rate": 0.00017584369449378333,
3500
- "loss": 0.4525,
3501
- "step": 5140
3502
- },
3503
- {
3504
- "epoch": 8.4,
3505
- "learning_rate": 0.00017406749555950267,
3506
- "loss": 0.4269,
3507
- "step": 5150
3508
- },
3509
- {
3510
- "epoch": 8.42,
3511
- "learning_rate": 0.00017229129662522203,
3512
- "loss": 0.4355,
3513
- "step": 5160
3514
- },
3515
- {
3516
- "epoch": 8.43,
3517
- "learning_rate": 0.0001705150976909414,
3518
- "loss": 0.4458,
3519
- "step": 5170
3520
- },
3521
- {
3522
- "epoch": 8.45,
3523
- "learning_rate": 0.00016873889875666074,
3524
- "loss": 0.3966,
3525
- "step": 5180
3526
- },
3527
- {
3528
- "epoch": 8.47,
3529
- "learning_rate": 0.0001669626998223801,
3530
- "loss": 0.4058,
3531
- "step": 5190
3532
- },
3533
- {
3534
- "epoch": 8.48,
3535
- "learning_rate": 0.00016518650088809947,
3536
- "loss": 0.4199,
3537
- "step": 5200
3538
- },
3539
- {
3540
- "epoch": 8.48,
3541
- "eval_loss": 0.4262863099575043,
3542
- "eval_runtime": 5.725,
3543
- "eval_samples_per_second": 204.541,
3544
- "eval_steps_per_second": 12.926,
3545
- "step": 5200
3546
- },
3547
- {
3548
- "epoch": 8.5,
3549
- "learning_rate": 0.00016341030195381884,
3550
- "loss": 0.4185,
3551
- "step": 5210
3552
- },
3553
- {
3554
- "epoch": 8.52,
3555
- "learning_rate": 0.00016163410301953818,
3556
- "loss": 0.4145,
3557
- "step": 5220
3558
- },
3559
- {
3560
- "epoch": 8.53,
3561
- "learning_rate": 0.00015985790408525754,
3562
- "loss": 0.4178,
3563
- "step": 5230
3564
- },
3565
- {
3566
- "epoch": 8.55,
3567
- "learning_rate": 0.0001580817051509769,
3568
- "loss": 0.4285,
3569
- "step": 5240
3570
- },
3571
- {
3572
- "epoch": 8.56,
3573
- "learning_rate": 0.00015630550621669628,
3574
- "loss": 0.4062,
3575
- "step": 5250
3576
- },
3577
- {
3578
- "epoch": 8.58,
3579
- "learning_rate": 0.00015452930728241561,
3580
- "loss": 0.4308,
3581
- "step": 5260
3582
- },
3583
- {
3584
- "epoch": 8.6,
3585
- "learning_rate": 0.000152753108348135,
3586
- "loss": 0.4488,
3587
- "step": 5270
3588
- },
3589
- {
3590
- "epoch": 8.61,
3591
- "learning_rate": 0.00015097690941385437,
3592
- "loss": 0.4087,
3593
- "step": 5280
3594
- },
3595
- {
3596
- "epoch": 8.63,
3597
- "learning_rate": 0.00014920071047957371,
3598
- "loss": 0.421,
3599
- "step": 5290
3600
- },
3601
- {
3602
- "epoch": 8.65,
3603
- "learning_rate": 0.00014742451154529308,
3604
- "loss": 0.4415,
3605
- "step": 5300
3606
- },
3607
- {
3608
- "epoch": 8.65,
3609
- "eval_loss": 0.4254954755306244,
3610
- "eval_runtime": 5.593,
3611
- "eval_samples_per_second": 209.368,
3612
- "eval_steps_per_second": 13.231,
3613
- "step": 5300
3614
- },
3615
- {
3616
- "epoch": 8.66,
3617
- "learning_rate": 0.00014564831261101245,
3618
- "loss": 0.4358,
3619
- "step": 5310
3620
- },
3621
- {
3622
- "epoch": 8.68,
3623
- "learning_rate": 0.0001438721136767318,
3624
- "loss": 0.404,
3625
- "step": 5320
3626
- },
3627
- {
3628
- "epoch": 8.69,
3629
- "learning_rate": 0.00014209591474245115,
3630
- "loss": 0.4172,
3631
- "step": 5330
3632
- },
3633
- {
3634
- "epoch": 8.71,
3635
- "learning_rate": 0.00014031971580817052,
3636
- "loss": 0.3918,
3637
- "step": 5340
3638
- },
3639
- {
3640
- "epoch": 8.73,
3641
- "learning_rate": 0.00013854351687388988,
3642
- "loss": 0.4313,
3643
- "step": 5350
3644
- },
3645
- {
3646
- "epoch": 8.74,
3647
- "learning_rate": 0.00013676731793960922,
3648
- "loss": 0.4326,
3649
- "step": 5360
3650
- },
3651
- {
3652
- "epoch": 8.76,
3653
- "learning_rate": 0.0001349911190053286,
3654
- "loss": 0.4369,
3655
- "step": 5370
3656
- },
3657
- {
3658
- "epoch": 8.78,
3659
- "learning_rate": 0.00013321492007104796,
3660
- "loss": 0.4099,
3661
- "step": 5380
3662
- },
3663
- {
3664
- "epoch": 8.79,
3665
- "learning_rate": 0.00013143872113676732,
3666
- "loss": 0.4165,
3667
- "step": 5390
3668
- },
3669
- {
3670
- "epoch": 8.81,
3671
- "learning_rate": 0.00012966252220248666,
3672
- "loss": 0.4014,
3673
- "step": 5400
3674
- },
3675
- {
3676
- "epoch": 8.81,
3677
- "eval_loss": 0.4244977533817291,
3678
- "eval_runtime": 5.568,
3679
- "eval_samples_per_second": 210.308,
3680
- "eval_steps_per_second": 13.29,
3681
- "step": 5400
3682
- },
3683
- {
3684
- "epoch": 8.83,
3685
- "learning_rate": 0.00012788632326820603,
3686
- "loss": 0.4111,
3687
- "step": 5410
3688
- },
3689
- {
3690
- "epoch": 8.84,
3691
- "learning_rate": 0.0001261101243339254,
3692
- "loss": 0.4323,
3693
- "step": 5420
3694
- },
3695
- {
3696
- "epoch": 8.86,
3697
- "learning_rate": 0.00012433392539964476,
3698
- "loss": 0.4342,
3699
- "step": 5430
3700
- },
3701
- {
3702
- "epoch": 8.87,
3703
- "learning_rate": 0.00012255772646536413,
3704
- "loss": 0.4111,
3705
- "step": 5440
3706
- },
3707
- {
3708
- "epoch": 8.89,
3709
- "learning_rate": 0.00012078152753108348,
3710
- "loss": 0.4464,
3711
- "step": 5450
3712
- },
3713
- {
3714
- "epoch": 8.91,
3715
- "learning_rate": 0.00011900532859680283,
3716
- "loss": 0.4244,
3717
- "step": 5460
3718
- },
3719
- {
3720
- "epoch": 8.92,
3721
- "learning_rate": 0.00011722912966252221,
3722
- "loss": 0.4503,
3723
- "step": 5470
3724
- },
3725
- {
3726
- "epoch": 8.94,
3727
- "learning_rate": 0.00011545293072824157,
3728
- "loss": 0.4276,
3729
- "step": 5480
3730
- },
3731
- {
3732
- "epoch": 8.96,
3733
- "learning_rate": 0.00011367673179396093,
3734
- "loss": 0.4119,
3735
- "step": 5490
3736
- },
3737
- {
3738
- "epoch": 8.97,
3739
- "learning_rate": 0.00011190053285968029,
3740
- "loss": 0.4158,
3741
- "step": 5500
3742
- },
3743
- {
3744
- "epoch": 8.97,
3745
- "eval_loss": 0.4268430769443512,
3746
- "eval_runtime": 5.711,
3747
- "eval_samples_per_second": 205.042,
3748
- "eval_steps_per_second": 12.957,
3749
- "step": 5500
3750
- },
3751
- {
3752
- "epoch": 8.99,
3753
- "learning_rate": 0.00011012433392539965,
3754
- "loss": 0.4506,
3755
- "step": 5510
3756
- },
3757
- {
3758
- "epoch": 9.0,
3759
- "learning_rate": 0.000108348134991119,
3760
- "loss": 0.3832,
3761
- "step": 5520
3762
- },
3763
- {
3764
- "epoch": 9.02,
3765
- "learning_rate": 0.00010657193605683836,
3766
- "loss": 0.4018,
3767
- "step": 5530
3768
- },
3769
- {
3770
- "epoch": 9.04,
3771
- "learning_rate": 0.00010479573712255772,
3772
- "loss": 0.413,
3773
- "step": 5540
3774
- },
3775
- {
3776
- "epoch": 9.05,
3777
- "learning_rate": 0.00010301953818827709,
3778
- "loss": 0.4317,
3779
- "step": 5550
3780
- },
3781
- {
3782
- "epoch": 9.07,
3783
- "learning_rate": 0.00010124333925399646,
3784
- "loss": 0.4117,
3785
- "step": 5560
3786
- },
3787
- {
3788
- "epoch": 9.09,
3789
- "learning_rate": 9.946714031971581e-05,
3790
- "loss": 0.4068,
3791
- "step": 5570
3792
- },
3793
- {
3794
- "epoch": 9.1,
3795
- "learning_rate": 9.769094138543518e-05,
3796
- "loss": 0.3956,
3797
- "step": 5580
3798
- },
3799
- {
3800
- "epoch": 9.12,
3801
- "learning_rate": 9.591474245115453e-05,
3802
- "loss": 0.3948,
3803
- "step": 5590
3804
- },
3805
- {
3806
- "epoch": 9.14,
3807
- "learning_rate": 9.41385435168739e-05,
3808
- "loss": 0.3842,
3809
- "step": 5600
3810
- },
3811
- {
3812
- "epoch": 9.14,
3813
- "eval_loss": 0.42232006788253784,
3814
- "eval_runtime": 5.687,
3815
- "eval_samples_per_second": 205.907,
3816
- "eval_steps_per_second": 13.012,
3817
- "step": 5600
3818
- },
3819
- {
3820
- "epoch": 9.15,
3821
- "learning_rate": 9.236234458259325e-05,
3822
- "loss": 0.4099,
3823
- "step": 5610
3824
- },
3825
- {
3826
- "epoch": 9.17,
3827
- "learning_rate": 9.05861456483126e-05,
3828
- "loss": 0.413,
3829
- "step": 5620
3830
- },
3831
- {
3832
- "epoch": 9.18,
3833
- "learning_rate": 8.880994671403198e-05,
3834
- "loss": 0.4015,
3835
- "step": 5630
3836
- },
3837
- {
3838
- "epoch": 9.2,
3839
- "learning_rate": 8.703374777975133e-05,
3840
- "loss": 0.4304,
3841
- "step": 5640
3842
- },
3843
- {
3844
- "epoch": 9.22,
3845
- "learning_rate": 8.52575488454707e-05,
3846
- "loss": 0.4239,
3847
- "step": 5650
3848
- },
3849
- {
3850
- "epoch": 9.23,
3851
- "learning_rate": 8.348134991119005e-05,
3852
- "loss": 0.3997,
3853
- "step": 5660
3854
- },
3855
- {
3856
- "epoch": 9.25,
3857
- "learning_rate": 8.170515097690942e-05,
3858
- "loss": 0.4935,
3859
- "step": 5670
3860
- },
3861
- {
3862
- "epoch": 9.27,
3863
- "learning_rate": 7.992895204262877e-05,
3864
- "loss": 0.4035,
3865
- "step": 5680
3866
- },
3867
- {
3868
- "epoch": 9.28,
3869
- "learning_rate": 7.815275310834814e-05,
3870
- "loss": 0.435,
3871
- "step": 5690
3872
- },
3873
- {
3874
- "epoch": 9.3,
3875
- "learning_rate": 7.63765541740675e-05,
3876
- "loss": 0.4267,
3877
- "step": 5700
3878
- },
3879
- {
3880
- "epoch": 9.3,
3881
- "eval_loss": 0.4202769696712494,
3882
- "eval_runtime": 5.77,
3883
- "eval_samples_per_second": 202.946,
3884
- "eval_steps_per_second": 12.825,
3885
- "step": 5700
3886
- },
3887
- {
3888
- "epoch": 9.31,
3889
- "learning_rate": 7.460035523978686e-05,
3890
- "loss": 0.4154,
3891
- "step": 5710
3892
- },
3893
- {
3894
- "epoch": 9.33,
3895
- "learning_rate": 7.282415630550622e-05,
3896
- "loss": 0.4107,
3897
- "step": 5720
3898
- },
3899
- {
3900
- "epoch": 9.35,
3901
- "learning_rate": 7.104795737122558e-05,
3902
- "loss": 0.424,
3903
- "step": 5730
3904
- },
3905
- {
3906
- "epoch": 9.36,
3907
- "learning_rate": 6.927175843694494e-05,
3908
- "loss": 0.3896,
3909
- "step": 5740
3910
- },
3911
- {
3912
- "epoch": 9.38,
3913
- "learning_rate": 6.74955595026643e-05,
3914
- "loss": 0.4094,
3915
- "step": 5750
3916
- },
3917
- {
3918
- "epoch": 9.4,
3919
- "learning_rate": 6.571936056838366e-05,
3920
- "loss": 0.4134,
3921
- "step": 5760
3922
- },
3923
- {
3924
- "epoch": 9.41,
3925
- "learning_rate": 6.394316163410301e-05,
3926
- "loss": 0.4232,
3927
- "step": 5770
3928
- },
3929
- {
3930
- "epoch": 9.43,
3931
- "learning_rate": 6.216696269982238e-05,
3932
- "loss": 0.4005,
3933
- "step": 5780
3934
- },
3935
- {
3936
- "epoch": 9.45,
3937
- "learning_rate": 6.039076376554174e-05,
3938
- "loss": 0.4138,
3939
- "step": 5790
3940
- },
3941
- {
3942
- "epoch": 9.46,
3943
- "learning_rate": 5.861456483126111e-05,
3944
- "loss": 0.423,
3945
- "step": 5800
3946
- },
3947
- {
3948
- "epoch": 9.46,
3949
- "eval_loss": 0.4260061979293823,
3950
- "eval_runtime": 6.015,
3951
- "eval_samples_per_second": 194.679,
3952
- "eval_steps_per_second": 12.303,
3953
- "step": 5800
3954
- },
3955
- {
3956
- "epoch": 9.48,
3957
- "learning_rate": 5.6838365896980466e-05,
3958
- "loss": 0.3959,
3959
- "step": 5810
3960
- },
3961
- {
3962
- "epoch": 9.49,
3963
- "learning_rate": 5.5062166962699826e-05,
3964
- "loss": 0.4072,
3965
- "step": 5820
3966
- },
3967
- {
3968
- "epoch": 9.51,
3969
- "learning_rate": 5.328596802841918e-05,
3970
- "loss": 0.429,
3971
- "step": 5830
3972
- },
3973
- {
3974
- "epoch": 9.53,
3975
- "learning_rate": 5.1509769094138545e-05,
3976
- "loss": 0.4161,
3977
- "step": 5840
3978
- },
3979
- {
3980
- "epoch": 9.54,
3981
- "learning_rate": 4.9733570159857905e-05,
3982
- "loss": 0.4319,
3983
- "step": 5850
3984
- },
3985
- {
3986
- "epoch": 9.56,
3987
- "learning_rate": 4.7957371225577264e-05,
3988
- "loss": 0.4195,
3989
- "step": 5860
3990
- },
3991
- {
3992
- "epoch": 9.58,
3993
- "learning_rate": 4.6181172291296624e-05,
3994
- "loss": 0.409,
3995
- "step": 5870
3996
- },
3997
- {
3998
- "epoch": 9.59,
3999
- "learning_rate": 4.440497335701599e-05,
4000
- "loss": 0.4327,
4001
- "step": 5880
4002
- },
4003
- {
4004
- "epoch": 9.61,
4005
- "learning_rate": 4.262877442273535e-05,
4006
- "loss": 0.4337,
4007
- "step": 5890
4008
- },
4009
- {
4010
- "epoch": 9.62,
4011
- "learning_rate": 4.085257548845471e-05,
4012
- "loss": 0.4506,
4013
- "step": 5900
4014
- },
4015
- {
4016
- "epoch": 9.62,
4017
- "eval_loss": 0.4249822795391083,
4018
- "eval_runtime": 5.633,
4019
- "eval_samples_per_second": 207.881,
4020
- "eval_steps_per_second": 13.137,
4021
- "step": 5900
4022
- },
4023
- {
4024
- "epoch": 9.64,
4025
- "learning_rate": 3.907637655417407e-05,
4026
- "loss": 0.4067,
4027
- "step": 5910
4028
- },
4029
- {
4030
- "epoch": 9.66,
4031
- "learning_rate": 3.730017761989343e-05,
4032
- "loss": 0.3946,
4033
- "step": 5920
4034
- },
4035
- {
4036
- "epoch": 9.67,
4037
- "learning_rate": 3.552397868561279e-05,
4038
- "loss": 0.4301,
4039
- "step": 5930
4040
- },
4041
- {
4042
- "epoch": 9.69,
4043
- "learning_rate": 3.374777975133215e-05,
4044
- "loss": 0.4204,
4045
- "step": 5940
4046
- },
4047
- {
4048
- "epoch": 9.71,
4049
- "learning_rate": 3.197158081705151e-05,
4050
- "loss": 0.4298,
4051
- "step": 5950
4052
- },
4053
- {
4054
- "epoch": 9.72,
4055
- "learning_rate": 3.019538188277087e-05,
4056
- "loss": 0.4056,
4057
- "step": 5960
4058
- },
4059
- {
4060
- "epoch": 9.74,
4061
- "learning_rate": 2.8419182948490233e-05,
4062
- "loss": 0.4377,
4063
- "step": 5970
4064
- },
4065
- {
4066
- "epoch": 9.76,
4067
- "learning_rate": 2.664298401420959e-05,
4068
- "loss": 0.417,
4069
- "step": 5980
4070
- },
4071
- {
4072
- "epoch": 9.77,
4073
- "learning_rate": 2.4866785079928952e-05,
4074
- "loss": 0.4153,
4075
- "step": 5990
4076
- },
4077
- {
4078
- "epoch": 9.79,
4079
- "learning_rate": 2.3090586145648312e-05,
4080
- "loss": 0.4087,
4081
- "step": 6000
4082
- },
4083
- {
4084
- "epoch": 9.79,
4085
- "eval_loss": 0.4207456707954407,
4086
- "eval_runtime": 5.625,
4087
- "eval_samples_per_second": 208.177,
4088
- "eval_steps_per_second": 13.156,
4089
- "step": 6000
4090
  }
4091
  ],
4092
  "logging_steps": 10,
4093
  "max_steps": 6130,
4094
  "num_train_epochs": 10,
4095
  "save_steps": 500,
4096
- "total_flos": 1.3060363769413632e+16,
4097
  "trial_name": null,
4098
  "trial_params": null
4099
  }
 
1
  {
2
+ "best_metric": 0.5469470024108887,
3
+ "best_model_checkpoint": "bart_lora_outputs\\checkpoint-500",
4
+ "epoch": 0.8156606851549756,
5
  "eval_steps": 100,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.02,
13
  "learning_rate": 2e-05,
14
+ "loss": 2.9281,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.03,
19
  "learning_rate": 4e-05,
20
+ "loss": 2.8201,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.05,
25
  "learning_rate": 6e-05,
26
+ "loss": 2.579,
27
  "step": 30
28
  },
29
  {
30
  "epoch": 0.07,
31
  "learning_rate": 8e-05,
32
+ "loss": 2.4427,
33
  "step": 40
34
  },
35
  {
36
  "epoch": 0.08,
37
  "learning_rate": 0.0001,
38
+ "loss": 2.1681,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.1,
43
  "learning_rate": 0.00012,
44
+ "loss": 1.7104,
45
  "step": 60
46
  },
47
  {
48
  "epoch": 0.11,
49
  "learning_rate": 0.00014000000000000001,
50
+ "loss": 1.4177,
51
  "step": 70
52
  },
53
  {
54
  "epoch": 0.13,
55
  "learning_rate": 0.00016,
56
+ "loss": 1.2515,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.15,
61
  "learning_rate": 0.00017999999999999998,
62
+ "loss": 1.1238,
63
  "step": 90
64
  },
65
  {
66
  "epoch": 0.16,
67
  "learning_rate": 0.0002,
68
+ "loss": 1.0489,
69
  "step": 100
70
  },
71
  {
72
  "epoch": 0.16,
73
+ "eval_loss": 0.8469381928443909,
74
+ "eval_runtime": 5.9601,
75
+ "eval_samples_per_second": 196.474,
76
+ "eval_steps_per_second": 24.664,
77
  "step": 100
78
  },
79
  {
80
  "epoch": 0.18,
81
  "learning_rate": 0.00022,
82
+ "loss": 1.0079,
83
  "step": 110
84
  },
85
  {
86
  "epoch": 0.2,
87
  "learning_rate": 0.00024,
88
+ "loss": 1.0077,
89
  "step": 120
90
  },
91
  {
92
  "epoch": 0.21,
93
  "learning_rate": 0.00026000000000000003,
94
+ "loss": 0.9416,
95
  "step": 130
96
  },
97
  {
98
  "epoch": 0.23,
99
  "learning_rate": 0.00028000000000000003,
100
+ "loss": 0.8882,
101
  "step": 140
102
  },
103
  {
104
  "epoch": 0.24,
105
  "learning_rate": 0.0003,
106
+ "loss": 0.8595,
107
  "step": 150
108
  },
109
  {
110
  "epoch": 0.26,
111
  "learning_rate": 0.00032,
112
+ "loss": 0.8853,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.28,
117
  "learning_rate": 0.00034,
118
+ "loss": 0.7678,
119
  "step": 170
120
  },
121
  {
122
  "epoch": 0.29,
123
  "learning_rate": 0.00035999999999999997,
124
+ "loss": 0.8595,
125
  "step": 180
126
  },
127
  {
128
  "epoch": 0.31,
129
  "learning_rate": 0.00038,
130
+ "loss": 0.8514,
131
  "step": 190
132
  },
133
  {
134
  "epoch": 0.33,
135
  "learning_rate": 0.0004,
136
+ "loss": 0.8128,
137
  "step": 200
138
  },
139
  {
140
  "epoch": 0.33,
141
+ "eval_loss": 0.6850531697273254,
142
+ "eval_runtime": 6.0279,
143
+ "eval_samples_per_second": 194.264,
144
+ "eval_steps_per_second": 24.387,
145
  "step": 200
146
  },
147
  {
148
  "epoch": 0.34,
149
  "learning_rate": 0.00042,
150
+ "loss": 0.7782,
151
  "step": 210
152
  },
153
  {
154
  "epoch": 0.36,
155
  "learning_rate": 0.00044,
156
+ "loss": 0.8064,
157
  "step": 220
158
  },
159
  {
160
  "epoch": 0.38,
161
  "learning_rate": 0.00046,
162
+ "loss": 0.7627,
163
  "step": 230
164
  },
165
  {
166
  "epoch": 0.39,
167
  "learning_rate": 0.00048,
168
+ "loss": 0.7447,
169
  "step": 240
170
  },
171
  {
172
  "epoch": 0.41,
173
  "learning_rate": 0.0005,
174
+ "loss": 0.7652,
175
  "step": 250
176
  },
177
  {
178
  "epoch": 0.42,
179
  "learning_rate": 0.0005200000000000001,
180
+ "loss": 0.7568,
181
  "step": 260
182
  },
183
  {
184
  "epoch": 0.44,
185
  "learning_rate": 0.00054,
186
+ "loss": 0.7291,
187
  "step": 270
188
  },
189
  {
190
  "epoch": 0.46,
191
  "learning_rate": 0.0005600000000000001,
192
+ "loss": 0.7118,
193
  "step": 280
194
  },
195
  {
196
  "epoch": 0.47,
197
  "learning_rate": 0.00058,
198
+ "loss": 0.7462,
199
  "step": 290
200
  },
201
  {
202
  "epoch": 0.49,
203
  "learning_rate": 0.0006,
204
+ "loss": 0.6866,
205
  "step": 300
206
  },
207
  {
208
  "epoch": 0.49,
209
+ "eval_loss": 0.629724383354187,
210
+ "eval_runtime": 6.0378,
211
+ "eval_samples_per_second": 193.943,
212
+ "eval_steps_per_second": 24.346,
213
  "step": 300
214
  },
215
  {
216
  "epoch": 0.51,
217
  "learning_rate": 0.00062,
218
+ "loss": 0.6995,
219
  "step": 310
220
  },
221
  {
222
  "epoch": 0.52,
223
  "learning_rate": 0.00064,
224
+ "loss": 0.724,
225
  "step": 320
226
  },
227
  {
228
  "epoch": 0.54,
229
  "learning_rate": 0.00066,
230
+ "loss": 0.6698,
231
  "step": 330
232
  },
233
  {
234
  "epoch": 0.55,
235
  "learning_rate": 0.00068,
236
+ "loss": 0.6516,
237
  "step": 340
238
  },
239
  {
240
  "epoch": 0.57,
241
  "learning_rate": 0.0007,
242
+ "loss": 0.6657,
243
  "step": 350
244
  },
245
  {
246
  "epoch": 0.59,
247
  "learning_rate": 0.0007199999999999999,
248
+ "loss": 0.6765,
249
  "step": 360
250
  },
251
  {
252
  "epoch": 0.6,
253
  "learning_rate": 0.00074,
254
+ "loss": 0.6596,
255
  "step": 370
256
  },
257
  {
258
  "epoch": 0.62,
259
  "learning_rate": 0.00076,
260
+ "loss": 0.6884,
261
  "step": 380
262
  },
263
  {
264
  "epoch": 0.64,
265
  "learning_rate": 0.0007800000000000001,
266
+ "loss": 0.647,
267
  "step": 390
268
  },
269
  {
270
  "epoch": 0.65,
271
  "learning_rate": 0.0008,
272
+ "loss": 0.713,
273
  "step": 400
274
  },
275
  {
276
  "epoch": 0.65,
277
+ "eval_loss": 0.5541791319847107,
278
+ "eval_runtime": 6.0716,
279
+ "eval_samples_per_second": 192.864,
280
+ "eval_steps_per_second": 24.211,
281
  "step": 400
282
  },
283
  {
284
  "epoch": 0.67,
285
  "learning_rate": 0.00082,
286
+ "loss": 0.6593,
287
  "step": 410
288
  },
289
  {
290
  "epoch": 0.69,
291
  "learning_rate": 0.00084,
292
+ "loss": 0.62,
293
  "step": 420
294
  },
295
  {
296
  "epoch": 0.7,
297
  "learning_rate": 0.00086,
298
+ "loss": 0.6912,
299
  "step": 430
300
  },
301
  {
302
  "epoch": 0.72,
303
  "learning_rate": 0.00088,
304
+ "loss": 0.6407,
305
  "step": 440
306
  },
307
  {
308
  "epoch": 0.73,
309
  "learning_rate": 0.0009000000000000001,
310
+ "loss": 0.6444,
311
  "step": 450
312
  },
313
  {
314
  "epoch": 0.75,
315
  "learning_rate": 0.00092,
316
+ "loss": 0.6591,
317
  "step": 460
318
  },
319
  {
320
  "epoch": 0.77,
321
  "learning_rate": 0.00094,
322
+ "loss": 0.6329,
323
  "step": 470
324
  },
325
  {
326
  "epoch": 0.78,
327
  "learning_rate": 0.00096,
328
+ "loss": 0.6097,
329
  "step": 480
330
  },
331
  {
332
  "epoch": 0.8,
333
  "learning_rate": 0.00098,
334
+ "loss": 0.6444,
335
  "step": 490
336
  },
337
  {
338
  "epoch": 0.82,
339
  "learning_rate": 0.001,
340
+ "loss": 0.6106,
341
  "step": 500
342
  },
343
  {
344
  "epoch": 0.82,
345
+ "eval_loss": 0.5469470024108887,
346
+ "eval_runtime": 6.1353,
347
+ "eval_samples_per_second": 190.861,
348
+ "eval_steps_per_second": 23.96,
349
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  }
351
  ],
352
  "logging_steps": 10,
353
  "max_steps": 6130,
354
  "num_train_epochs": 10,
355
  "save_steps": 500,
356
+ "total_flos": 939361765588992.0,
357
  "trial_name": null,
358
  "trial_params": null
359
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef926b73ee7f8ea582bb0c8e88b44eeac71091525992f56ddfde5d64524b7acf
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82f8daa53495bfef11faca7a8d954f8023465412fa138e9c80f9e8382c966a8
3
  size 4600