somaia02 committed on
Commit
858ae23
·
1 Parent(s): 60a0fa7

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": null,
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
@@ -16,9 +16,9 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "k_proj",
20
- "v_proj",
21
- "q_proj"
22
  ],
23
  "task_type": "CAUSAL_LM"
24
  }
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "moussaKam/AraBART",
5
  "bias": "none",
6
  "fan_in_fan_out": false,
7
  "inference_mode": true,
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "q_proj",
20
  "k_proj",
21
+ "v_proj"
 
22
  ],
23
  "task_type": "CAUSAL_LM"
24
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7935188532ca057c97e35d2ac25f19e5fae4cf310316411033753c2aedac4f4f
3
- size 2671008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a00240c5d34a9ff7edd4dd5a3e092a2a18c12ba3bd2b86f429da7a7b14e38e
3
+ size 2669168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:466f7b6852211bc4dc3e6a84306a602d73d2cbae547dc06726b7afc040129e28
3
  size 5399290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6861692a197e3d3172244c88b75085295c5d68e644c6e610f8fd0d1cb88cdb8
3
  size 5399290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44d78c7a5b58042ac25fc7da4ea65482e30dd6f8da26fca722f9c52e7f8076c1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d2b8552fa83772d62927b9906f821a2ecef0bd6edbcaab4a1c02dfdd6b1cee
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a9689c060fa18f4a65213217a79366f663c930ab5ba6ad66745c2e7b18c5a2f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c82bedf0a611f290596df2fde142fbda2afa059d93dc846b92ee4f876380a79
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1429 +1,359 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 30,
6
- "global_step": 1635,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.01,
13
  "learning_rate": 2e-05,
14
- "loss": 2.9348,
15
  "step": 10
16
  },
17
  {
18
- "epoch": 0.01,
19
  "learning_rate": 4e-05,
20
- "loss": 2.993,
21
  "step": 20
22
  },
23
  {
24
- "epoch": 0.02,
25
  "learning_rate": 6e-05,
26
- "loss": 2.6684,
27
- "step": 30
28
- },
29
- {
30
- "epoch": 0.02,
31
- "eval_loss": 2.1267645359039307,
32
- "eval_runtime": 14.5639,
33
- "eval_samples_per_second": 80.404,
34
- "eval_steps_per_second": 6.729,
35
  "step": 30
36
  },
37
  {
38
- "epoch": 0.02,
39
  "learning_rate": 8e-05,
40
- "loss": 2.493,
41
  "step": 40
42
  },
43
  {
44
- "epoch": 0.03,
45
  "learning_rate": 0.0001,
46
- "loss": 2.1131,
47
  "step": 50
48
  },
49
  {
50
- "epoch": 0.04,
51
  "learning_rate": 0.00012,
52
- "loss": 1.7994,
53
- "step": 60
54
- },
55
- {
56
- "epoch": 0.04,
57
- "eval_loss": 1.461239218711853,
58
- "eval_runtime": 15.3978,
59
- "eval_samples_per_second": 76.05,
60
- "eval_steps_per_second": 6.365,
61
  "step": 60
62
  },
63
  {
64
- "epoch": 0.04,
65
  "learning_rate": 0.00014000000000000001,
66
- "loss": 1.3698,
67
  "step": 70
68
  },
69
  {
70
- "epoch": 0.05,
71
  "learning_rate": 0.00016,
72
- "loss": 1.205,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.06,
77
  "learning_rate": 0.00017999999999999998,
78
- "loss": 1.1072,
79
  "step": 90
80
  },
81
  {
82
- "epoch": 0.06,
83
- "eval_loss": 0.979037344455719,
84
- "eval_runtime": 15.999,
85
- "eval_samples_per_second": 73.192,
86
- "eval_steps_per_second": 6.125,
87
- "step": 90
88
  },
89
  {
90
- "epoch": 0.06,
91
- "learning_rate": 0.0002,
92
- "loss": 1.154,
 
 
93
  "step": 100
94
  },
95
  {
96
- "epoch": 0.07,
97
  "learning_rate": 0.00022,
98
- "loss": 0.9462,
99
  "step": 110
100
  },
101
  {
102
- "epoch": 0.07,
103
  "learning_rate": 0.00024,
104
- "loss": 1.052,
105
- "step": 120
106
- },
107
- {
108
- "epoch": 0.07,
109
- "eval_loss": 0.8346178531646729,
110
- "eval_runtime": 15.0815,
111
- "eval_samples_per_second": 77.645,
112
- "eval_steps_per_second": 6.498,
113
  "step": 120
114
  },
115
  {
116
- "epoch": 0.08,
117
  "learning_rate": 0.00026000000000000003,
118
- "loss": 1.0541,
119
  "step": 130
120
  },
121
  {
122
- "epoch": 0.09,
123
  "learning_rate": 0.00028000000000000003,
124
- "loss": 0.9987,
125
  "step": 140
126
  },
127
  {
128
- "epoch": 0.09,
129
  "learning_rate": 0.0003,
130
- "loss": 0.9079,
131
- "step": 150
132
- },
133
- {
134
- "epoch": 0.09,
135
- "eval_loss": 0.8187346458435059,
136
- "eval_runtime": 15.0493,
137
- "eval_samples_per_second": 77.811,
138
- "eval_steps_per_second": 6.512,
139
  "step": 150
140
  },
141
  {
142
- "epoch": 0.1,
143
  "learning_rate": 0.00032,
144
- "loss": 0.9271,
145
  "step": 160
146
  },
147
  {
148
- "epoch": 0.1,
149
  "learning_rate": 0.00034,
150
- "loss": 0.9079,
151
  "step": 170
152
  },
153
  {
154
- "epoch": 0.11,
155
  "learning_rate": 0.00035999999999999997,
156
- "loss": 0.8843,
157
- "step": 180
158
- },
159
- {
160
- "epoch": 0.11,
161
- "eval_loss": 0.7424212694168091,
162
- "eval_runtime": 15.4048,
163
- "eval_samples_per_second": 76.016,
164
- "eval_steps_per_second": 6.362,
165
  "step": 180
166
  },
167
  {
168
- "epoch": 0.12,
169
  "learning_rate": 0.00038,
170
- "loss": 0.818,
171
  "step": 190
172
  },
173
  {
174
- "epoch": 0.12,
175
  "learning_rate": 0.0004,
176
- "loss": 0.8288,
177
  "step": 200
178
  },
179
  {
180
- "epoch": 0.13,
181
- "learning_rate": 0.00042,
182
- "loss": 0.9119,
183
- "step": 210
 
 
184
  },
185
  {
186
- "epoch": 0.13,
187
- "eval_loss": 0.7072588205337524,
188
- "eval_runtime": 15.4947,
189
- "eval_samples_per_second": 75.574,
190
- "eval_steps_per_second": 6.325,
191
  "step": 210
192
  },
193
  {
194
- "epoch": 0.13,
195
  "learning_rate": 0.00044,
196
- "loss": 0.8642,
197
  "step": 220
198
  },
199
  {
200
- "epoch": 0.14,
201
  "learning_rate": 0.00046,
202
- "loss": 0.7842,
203
  "step": 230
204
  },
205
  {
206
- "epoch": 0.15,
207
  "learning_rate": 0.00048,
208
- "loss": 0.8218,
209
- "step": 240
210
- },
211
- {
212
- "epoch": 0.15,
213
- "eval_loss": 0.6722940802574158,
214
- "eval_runtime": 15.2385,
215
- "eval_samples_per_second": 76.845,
216
- "eval_steps_per_second": 6.431,
217
  "step": 240
218
  },
219
  {
220
- "epoch": 0.15,
221
  "learning_rate": 0.0005,
222
- "loss": 0.8302,
223
  "step": 250
224
  },
225
  {
226
- "epoch": 0.16,
227
  "learning_rate": 0.0005200000000000001,
228
- "loss": 0.81,
229
  "step": 260
230
  },
231
  {
232
- "epoch": 0.17,
233
  "learning_rate": 0.00054,
234
- "loss": 0.7181,
235
- "step": 270
236
- },
237
- {
238
- "epoch": 0.17,
239
- "eval_loss": 0.6630179286003113,
240
- "eval_runtime": 15.2349,
241
- "eval_samples_per_second": 76.863,
242
- "eval_steps_per_second": 6.433,
243
  "step": 270
244
  },
245
  {
246
- "epoch": 0.17,
247
  "learning_rate": 0.0005600000000000001,
248
- "loss": 0.7874,
249
  "step": 280
250
  },
251
  {
252
- "epoch": 0.18,
253
  "learning_rate": 0.00058,
254
- "loss": 0.7636,
255
  "step": 290
256
  },
257
  {
258
- "epoch": 0.18,
259
  "learning_rate": 0.0006,
260
- "loss": 0.7933,
261
  "step": 300
262
  },
263
  {
264
- "epoch": 0.18,
265
- "eval_loss": 0.648048460483551,
266
- "eval_runtime": 15.3271,
267
- "eval_samples_per_second": 76.401,
268
- "eval_steps_per_second": 6.394,
269
  "step": 300
270
  },
271
  {
272
- "epoch": 0.19,
273
  "learning_rate": 0.00062,
274
- "loss": 0.806,
275
  "step": 310
276
  },
277
  {
278
- "epoch": 0.2,
279
  "learning_rate": 0.00064,
280
- "loss": 0.8229,
281
  "step": 320
282
  },
283
  {
284
- "epoch": 0.2,
285
  "learning_rate": 0.00066,
286
- "loss": 0.7979,
287
- "step": 330
288
- },
289
- {
290
- "epoch": 0.2,
291
- "eval_loss": 0.6749615669250488,
292
- "eval_runtime": 15.4179,
293
- "eval_samples_per_second": 75.951,
294
- "eval_steps_per_second": 6.356,
295
  "step": 330
296
  },
297
  {
298
- "epoch": 0.21,
299
  "learning_rate": 0.00068,
300
- "loss": 0.7493,
301
  "step": 340
302
  },
303
  {
304
- "epoch": 0.21,
305
  "learning_rate": 0.0007,
306
- "loss": 0.8082,
307
  "step": 350
308
  },
309
  {
310
- "epoch": 0.22,
311
  "learning_rate": 0.0007199999999999999,
312
- "loss": 0.7077,
313
- "step": 360
314
- },
315
- {
316
- "epoch": 0.22,
317
- "eval_loss": 0.6362787485122681,
318
- "eval_runtime": 15.3758,
319
- "eval_samples_per_second": 76.159,
320
- "eval_steps_per_second": 6.374,
321
  "step": 360
322
  },
323
  {
324
- "epoch": 0.23,
325
  "learning_rate": 0.00074,
326
- "loss": 0.7082,
327
  "step": 370
328
  },
329
  {
330
- "epoch": 0.23,
331
  "learning_rate": 0.00076,
332
- "loss": 0.7552,
333
  "step": 380
334
  },
335
  {
336
- "epoch": 0.24,
337
  "learning_rate": 0.0007800000000000001,
338
- "loss": 0.7012,
339
  "step": 390
340
  },
341
  {
342
- "epoch": 0.24,
343
- "eval_loss": 0.6303613781929016,
344
- "eval_runtime": 15.3461,
345
- "eval_samples_per_second": 76.306,
346
- "eval_steps_per_second": 6.386,
347
- "step": 390
348
  },
349
  {
350
- "epoch": 0.24,
351
- "learning_rate": 0.0008,
352
- "loss": 0.6524,
 
 
353
  "step": 400
354
  },
355
  {
356
- "epoch": 0.25,
357
  "learning_rate": 0.00082,
358
- "loss": 0.7403,
359
  "step": 410
360
  },
361
  {
362
- "epoch": 0.26,
363
  "learning_rate": 0.00084,
364
- "loss": 0.7808,
365
- "step": 420
366
- },
367
- {
368
- "epoch": 0.26,
369
- "eval_loss": 0.6001694798469543,
370
- "eval_runtime": 15.2815,
371
- "eval_samples_per_second": 76.629,
372
- "eval_steps_per_second": 6.413,
373
  "step": 420
374
  },
375
  {
376
- "epoch": 0.26,
377
  "learning_rate": 0.00086,
378
- "loss": 0.6942,
379
  "step": 430
380
  },
381
  {
382
- "epoch": 0.27,
383
  "learning_rate": 0.00088,
384
- "loss": 0.6691,
385
  "step": 440
386
  },
387
  {
388
- "epoch": 0.28,
389
  "learning_rate": 0.0009000000000000001,
390
- "loss": 0.613,
391
  "step": 450
392
  },
393
  {
394
- "epoch": 0.28,
395
- "eval_loss": 0.6457517743110657,
396
- "eval_runtime": 15.2997,
397
- "eval_samples_per_second": 76.537,
398
- "eval_steps_per_second": 6.405,
399
- "step": 450
400
- },
401
- {
402
- "epoch": 0.28,
403
  "learning_rate": 0.00092,
404
- "loss": 0.7459,
405
  "step": 460
406
  },
407
  {
408
- "epoch": 0.29,
409
  "learning_rate": 0.00094,
410
- "loss": 0.7077,
411
  "step": 470
412
  },
413
  {
414
- "epoch": 0.29,
415
  "learning_rate": 0.00096,
416
- "loss": 0.6939,
417
- "step": 480
418
- },
419
- {
420
- "epoch": 0.29,
421
- "eval_loss": 0.6020320057868958,
422
- "eval_runtime": 15.2909,
423
- "eval_samples_per_second": 76.581,
424
- "eval_steps_per_second": 6.409,
425
  "step": 480
426
  },
427
  {
428
- "epoch": 0.3,
429
  "learning_rate": 0.00098,
430
- "loss": 0.7313,
431
  "step": 490
432
  },
433
  {
434
- "epoch": 0.31,
435
  "learning_rate": 0.001,
436
- "loss": 0.6848,
437
  "step": 500
438
  },
439
- {
440
- "epoch": 0.31,
441
- "learning_rate": 0.0009993690851735015,
442
- "loss": 0.7805,
443
- "step": 510
444
- },
445
- {
446
- "epoch": 0.31,
447
- "eval_loss": 0.5767749547958374,
448
- "eval_runtime": 15.3007,
449
- "eval_samples_per_second": 76.532,
450
- "eval_steps_per_second": 6.405,
451
- "step": 510
452
- },
453
- {
454
- "epoch": 0.32,
455
- "learning_rate": 0.0009987381703470033,
456
- "loss": 0.6732,
457
- "step": 520
458
- },
459
- {
460
- "epoch": 0.32,
461
- "learning_rate": 0.0009981072555205047,
462
- "loss": 0.6052,
463
- "step": 530
464
- },
465
- {
466
- "epoch": 0.33,
467
- "learning_rate": 0.0009974763406940062,
468
- "loss": 0.664,
469
- "step": 540
470
- },
471
- {
472
- "epoch": 0.33,
473
- "eval_loss": 0.5918195843696594,
474
- "eval_runtime": 15.2896,
475
- "eval_samples_per_second": 76.588,
476
- "eval_steps_per_second": 6.41,
477
- "step": 540
478
- },
479
- {
480
- "epoch": 0.34,
481
- "learning_rate": 0.000996845425867508,
482
- "loss": 0.657,
483
- "step": 550
484
- },
485
- {
486
- "epoch": 0.34,
487
- "learning_rate": 0.0009962145110410095,
488
- "loss": 0.6524,
489
- "step": 560
490
- },
491
- {
492
- "epoch": 0.35,
493
- "learning_rate": 0.0009955835962145111,
494
- "loss": 0.7756,
495
- "step": 570
496
- },
497
- {
498
- "epoch": 0.35,
499
- "eval_loss": 0.5548932552337646,
500
- "eval_runtime": 15.3185,
501
- "eval_samples_per_second": 76.443,
502
- "eval_steps_per_second": 6.397,
503
- "step": 570
504
- },
505
- {
506
- "epoch": 0.35,
507
- "learning_rate": 0.0009949526813880128,
508
- "loss": 0.6771,
509
- "step": 580
510
- },
511
- {
512
- "epoch": 0.36,
513
- "learning_rate": 0.0009943217665615142,
514
- "loss": 0.6054,
515
- "step": 590
516
- },
517
- {
518
- "epoch": 0.37,
519
- "learning_rate": 0.0009936908517350158,
520
- "loss": 0.6582,
521
- "step": 600
522
- },
523
- {
524
- "epoch": 0.37,
525
- "eval_loss": 0.5589831471443176,
526
- "eval_runtime": 15.3491,
527
- "eval_samples_per_second": 76.291,
528
- "eval_steps_per_second": 6.385,
529
- "step": 600
530
- },
531
- {
532
- "epoch": 0.37,
533
- "learning_rate": 0.0009930599369085175,
534
- "loss": 0.6386,
535
- "step": 610
536
- },
537
- {
538
- "epoch": 0.38,
539
- "learning_rate": 0.000992429022082019,
540
- "loss": 0.6355,
541
- "step": 620
542
- },
543
- {
544
- "epoch": 0.39,
545
- "learning_rate": 0.0009917981072555206,
546
- "loss": 0.6883,
547
- "step": 630
548
- },
549
- {
550
- "epoch": 0.39,
551
- "eval_loss": 0.5954719185829163,
552
- "eval_runtime": 15.317,
553
- "eval_samples_per_second": 76.451,
554
- "eval_steps_per_second": 6.398,
555
- "step": 630
556
- },
557
- {
558
- "epoch": 0.39,
559
- "learning_rate": 0.0009911671924290222,
560
- "loss": 0.6283,
561
- "step": 640
562
- },
563
- {
564
- "epoch": 0.4,
565
- "learning_rate": 0.0009905362776025236,
566
- "loss": 0.6973,
567
- "step": 650
568
- },
569
- {
570
- "epoch": 0.4,
571
- "learning_rate": 0.0009899053627760253,
572
- "loss": 0.6252,
573
- "step": 660
574
- },
575
- {
576
- "epoch": 0.4,
577
- "eval_loss": 0.5695869326591492,
578
- "eval_runtime": 15.3301,
579
- "eval_samples_per_second": 76.386,
580
- "eval_steps_per_second": 6.393,
581
- "step": 660
582
- },
583
- {
584
- "epoch": 0.41,
585
- "learning_rate": 0.000989274447949527,
586
- "loss": 0.6971,
587
- "step": 670
588
- },
589
- {
590
- "epoch": 0.42,
591
- "learning_rate": 0.0009886435331230286,
592
- "loss": 0.6276,
593
- "step": 680
594
- },
595
- {
596
- "epoch": 0.42,
597
- "learning_rate": 0.00098801261829653,
598
- "loss": 0.6776,
599
- "step": 690
600
- },
601
- {
602
- "epoch": 0.42,
603
- "eval_loss": 0.5438072085380554,
604
- "eval_runtime": 15.2869,
605
- "eval_samples_per_second": 76.602,
606
- "eval_steps_per_second": 6.411,
607
- "step": 690
608
- },
609
- {
610
- "epoch": 0.43,
611
- "learning_rate": 0.0009873817034700316,
612
- "loss": 0.5928,
613
- "step": 700
614
- },
615
- {
616
- "epoch": 0.43,
617
- "learning_rate": 0.0009867507886435333,
618
- "loss": 0.6095,
619
- "step": 710
620
- },
621
- {
622
- "epoch": 0.44,
623
- "learning_rate": 0.0009861198738170347,
624
- "loss": 0.6427,
625
- "step": 720
626
- },
627
- {
628
- "epoch": 0.44,
629
- "eval_loss": 0.5402929186820984,
630
- "eval_runtime": 15.332,
631
- "eval_samples_per_second": 76.376,
632
- "eval_steps_per_second": 6.392,
633
- "step": 720
634
- },
635
- {
636
- "epoch": 0.45,
637
- "learning_rate": 0.0009854889589905364,
638
- "loss": 0.5991,
639
- "step": 730
640
- },
641
- {
642
- "epoch": 0.45,
643
- "learning_rate": 0.000984858044164038,
644
- "loss": 0.6153,
645
- "step": 740
646
- },
647
- {
648
- "epoch": 0.46,
649
- "learning_rate": 0.0009842271293375394,
650
- "loss": 0.6422,
651
- "step": 750
652
- },
653
- {
654
- "epoch": 0.46,
655
- "eval_loss": 0.55536949634552,
656
- "eval_runtime": 15.315,
657
- "eval_samples_per_second": 76.461,
658
- "eval_steps_per_second": 6.399,
659
- "step": 750
660
- },
661
- {
662
- "epoch": 0.46,
663
- "learning_rate": 0.000983596214511041,
664
- "loss": 0.6874,
665
- "step": 760
666
- },
667
- {
668
- "epoch": 0.47,
669
- "learning_rate": 0.0009829652996845427,
670
- "loss": 0.6588,
671
- "step": 770
672
- },
673
- {
674
- "epoch": 0.48,
675
- "learning_rate": 0.0009823343848580442,
676
- "loss": 0.5467,
677
- "step": 780
678
- },
679
- {
680
- "epoch": 0.48,
681
- "eval_loss": 0.5687702298164368,
682
- "eval_runtime": 15.3018,
683
- "eval_samples_per_second": 76.527,
684
- "eval_steps_per_second": 6.404,
685
- "step": 780
686
- },
687
- {
688
- "epoch": 0.48,
689
- "learning_rate": 0.0009817034700315458,
690
- "loss": 0.5929,
691
- "step": 790
692
- },
693
- {
694
- "epoch": 0.49,
695
- "learning_rate": 0.0009810725552050475,
696
- "loss": 0.6407,
697
- "step": 800
698
- },
699
- {
700
- "epoch": 0.5,
701
- "learning_rate": 0.0009804416403785489,
702
- "loss": 0.621,
703
- "step": 810
704
- },
705
- {
706
- "epoch": 0.5,
707
- "eval_loss": 0.5593787431716919,
708
- "eval_runtime": 15.32,
709
- "eval_samples_per_second": 76.436,
710
- "eval_steps_per_second": 6.397,
711
- "step": 810
712
- },
713
- {
714
- "epoch": 0.5,
715
- "learning_rate": 0.0009798107255520505,
716
- "loss": 0.6191,
717
- "step": 820
718
- },
719
- {
720
- "epoch": 0.51,
721
- "learning_rate": 0.0009791798107255522,
722
- "loss": 0.6468,
723
- "step": 830
724
- },
725
- {
726
- "epoch": 0.51,
727
- "learning_rate": 0.0009785488958990536,
728
- "loss": 0.6486,
729
- "step": 840
730
- },
731
- {
732
- "epoch": 0.51,
733
- "eval_loss": 0.542827308177948,
734
- "eval_runtime": 15.3141,
735
- "eval_samples_per_second": 76.466,
736
- "eval_steps_per_second": 6.399,
737
- "step": 840
738
- },
739
- {
740
- "epoch": 0.52,
741
- "learning_rate": 0.0009779179810725552,
742
- "loss": 0.6503,
743
- "step": 850
744
- },
745
- {
746
- "epoch": 0.53,
747
- "learning_rate": 0.000977287066246057,
748
- "loss": 0.5697,
749
- "step": 860
750
- },
751
- {
752
- "epoch": 0.53,
753
- "learning_rate": 0.0009766561514195583,
754
- "loss": 0.5993,
755
- "step": 870
756
- },
757
- {
758
- "epoch": 0.53,
759
- "eval_loss": 0.5758046507835388,
760
- "eval_runtime": 15.2882,
761
- "eval_samples_per_second": 76.595,
762
- "eval_steps_per_second": 6.41,
763
- "step": 870
764
- },
765
- {
766
- "epoch": 0.54,
767
- "learning_rate": 0.00097602523659306,
768
- "loss": 0.6151,
769
- "step": 880
770
- },
771
- {
772
- "epoch": 0.54,
773
- "learning_rate": 0.0009753943217665615,
774
- "loss": 0.5733,
775
- "step": 890
776
- },
777
- {
778
- "epoch": 0.55,
779
- "learning_rate": 0.0009747634069400632,
780
- "loss": 0.5637,
781
- "step": 900
782
- },
783
- {
784
- "epoch": 0.55,
785
- "eval_loss": 0.5380309224128723,
786
- "eval_runtime": 15.3344,
787
- "eval_samples_per_second": 76.364,
788
- "eval_steps_per_second": 6.391,
789
- "step": 900
790
- },
791
- {
792
- "epoch": 0.56,
793
- "learning_rate": 0.0009741324921135647,
794
- "loss": 0.5768,
795
- "step": 910
796
- },
797
- {
798
- "epoch": 0.56,
799
- "learning_rate": 0.0009735015772870662,
800
- "loss": 0.5445,
801
- "step": 920
802
- },
803
- {
804
- "epoch": 0.57,
805
- "learning_rate": 0.0009728706624605679,
806
- "loss": 0.627,
807
- "step": 930
808
- },
809
- {
810
- "epoch": 0.57,
811
- "eval_loss": 0.5249527096748352,
812
- "eval_runtime": 15.3162,
813
- "eval_samples_per_second": 76.455,
814
- "eval_steps_per_second": 6.398,
815
- "step": 930
816
- },
817
- {
818
- "epoch": 0.57,
819
- "learning_rate": 0.0009722397476340694,
820
- "loss": 0.5396,
821
- "step": 940
822
- },
823
- {
824
- "epoch": 0.58,
825
- "learning_rate": 0.000971608832807571,
826
- "loss": 0.6558,
827
- "step": 950
828
- },
829
- {
830
- "epoch": 0.59,
831
- "learning_rate": 0.0009709779179810726,
832
- "loss": 0.5876,
833
- "step": 960
834
- },
835
- {
836
- "epoch": 0.59,
837
- "eval_loss": 0.5324742197990417,
838
- "eval_runtime": 15.3008,
839
- "eval_samples_per_second": 76.532,
840
- "eval_steps_per_second": 6.405,
841
- "step": 960
842
- },
843
- {
844
- "epoch": 0.59,
845
- "learning_rate": 0.0009703470031545741,
846
- "loss": 0.581,
847
- "step": 970
848
- },
849
- {
850
- "epoch": 0.6,
851
- "learning_rate": 0.0009697160883280758,
852
- "loss": 0.5938,
853
- "step": 980
854
- },
855
- {
856
- "epoch": 0.61,
857
- "learning_rate": 0.0009690851735015773,
858
- "loss": 0.5539,
859
- "step": 990
860
- },
861
- {
862
- "epoch": 0.61,
863
- "eval_loss": 0.5384231209754944,
864
- "eval_runtime": 15.3241,
865
- "eval_samples_per_second": 76.416,
866
- "eval_steps_per_second": 6.395,
867
- "step": 990
868
- },
869
- {
870
- "epoch": 0.61,
871
- "learning_rate": 0.0009684542586750789,
872
- "loss": 0.6287,
873
- "step": 1000
874
- },
875
- {
876
- "epoch": 0.62,
877
- "learning_rate": 0.0009678233438485805,
878
- "loss": 0.6094,
879
- "step": 1010
880
- },
881
- {
882
- "epoch": 0.62,
883
- "learning_rate": 0.000967192429022082,
884
- "loss": 0.5671,
885
- "step": 1020
886
- },
887
- {
888
- "epoch": 0.62,
889
- "eval_loss": 0.5662776827812195,
890
- "eval_runtime": 15.3105,
891
- "eval_samples_per_second": 76.483,
892
- "eval_steps_per_second": 6.401,
893
- "step": 1020
894
- },
895
- {
896
- "epoch": 0.63,
897
- "learning_rate": 0.0009665615141955836,
898
- "loss": 0.5863,
899
- "step": 1030
900
- },
901
- {
902
- "epoch": 0.64,
903
- "learning_rate": 0.0009659305993690852,
904
- "loss": 0.5675,
905
- "step": 1040
906
- },
907
- {
908
- "epoch": 0.64,
909
- "learning_rate": 0.0009652996845425868,
910
- "loss": 0.714,
911
- "step": 1050
912
- },
913
- {
914
- "epoch": 0.64,
915
- "eval_loss": 0.5345898270606995,
916
- "eval_runtime": 15.3604,
917
- "eval_samples_per_second": 76.235,
918
- "eval_steps_per_second": 6.38,
919
- "step": 1050
920
- },
921
- {
922
- "epoch": 0.65,
923
- "learning_rate": 0.0009646687697160883,
924
- "loss": 0.6364,
925
- "step": 1060
926
- },
927
- {
928
- "epoch": 0.65,
929
- "learning_rate": 0.0009640378548895899,
930
- "loss": 0.5733,
931
- "step": 1070
932
- },
933
- {
934
- "epoch": 0.66,
935
- "learning_rate": 0.0009634069400630915,
936
- "loss": 0.6243,
937
- "step": 1080
938
- },
939
- {
940
- "epoch": 0.66,
941
- "eval_loss": 0.5188413858413696,
942
- "eval_runtime": 15.3443,
943
- "eval_samples_per_second": 76.315,
944
- "eval_steps_per_second": 6.387,
945
- "step": 1080
946
- },
947
- {
948
- "epoch": 0.67,
949
- "learning_rate": 0.0009627760252365931,
950
- "loss": 0.5955,
951
- "step": 1090
952
- },
953
- {
954
- "epoch": 0.67,
955
- "learning_rate": 0.0009621451104100947,
956
- "loss": 0.5949,
957
- "step": 1100
958
- },
959
- {
960
- "epoch": 0.68,
961
- "learning_rate": 0.0009615141955835962,
962
- "loss": 0.5342,
963
- "step": 1110
964
- },
965
- {
966
- "epoch": 0.68,
967
- "eval_loss": 0.5572788119316101,
968
- "eval_runtime": 15.4088,
969
- "eval_samples_per_second": 75.996,
970
- "eval_steps_per_second": 6.36,
971
- "step": 1110
972
- },
973
- {
974
- "epoch": 0.69,
975
- "learning_rate": 0.0009608832807570978,
976
- "loss": 0.5371,
977
- "step": 1120
978
- },
979
- {
980
- "epoch": 0.69,
981
- "learning_rate": 0.0009602523659305994,
982
- "loss": 0.6767,
983
- "step": 1130
984
- },
985
- {
986
- "epoch": 0.7,
987
- "learning_rate": 0.0009596214511041009,
988
- "loss": 0.589,
989
- "step": 1140
990
- },
991
- {
992
- "epoch": 0.7,
993
- "eval_loss": 0.5333449840545654,
994
- "eval_runtime": 15.3542,
995
- "eval_samples_per_second": 76.266,
996
- "eval_steps_per_second": 6.383,
997
- "step": 1140
998
- },
999
- {
1000
- "epoch": 0.7,
1001
- "learning_rate": 0.0009589905362776026,
1002
- "loss": 0.5914,
1003
- "step": 1150
1004
- },
1005
- {
1006
- "epoch": 0.71,
1007
- "learning_rate": 0.0009583596214511041,
1008
- "loss": 0.547,
1009
- "step": 1160
1010
- },
1011
- {
1012
- "epoch": 0.72,
1013
- "learning_rate": 0.0009577287066246056,
1014
- "loss": 0.577,
1015
- "step": 1170
1016
- },
1017
- {
1018
- "epoch": 0.72,
1019
- "eval_loss": 0.520677924156189,
1020
- "eval_runtime": 15.3767,
1021
- "eval_samples_per_second": 76.154,
1022
- "eval_steps_per_second": 6.373,
1023
- "step": 1170
1024
- },
1025
- {
1026
- "epoch": 0.72,
1027
- "learning_rate": 0.0009570977917981073,
1028
- "loss": 0.6002,
1029
- "step": 1180
1030
- },
1031
- {
1032
- "epoch": 0.73,
1033
- "learning_rate": 0.0009564668769716088,
1034
- "loss": 0.6236,
1035
- "step": 1190
1036
- },
1037
- {
1038
- "epoch": 0.73,
1039
- "learning_rate": 0.0009558359621451105,
1040
- "loss": 0.5757,
1041
- "step": 1200
1042
- },
1043
- {
1044
- "epoch": 0.73,
1045
- "eval_loss": 0.5401705503463745,
1046
- "eval_runtime": 15.3622,
1047
- "eval_samples_per_second": 76.226,
1048
- "eval_steps_per_second": 6.379,
1049
- "step": 1200
1050
- },
1051
- {
1052
- "epoch": 0.74,
1053
- "learning_rate": 0.000955205047318612,
1054
- "loss": 0.681,
1055
- "step": 1210
1056
- },
1057
- {
1058
- "epoch": 0.75,
1059
- "learning_rate": 0.0009545741324921136,
1060
- "loss": 0.5508,
1061
- "step": 1220
1062
- },
1063
- {
1064
- "epoch": 0.75,
1065
- "learning_rate": 0.0009539432176656152,
1066
- "loss": 0.5726,
1067
- "step": 1230
1068
- },
1069
- {
1070
- "epoch": 0.75,
1071
- "eval_loss": 0.5101749897003174,
1072
- "eval_runtime": 15.3465,
1073
- "eval_samples_per_second": 76.304,
1074
- "eval_steps_per_second": 6.386,
1075
- "step": 1230
1076
- },
1077
- {
1078
- "epoch": 0.76,
1079
- "learning_rate": 0.0009533123028391167,
1080
- "loss": 0.5528,
1081
- "step": 1240
1082
- },
1083
- {
1084
- "epoch": 0.76,
1085
- "learning_rate": 0.0009526813880126183,
1086
- "loss": 0.6107,
1087
- "step": 1250
1088
- },
1089
- {
1090
- "epoch": 0.77,
1091
- "learning_rate": 0.0009520504731861199,
1092
- "loss": 0.5675,
1093
- "step": 1260
1094
- },
1095
- {
1096
- "epoch": 0.77,
1097
- "eval_loss": 0.5156053900718689,
1098
- "eval_runtime": 15.3646,
1099
- "eval_samples_per_second": 76.214,
1100
- "eval_steps_per_second": 6.378,
1101
- "step": 1260
1102
- },
1103
- {
1104
- "epoch": 0.78,
1105
- "learning_rate": 0.0009514195583596215,
1106
- "loss": 0.5795,
1107
- "step": 1270
1108
- },
1109
- {
1110
- "epoch": 0.78,
1111
- "learning_rate": 0.000950788643533123,
1112
- "loss": 0.5489,
1113
- "step": 1280
1114
- },
1115
- {
1116
- "epoch": 0.79,
1117
- "learning_rate": 0.0009501577287066246,
1118
- "loss": 0.5931,
1119
- "step": 1290
1120
- },
1121
- {
1122
- "epoch": 0.79,
1123
- "eval_loss": 0.5232254266738892,
1124
- "eval_runtime": 15.3627,
1125
- "eval_samples_per_second": 76.224,
1126
- "eval_steps_per_second": 6.379,
1127
- "step": 1290
1128
- },
1129
- {
1130
- "epoch": 0.8,
1131
- "learning_rate": 0.0009495268138801262,
1132
- "loss": 0.6082,
1133
- "step": 1300
1134
- },
1135
- {
1136
- "epoch": 0.8,
1137
- "learning_rate": 0.0009488958990536278,
1138
- "loss": 0.6085,
1139
- "step": 1310
1140
- },
1141
- {
1142
- "epoch": 0.81,
1143
- "learning_rate": 0.0009482649842271294,
1144
- "loss": 0.5727,
1145
- "step": 1320
1146
- },
1147
- {
1148
- "epoch": 0.81,
1149
- "eval_loss": 0.5149978399276733,
1150
- "eval_runtime": 15.3731,
1151
- "eval_samples_per_second": 76.172,
1152
- "eval_steps_per_second": 6.375,
1153
- "step": 1320
1154
- },
1155
- {
1156
- "epoch": 0.81,
1157
- "learning_rate": 0.0009476340694006309,
1158
- "loss": 0.5912,
1159
- "step": 1330
1160
- },
1161
  {
1162
  "epoch": 0.82,
1163
- "learning_rate": 0.0009470031545741325,
1164
- "loss": 0.6397,
1165
- "step": 1340
1166
- },
1167
- {
1168
- "epoch": 0.83,
1169
- "learning_rate": 0.0009463722397476341,
1170
- "loss": 0.6233,
1171
- "step": 1350
1172
- },
1173
- {
1174
- "epoch": 0.83,
1175
- "eval_loss": 0.5305810570716858,
1176
- "eval_runtime": 15.3303,
1177
- "eval_samples_per_second": 76.385,
1178
- "eval_steps_per_second": 6.393,
1179
- "step": 1350
1180
- },
1181
- {
1182
- "epoch": 0.83,
1183
- "learning_rate": 0.0009457413249211356,
1184
- "loss": 0.54,
1185
- "step": 1360
1186
- },
1187
- {
1188
- "epoch": 0.84,
1189
- "learning_rate": 0.0009451104100946373,
1190
- "loss": 0.6118,
1191
- "step": 1370
1192
- },
1193
- {
1194
- "epoch": 0.84,
1195
- "learning_rate": 0.0009444794952681388,
1196
- "loss": 0.5661,
1197
- "step": 1380
1198
- },
1199
- {
1200
- "epoch": 0.84,
1201
- "eval_loss": 0.5178245306015015,
1202
- "eval_runtime": 15.3439,
1203
- "eval_samples_per_second": 76.317,
1204
- "eval_steps_per_second": 6.387,
1205
- "step": 1380
1206
- },
1207
- {
1208
- "epoch": 0.85,
1209
- "learning_rate": 0.0009438485804416403,
1210
- "loss": 0.6365,
1211
- "step": 1390
1212
- },
1213
- {
1214
- "epoch": 0.86,
1215
- "learning_rate": 0.000943217665615142,
1216
- "loss": 0.5635,
1217
- "step": 1400
1218
- },
1219
- {
1220
- "epoch": 0.86,
1221
- "learning_rate": 0.0009425867507886435,
1222
- "loss": 0.5993,
1223
- "step": 1410
1224
- },
1225
- {
1226
- "epoch": 0.86,
1227
- "eval_loss": 0.5094326734542847,
1228
- "eval_runtime": 15.3033,
1229
- "eval_samples_per_second": 76.519,
1230
- "eval_steps_per_second": 6.404,
1231
- "step": 1410
1232
- },
1233
- {
1234
- "epoch": 0.87,
1235
- "learning_rate": 0.0009419558359621452,
1236
- "loss": 0.5921,
1237
- "step": 1420
1238
- },
1239
- {
1240
- "epoch": 0.87,
1241
- "learning_rate": 0.0009413249211356467,
1242
- "loss": 0.5571,
1243
- "step": 1430
1244
- },
1245
- {
1246
- "epoch": 0.88,
1247
- "learning_rate": 0.0009406940063091482,
1248
- "loss": 0.5359,
1249
- "step": 1440
1250
- },
1251
- {
1252
- "epoch": 0.88,
1253
- "eval_loss": 0.5110692977905273,
1254
- "eval_runtime": 15.3327,
1255
- "eval_samples_per_second": 76.373,
1256
- "eval_steps_per_second": 6.392,
1257
- "step": 1440
1258
- },
1259
- {
1260
- "epoch": 0.89,
1261
- "learning_rate": 0.0009400630914826499,
1262
- "loss": 0.5363,
1263
- "step": 1450
1264
- },
1265
- {
1266
- "epoch": 0.89,
1267
- "learning_rate": 0.0009394321766561514,
1268
- "loss": 0.5619,
1269
- "step": 1460
1270
- },
1271
- {
1272
- "epoch": 0.9,
1273
- "learning_rate": 0.000938801261829653,
1274
- "loss": 0.5925,
1275
- "step": 1470
1276
- },
1277
- {
1278
- "epoch": 0.9,
1279
- "eval_loss": 0.5252500176429749,
1280
- "eval_runtime": 15.3611,
1281
- "eval_samples_per_second": 76.231,
1282
- "eval_steps_per_second": 6.38,
1283
- "step": 1470
1284
- },
1285
- {
1286
- "epoch": 0.91,
1287
- "learning_rate": 0.0009381703470031546,
1288
- "loss": 0.611,
1289
- "step": 1480
1290
- },
1291
- {
1292
- "epoch": 0.91,
1293
- "learning_rate": 0.0009375394321766562,
1294
- "loss": 0.581,
1295
- "step": 1490
1296
- },
1297
- {
1298
- "epoch": 0.92,
1299
- "learning_rate": 0.0009369085173501577,
1300
- "loss": 0.4984,
1301
- "step": 1500
1302
- },
1303
- {
1304
- "epoch": 0.92,
1305
- "eval_loss": 0.5312231779098511,
1306
- "eval_runtime": 15.5553,
1307
- "eval_samples_per_second": 75.28,
1308
- "eval_steps_per_second": 6.3,
1309
- "step": 1500
1310
- },
1311
- {
1312
- "epoch": 0.92,
1313
- "learning_rate": 0.0009362776025236593,
1314
- "loss": 0.5921,
1315
- "step": 1510
1316
- },
1317
- {
1318
- "epoch": 0.93,
1319
- "learning_rate": 0.0009356466876971609,
1320
- "loss": 0.5776,
1321
- "step": 1520
1322
- },
1323
- {
1324
- "epoch": 0.94,
1325
- "learning_rate": 0.0009350157728706625,
1326
- "loss": 0.5551,
1327
- "step": 1530
1328
- },
1329
- {
1330
- "epoch": 0.94,
1331
- "eval_loss": 0.515774130821228,
1332
- "eval_runtime": 15.3618,
1333
- "eval_samples_per_second": 76.228,
1334
- "eval_steps_per_second": 6.379,
1335
- "step": 1530
1336
- },
1337
- {
1338
- "epoch": 0.94,
1339
- "learning_rate": 0.0009343848580441641,
1340
- "loss": 0.5832,
1341
- "step": 1540
1342
- },
1343
- {
1344
- "epoch": 0.95,
1345
- "learning_rate": 0.0009337539432176656,
1346
- "loss": 0.5673,
1347
- "step": 1550
1348
- },
1349
- {
1350
- "epoch": 0.95,
1351
- "learning_rate": 0.0009331230283911672,
1352
- "loss": 0.6288,
1353
- "step": 1560
1354
- },
1355
- {
1356
- "epoch": 0.95,
1357
- "eval_loss": 0.4972882866859436,
1358
- "eval_runtime": 15.359,
1359
- "eval_samples_per_second": 76.242,
1360
- "eval_steps_per_second": 6.381,
1361
- "step": 1560
1362
- },
1363
- {
1364
- "epoch": 0.96,
1365
- "learning_rate": 0.0009324921135646688,
1366
- "loss": 0.555,
1367
- "step": 1570
1368
- },
1369
- {
1370
- "epoch": 0.97,
1371
- "learning_rate": 0.0009318611987381703,
1372
- "loss": 0.5405,
1373
- "step": 1580
1374
- },
1375
- {
1376
- "epoch": 0.97,
1377
- "learning_rate": 0.000931230283911672,
1378
- "loss": 0.6164,
1379
- "step": 1590
1380
- },
1381
- {
1382
- "epoch": 0.97,
1383
- "eval_loss": 0.5089274644851685,
1384
- "eval_runtime": 15.3184,
1385
- "eval_samples_per_second": 76.444,
1386
- "eval_steps_per_second": 6.398,
1387
- "step": 1590
1388
- },
1389
- {
1390
- "epoch": 0.98,
1391
- "learning_rate": 0.0009305993690851735,
1392
- "loss": 0.5609,
1393
- "step": 1600
1394
- },
1395
- {
1396
- "epoch": 0.98,
1397
- "learning_rate": 0.000929968454258675,
1398
- "loss": 0.6188,
1399
- "step": 1610
1400
- },
1401
- {
1402
- "epoch": 0.99,
1403
- "learning_rate": 0.0009293375394321767,
1404
- "loss": 0.5423,
1405
- "step": 1620
1406
- },
1407
- {
1408
- "epoch": 0.99,
1409
- "eval_loss": 0.5065773129463196,
1410
- "eval_runtime": 15.2816,
1411
- "eval_samples_per_second": 76.628,
1412
- "eval_steps_per_second": 6.413,
1413
- "step": 1620
1414
- },
1415
- {
1416
- "epoch": 1.0,
1417
- "learning_rate": 0.0009287066246056782,
1418
- "loss": 0.5393,
1419
- "step": 1630
1420
  }
1421
  ],
1422
  "logging_steps": 10,
1423
- "max_steps": 16350,
1424
  "num_train_epochs": 10,
1425
  "save_steps": 500,
1426
- "total_flos": 1250508061458432.0,
1427
  "trial_name": null,
1428
  "trial_params": null
1429
  }
 
1
  {
2
+ "best_metric": 0.5440758466720581,
3
+ "best_model_checkpoint": "bart_lora_outputs\\checkpoint-500",
4
+ "epoch": 0.8156606851549756,
5
+ "eval_steps": 100,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02,
13
  "learning_rate": 2e-05,
14
+ "loss": 2.9197,
15
  "step": 10
16
  },
17
  {
18
+ "epoch": 0.03,
19
  "learning_rate": 4e-05,
20
+ "loss": 2.872,
21
  "step": 20
22
  },
23
  {
24
+ "epoch": 0.05,
25
  "learning_rate": 6e-05,
26
+ "loss": 2.6183,
 
 
 
 
 
 
 
 
27
  "step": 30
28
  },
29
  {
30
+ "epoch": 0.07,
31
  "learning_rate": 8e-05,
32
+ "loss": 2.4094,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.08,
37
  "learning_rate": 0.0001,
38
+ "loss": 2.15,
39
  "step": 50
40
  },
41
  {
42
+ "epoch": 0.1,
43
  "learning_rate": 0.00012,
44
+ "loss": 1.7088,
 
 
 
 
 
 
 
 
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.11,
49
  "learning_rate": 0.00014000000000000001,
50
+ "loss": 1.413,
51
  "step": 70
52
  },
53
  {
54
+ "epoch": 0.13,
55
  "learning_rate": 0.00016,
56
+ "loss": 1.2326,
57
  "step": 80
58
  },
59
  {
60
+ "epoch": 0.15,
61
  "learning_rate": 0.00017999999999999998,
62
+ "loss": 1.1108,
63
  "step": 90
64
  },
65
  {
66
+ "epoch": 0.16,
67
+ "learning_rate": 0.0002,
68
+ "loss": 1.0331,
69
+ "step": 100
 
 
70
  },
71
  {
72
+ "epoch": 0.16,
73
+ "eval_loss": 0.8141101598739624,
74
+ "eval_runtime": 5.616,
75
+ "eval_samples_per_second": 208.511,
76
+ "eval_steps_per_second": 13.177,
77
  "step": 100
78
  },
79
  {
80
+ "epoch": 0.18,
81
  "learning_rate": 0.00022,
82
+ "loss": 0.9807,
83
  "step": 110
84
  },
85
  {
86
+ "epoch": 0.2,
87
  "learning_rate": 0.00024,
88
+ "loss": 0.9853,
 
 
 
 
 
 
 
 
89
  "step": 120
90
  },
91
  {
92
+ "epoch": 0.21,
93
  "learning_rate": 0.00026000000000000003,
94
+ "loss": 0.9411,
95
  "step": 130
96
  },
97
  {
98
+ "epoch": 0.23,
99
  "learning_rate": 0.00028000000000000003,
100
+ "loss": 0.8733,
101
  "step": 140
102
  },
103
  {
104
+ "epoch": 0.24,
105
  "learning_rate": 0.0003,
106
+ "loss": 0.8347,
 
 
 
 
 
 
 
 
107
  "step": 150
108
  },
109
  {
110
+ "epoch": 0.26,
111
  "learning_rate": 0.00032,
112
+ "loss": 0.8786,
113
  "step": 160
114
  },
115
  {
116
+ "epoch": 0.28,
117
  "learning_rate": 0.00034,
118
+ "loss": 0.7676,
119
  "step": 170
120
  },
121
  {
122
+ "epoch": 0.29,
123
  "learning_rate": 0.00035999999999999997,
124
+ "loss": 0.8399,
 
 
 
 
 
 
 
 
125
  "step": 180
126
  },
127
  {
128
+ "epoch": 0.31,
129
  "learning_rate": 0.00038,
130
+ "loss": 0.838,
131
  "step": 190
132
  },
133
  {
134
+ "epoch": 0.33,
135
  "learning_rate": 0.0004,
136
+ "loss": 0.7888,
137
  "step": 200
138
  },
139
  {
140
+ "epoch": 0.33,
141
+ "eval_loss": 0.6890668869018555,
142
+ "eval_runtime": 5.748,
143
+ "eval_samples_per_second": 203.722,
144
+ "eval_steps_per_second": 12.874,
145
+ "step": 200
146
  },
147
  {
148
+ "epoch": 0.34,
149
+ "learning_rate": 0.00042,
150
+ "loss": 0.7596,
 
 
151
  "step": 210
152
  },
153
  {
154
+ "epoch": 0.36,
155
  "learning_rate": 0.00044,
156
+ "loss": 0.7959,
157
  "step": 220
158
  },
159
  {
160
+ "epoch": 0.38,
161
  "learning_rate": 0.00046,
162
+ "loss": 0.7637,
163
  "step": 230
164
  },
165
  {
166
+ "epoch": 0.39,
167
  "learning_rate": 0.00048,
168
+ "loss": 0.7413,
 
 
 
 
 
 
 
 
169
  "step": 240
170
  },
171
  {
172
+ "epoch": 0.41,
173
  "learning_rate": 0.0005,
174
+ "loss": 0.7559,
175
  "step": 250
176
  },
177
  {
178
+ "epoch": 0.42,
179
  "learning_rate": 0.0005200000000000001,
180
+ "loss": 0.7465,
181
  "step": 260
182
  },
183
  {
184
+ "epoch": 0.44,
185
  "learning_rate": 0.00054,
186
+ "loss": 0.7311,
 
 
 
 
 
 
 
 
187
  "step": 270
188
  },
189
  {
190
+ "epoch": 0.46,
191
  "learning_rate": 0.0005600000000000001,
192
+ "loss": 0.7031,
193
  "step": 280
194
  },
195
  {
196
+ "epoch": 0.47,
197
  "learning_rate": 0.00058,
198
+ "loss": 0.7565,
199
  "step": 290
200
  },
201
  {
202
+ "epoch": 0.49,
203
  "learning_rate": 0.0006,
204
+ "loss": 0.6811,
205
  "step": 300
206
  },
207
  {
208
+ "epoch": 0.49,
209
+ "eval_loss": 0.6422508955001831,
210
+ "eval_runtime": 5.623,
211
+ "eval_samples_per_second": 208.251,
212
+ "eval_steps_per_second": 13.16,
213
  "step": 300
214
  },
215
  {
216
+ "epoch": 0.51,
217
  "learning_rate": 0.00062,
218
+ "loss": 0.7063,
219
  "step": 310
220
  },
221
  {
222
+ "epoch": 0.52,
223
  "learning_rate": 0.00064,
224
+ "loss": 0.727,
225
  "step": 320
226
  },
227
  {
228
+ "epoch": 0.54,
229
  "learning_rate": 0.00066,
230
+ "loss": 0.6678,
 
 
 
 
 
 
 
 
231
  "step": 330
232
  },
233
  {
234
+ "epoch": 0.55,
235
  "learning_rate": 0.00068,
236
+ "loss": 0.6465,
237
  "step": 340
238
  },
239
  {
240
+ "epoch": 0.57,
241
  "learning_rate": 0.0007,
242
+ "loss": 0.6647,
243
  "step": 350
244
  },
245
  {
246
+ "epoch": 0.59,
247
  "learning_rate": 0.0007199999999999999,
248
+ "loss": 0.6754,
 
 
 
 
 
 
 
 
249
  "step": 360
250
  },
251
  {
252
+ "epoch": 0.6,
253
  "learning_rate": 0.00074,
254
+ "loss": 0.6419,
255
  "step": 370
256
  },
257
  {
258
+ "epoch": 0.62,
259
  "learning_rate": 0.00076,
260
+ "loss": 0.6746,
261
  "step": 380
262
  },
263
  {
264
+ "epoch": 0.64,
265
  "learning_rate": 0.0007800000000000001,
266
+ "loss": 0.6279,
267
  "step": 390
268
  },
269
  {
270
+ "epoch": 0.65,
271
+ "learning_rate": 0.0008,
272
+ "loss": 0.7124,
273
+ "step": 400
 
 
274
  },
275
  {
276
+ "epoch": 0.65,
277
+ "eval_loss": 0.5565645098686218,
278
+ "eval_runtime": 5.756,
279
+ "eval_samples_per_second": 203.439,
280
+ "eval_steps_per_second": 12.856,
281
  "step": 400
282
  },
283
  {
284
+ "epoch": 0.67,
285
  "learning_rate": 0.00082,
286
+ "loss": 0.6656,
287
  "step": 410
288
  },
289
  {
290
+ "epoch": 0.69,
291
  "learning_rate": 0.00084,
292
+ "loss": 0.6298,
 
 
 
 
 
 
 
 
293
  "step": 420
294
  },
295
  {
296
+ "epoch": 0.7,
297
  "learning_rate": 0.00086,
298
+ "loss": 0.6795,
299
  "step": 430
300
  },
301
  {
302
+ "epoch": 0.72,
303
  "learning_rate": 0.00088,
304
+ "loss": 0.6279,
305
  "step": 440
306
  },
307
  {
308
+ "epoch": 0.73,
309
  "learning_rate": 0.0009000000000000001,
310
+ "loss": 0.6663,
311
  "step": 450
312
  },
313
  {
314
+ "epoch": 0.75,
 
 
 
 
 
 
 
 
315
  "learning_rate": 0.00092,
316
+ "loss": 0.662,
317
  "step": 460
318
  },
319
  {
320
+ "epoch": 0.77,
321
  "learning_rate": 0.00094,
322
+ "loss": 0.6339,
323
  "step": 470
324
  },
325
  {
326
+ "epoch": 0.78,
327
  "learning_rate": 0.00096,
328
+ "loss": 0.6099,
 
 
 
 
 
 
 
 
329
  "step": 480
330
  },
331
  {
332
+ "epoch": 0.8,
333
  "learning_rate": 0.00098,
334
+ "loss": 0.6465,
335
  "step": 490
336
  },
337
  {
338
+ "epoch": 0.82,
339
  "learning_rate": 0.001,
340
+ "loss": 0.615,
341
  "step": 500
342
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  {
344
  "epoch": 0.82,
345
+ "eval_loss": 0.5440758466720581,
346
+ "eval_runtime": 5.541,
347
+ "eval_samples_per_second": 211.333,
348
+ "eval_steps_per_second": 13.355,
349
+ "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  }
351
  ],
352
  "logging_steps": 10,
353
+ "max_steps": 6130,
354
  "num_train_epochs": 10,
355
  "save_steps": 500,
356
+ "total_flos": 1077083681587200.0,
357
  "trial_name": null,
358
  "trial_params": null
359
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed73ee57056e6e53e9f323b135ace6c4a0b4f7e29df3c6690c9977fa6847b593
3
  size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef926b73ee7f8ea582bb0c8e88b44eeac71091525992f56ddfde5d64524b7acf
3
  size 4600