diaenra commited on
Commit
39bc64b
·
verified ·
1 Parent(s): 4631c54

Training in progress, step 1344, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efb94dd7e67139ea99014596d0344d8d906206fb75f42a024b9d316803b1dcd3
3
  size 20652232
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cfd8a262e60cb5b83c188d6510fc245b060e39d1b7b22ccedb2a7cf3f5ff2ce
3
  size 20652232
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d5ef12b992059fa547cbf102a19de53075d31da95dd3a61f8c42d48ee993fd4
3
  size 41455802
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5e951f81ca2dd3bb4c5fe725fab1e06ccce1f821f897ec5ff0487276f71010d
3
  size 41455802
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6cb1e3c0b09c43c829d3b72409988703d0b0f743c46139699298a2f8d5b8aaf
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bfbb2ca33cc9b3557235510ae3199cb13a13731b28befe7890768a81a0291b3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f8ec9aa94796eae9cab514afaf03b7ae0061a66948c3bb6174c97038c0a9991
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b7f1c361dec5189a47d902296a5c2bdaac9aa1cf564418b0600c3c3f362490
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8889715454714525,
5
  "eval_steps": 500,
6
- "global_step": 1195,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -8372,6 +8372,1049 @@
8372
  "learning_rate": 3.498172085070084e-06,
8373
  "loss": 9.5849,
8374
  "step": 1195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8375
  }
8376
  ],
8377
  "logging_steps": 1,
@@ -8386,12 +9429,12 @@
8386
  "should_evaluate": false,
8387
  "should_log": false,
8388
  "should_save": true,
8389
- "should_training_stop": false
8390
  },
8391
  "attributes": {}
8392
  }
8393
  },
8394
- "total_flos": 597570598993920.0,
8395
  "train_batch_size": 4,
8396
  "trial_name": null,
8397
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9998140226892319,
5
  "eval_steps": 500,
6
+ "global_step": 1344,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
8372
  "learning_rate": 3.498172085070084e-06,
8373
  "loss": 9.5849,
8374
  "step": 1195
8375
+ },
8376
+ {
8377
+ "epoch": 0.8897154547145248,
8378
+ "grad_norm": 3.594160795211792,
8379
+ "learning_rate": 3.4519204341786902e-06,
8380
+ "loss": 9.5811,
8381
+ "step": 1196
8382
+ },
8383
+ {
8384
+ "epoch": 0.8904593639575972,
8385
+ "grad_norm": 3.5941364765167236,
8386
+ "learning_rate": 3.405965649329046e-06,
8387
+ "loss": 9.5818,
8388
+ "step": 1197
8389
+ },
8390
+ {
8391
+ "epoch": 0.8912032732006695,
8392
+ "grad_norm": 3.5020856857299805,
8393
+ "learning_rate": 3.3603080236033867e-06,
8394
+ "loss": 9.5884,
8395
+ "step": 1198
8396
+ },
8397
+ {
8398
+ "epoch": 0.8919471824437418,
8399
+ "grad_norm": 3.4808876514434814,
8400
+ "learning_rate": 3.3149478481887598e-06,
8401
+ "loss": 9.5994,
8402
+ "step": 1199
8403
+ },
8404
+ {
8405
+ "epoch": 0.8926910916868142,
8406
+ "grad_norm": 3.767507553100586,
8407
+ "learning_rate": 3.269885412375223e-06,
8408
+ "loss": 9.5746,
8409
+ "step": 1200
8410
+ },
8411
+ {
8412
+ "epoch": 0.8934350009298866,
8413
+ "grad_norm": 3.547511339187622,
8414
+ "learning_rate": 3.2251210035539323e-06,
8415
+ "loss": 9.5839,
8416
+ "step": 1201
8417
+ },
8418
+ {
8419
+ "epoch": 0.8941789101729589,
8420
+ "grad_norm": 3.767230272293091,
8421
+ "learning_rate": 3.1806549072153635e-06,
8422
+ "loss": 9.574,
8423
+ "step": 1202
8424
+ },
8425
+ {
8426
+ "epoch": 0.8949228194160312,
8427
+ "grad_norm": 3.5477194786071777,
8428
+ "learning_rate": 3.1364874069474527e-06,
8429
+ "loss": 9.5837,
8430
+ "step": 1203
8431
+ },
8432
+ {
8433
+ "epoch": 0.8956667286591036,
8434
+ "grad_norm": 3.76715350151062,
8435
+ "learning_rate": 3.0926187844337984e-06,
8436
+ "loss": 9.5745,
8437
+ "step": 1204
8438
+ },
8439
+ {
8440
+ "epoch": 0.8964106379021759,
8441
+ "grad_norm": 3.7672533988952637,
8442
+ "learning_rate": 3.0490493194518855e-06,
8443
+ "loss": 9.5739,
8444
+ "step": 1205
8445
+ },
8446
+ {
8447
+ "epoch": 0.8971545471452482,
8448
+ "grad_norm": 3.87873911857605,
8449
+ "learning_rate": 3.00577928987128e-06,
8450
+ "loss": 9.5715,
8451
+ "step": 1206
8452
+ },
8453
+ {
8454
+ "epoch": 0.8978984563883207,
8455
+ "grad_norm": 3.672361373901367,
8456
+ "learning_rate": 2.962808971651859e-06,
8457
+ "loss": 9.5768,
8458
+ "step": 1207
8459
+ },
8460
+ {
8461
+ "epoch": 0.898642365631393,
8462
+ "grad_norm": 3.688178539276123,
8463
+ "learning_rate": 2.920138638842068e-06,
8464
+ "loss": 9.5783,
8465
+ "step": 1208
8466
+ },
8467
+ {
8468
+ "epoch": 0.8993862748744653,
8469
+ "grad_norm": 3.767151355743408,
8470
+ "learning_rate": 2.8777685635771155e-06,
8471
+ "loss": 9.5734,
8472
+ "step": 1209
8473
+ },
8474
+ {
8475
+ "epoch": 0.9001301841175376,
8476
+ "grad_norm": 3.5939481258392334,
8477
+ "learning_rate": 2.835699016077353e-06,
8478
+ "loss": 9.5818,
8479
+ "step": 1210
8480
+ },
8481
+ {
8482
+ "epoch": 0.90087409336061,
8483
+ "grad_norm": 3.6881825923919678,
8484
+ "learning_rate": 2.793930264646405e-06,
8485
+ "loss": 9.5778,
8486
+ "step": 1211
8487
+ },
8488
+ {
8489
+ "epoch": 0.9016180026036823,
8490
+ "grad_norm": 3.60945463180542,
8491
+ "learning_rate": 2.7524625756695954e-06,
8492
+ "loss": 9.5813,
8493
+ "step": 1212
8494
+ },
8495
+ {
8496
+ "epoch": 0.9023619118467547,
8497
+ "grad_norm": 3.473451852798462,
8498
+ "learning_rate": 2.711296213612119e-06,
8499
+ "loss": 9.5915,
8500
+ "step": 1213
8501
+ },
8502
+ {
8503
+ "epoch": 0.9031058210898271,
8504
+ "grad_norm": 3.5476932525634766,
8505
+ "learning_rate": 2.6704314410174958e-06,
8506
+ "loss": 9.5854,
8507
+ "step": 1214
8508
+ },
8509
+ {
8510
+ "epoch": 0.9038497303328994,
8511
+ "grad_norm": 3.878340005874634,
8512
+ "learning_rate": 2.6298685185057735e-06,
8513
+ "loss": 9.5707,
8514
+ "step": 1215
8515
+ },
8516
+ {
8517
+ "epoch": 0.9045936395759717,
8518
+ "grad_norm": 3.878525972366333,
8519
+ "learning_rate": 2.5896077047719237e-06,
8520
+ "loss": 9.5708,
8521
+ "step": 1216
8522
+ },
8523
+ {
8524
+ "epoch": 0.905337548819044,
8525
+ "grad_norm": 3.5938403606414795,
8526
+ "learning_rate": 2.549649256584191e-06,
8527
+ "loss": 9.5824,
8528
+ "step": 1217
8529
+ },
8530
+ {
8531
+ "epoch": 0.9060814580621164,
8532
+ "grad_norm": 3.593989372253418,
8533
+ "learning_rate": 2.50999342878242e-06,
8534
+ "loss": 9.5813,
8535
+ "step": 1218
8536
+ },
8537
+ {
8538
+ "epoch": 0.9068253673051888,
8539
+ "grad_norm": 3.6723949909210205,
8540
+ "learning_rate": 2.470640474276509e-06,
8541
+ "loss": 9.5778,
8542
+ "step": 1219
8543
+ },
8544
+ {
8545
+ "epoch": 0.9075692765482611,
8546
+ "grad_norm": 3.5473990440368652,
8547
+ "learning_rate": 2.4315906440446955e-06,
8548
+ "loss": 9.5852,
8549
+ "step": 1220
8550
+ },
8551
+ {
8552
+ "epoch": 0.9083131857913335,
8553
+ "grad_norm": 3.5474307537078857,
8554
+ "learning_rate": 2.3928441871320263e-06,
8555
+ "loss": 9.5848,
8556
+ "step": 1221
8557
+ },
8558
+ {
8559
+ "epoch": 0.9090570950344058,
8560
+ "grad_norm": 3.767143964767456,
8561
+ "learning_rate": 2.3544013506487496e-06,
8562
+ "loss": 9.5743,
8563
+ "step": 1222
8564
+ },
8565
+ {
8566
+ "epoch": 0.9098010042774781,
8567
+ "grad_norm": 3.5938961505889893,
8568
+ "learning_rate": 2.3162623797687245e-06,
8569
+ "loss": 9.5823,
8570
+ "step": 1223
8571
+ },
8572
+ {
8573
+ "epoch": 0.9105449135205504,
8574
+ "grad_norm": 3.6094627380371094,
8575
+ "learning_rate": 2.2784275177278934e-06,
8576
+ "loss": 9.5804,
8577
+ "step": 1224
8578
+ },
8579
+ {
8580
+ "epoch": 0.9112888227636229,
8581
+ "grad_norm": 3.593992233276367,
8582
+ "learning_rate": 2.240897005822684e-06,
8583
+ "loss": 9.581,
8584
+ "step": 1225
8585
+ },
8586
+ {
8587
+ "epoch": 0.9120327320066952,
8588
+ "grad_norm": 3.5026135444641113,
8589
+ "learning_rate": 2.2036710834084986e-06,
8590
+ "loss": 9.5884,
8591
+ "step": 1226
8592
+ },
8593
+ {
8594
+ "epoch": 0.9127766412497675,
8595
+ "grad_norm": 3.4764721393585205,
8596
+ "learning_rate": 2.1667499878981866e-06,
8597
+ "loss": 9.595,
8598
+ "step": 1227
8599
+ },
8600
+ {
8601
+ "epoch": 0.9135205504928399,
8602
+ "grad_norm": 3.4730682373046875,
8603
+ "learning_rate": 2.130133954760538e-06,
8604
+ "loss": 9.5912,
8605
+ "step": 1228
8606
+ },
8607
+ {
8608
+ "epoch": 0.9142644597359122,
8609
+ "grad_norm": 3.5019729137420654,
8610
+ "learning_rate": 2.0938232175187645e-06,
8611
+ "loss": 9.5887,
8612
+ "step": 1229
8613
+ },
8614
+ {
8615
+ "epoch": 0.9150083689789845,
8616
+ "grad_norm": 3.6096534729003906,
8617
+ "learning_rate": 2.0578180077489904e-06,
8618
+ "loss": 9.5819,
8619
+ "step": 1230
8620
+ },
8621
+ {
8622
+ "epoch": 0.915752278222057,
8623
+ "grad_norm": 3.767604351043701,
8624
+ "learning_rate": 2.0221185550788335e-06,
8625
+ "loss": 9.5749,
8626
+ "step": 1231
8627
+ },
8628
+ {
8629
+ "epoch": 0.9164961874651293,
8630
+ "grad_norm": 3.5474653244018555,
8631
+ "learning_rate": 1.986725087185898e-06,
8632
+ "loss": 9.584,
8633
+ "step": 1232
8634
+ },
8635
+ {
8636
+ "epoch": 0.9172400967082016,
8637
+ "grad_norm": 3.4607582092285156,
8638
+ "learning_rate": 1.951637829796338e-06,
8639
+ "loss": 9.595,
8640
+ "step": 1233
8641
+ },
8642
+ {
8643
+ "epoch": 0.9179840059512739,
8644
+ "grad_norm": 3.7671072483062744,
8645
+ "learning_rate": 1.916857006683398e-06,
8646
+ "loss": 9.5752,
8647
+ "step": 1234
8648
+ },
8649
+ {
8650
+ "epoch": 0.9187279151943463,
8651
+ "grad_norm": 3.6726598739624023,
8652
+ "learning_rate": 1.8823828396660081e-06,
8653
+ "loss": 9.5787,
8654
+ "step": 1235
8655
+ },
8656
+ {
8657
+ "epoch": 0.9194718244374186,
8658
+ "grad_norm": 3.6729495525360107,
8659
+ "learning_rate": 1.8482155486073739e-06,
8660
+ "loss": 9.5783,
8661
+ "step": 1236
8662
+ },
8663
+ {
8664
+ "epoch": 0.920215733680491,
8665
+ "grad_norm": 3.879298448562622,
8666
+ "learning_rate": 1.814355351413538e-06,
8667
+ "loss": 9.5706,
8668
+ "step": 1237
8669
+ },
8670
+ {
8671
+ "epoch": 0.9209596429235634,
8672
+ "grad_norm": 3.6096079349517822,
8673
+ "learning_rate": 1.7808024640320498e-06,
8674
+ "loss": 9.5803,
8675
+ "step": 1238
8676
+ },
8677
+ {
8678
+ "epoch": 0.9217035521666357,
8679
+ "grad_norm": 3.5938854217529297,
8680
+ "learning_rate": 1.7475571004505087e-06,
8681
+ "loss": 9.5813,
8682
+ "step": 1239
8683
+ },
8684
+ {
8685
+ "epoch": 0.922447461409708,
8686
+ "grad_norm": 3.60953688621521,
8687
+ "learning_rate": 1.714619472695278e-06,
8688
+ "loss": 9.5811,
8689
+ "step": 1240
8690
+ },
8691
+ {
8692
+ "epoch": 0.9231913706527803,
8693
+ "grad_norm": 3.609628200531006,
8694
+ "learning_rate": 1.6819897908300853e-06,
8695
+ "loss": 9.5814,
8696
+ "step": 1241
8697
+ },
8698
+ {
8699
+ "epoch": 0.9239352798958527,
8700
+ "grad_norm": 3.5939552783966064,
8701
+ "learning_rate": 1.6496682629546956e-06,
8702
+ "loss": 9.5807,
8703
+ "step": 1242
8704
+ },
8705
+ {
8706
+ "epoch": 0.9246791891389251,
8707
+ "grad_norm": 3.501974582672119,
8708
+ "learning_rate": 1.6176550952035908e-06,
8709
+ "loss": 9.5882,
8710
+ "step": 1243
8711
+ },
8712
+ {
8713
+ "epoch": 0.9254230983819974,
8714
+ "grad_norm": 3.5018725395202637,
8715
+ "learning_rate": 1.5859504917446366e-06,
8716
+ "loss": 9.5878,
8717
+ "step": 1244
8718
+ },
8719
+ {
8720
+ "epoch": 0.9261670076250698,
8721
+ "grad_norm": 3.5475661754608154,
8722
+ "learning_rate": 1.554554654777801e-06,
8723
+ "loss": 9.5856,
8724
+ "step": 1245
8725
+ },
8726
+ {
8727
+ "epoch": 0.9269109168681421,
8728
+ "grad_norm": 3.53214430809021,
8729
+ "learning_rate": 1.5234677845338607e-06,
8730
+ "loss": 9.5855,
8731
+ "step": 1246
8732
+ },
8733
+ {
8734
+ "epoch": 0.9276548261112144,
8735
+ "grad_norm": 3.67262601852417,
8736
+ "learning_rate": 1.492690079273118e-06,
8737
+ "loss": 9.578,
8738
+ "step": 1247
8739
+ },
8740
+ {
8741
+ "epoch": 0.9283987353542867,
8742
+ "grad_norm": 3.5474252700805664,
8743
+ "learning_rate": 1.4622217352841138e-06,
8744
+ "loss": 9.5852,
8745
+ "step": 1248
8746
+ },
8747
+ {
8748
+ "epoch": 0.9291426445973591,
8749
+ "grad_norm": 3.6728267669677734,
8750
+ "learning_rate": 1.4320629468824286e-06,
8751
+ "loss": 9.5787,
8752
+ "step": 1249
8753
+ },
8754
+ {
8755
+ "epoch": 0.9298865538404315,
8756
+ "grad_norm": 3.53242564201355,
8757
+ "learning_rate": 1.4022139064094165e-06,
8758
+ "loss": 9.5855,
8759
+ "step": 1250
8760
+ },
8761
+ {
8762
+ "epoch": 0.9306304630835038,
8763
+ "grad_norm": 3.7670955657958984,
8764
+ "learning_rate": 1.3726748042309557e-06,
8765
+ "loss": 9.5742,
8766
+ "step": 1251
8767
+ },
8768
+ {
8769
+ "epoch": 0.9313743723265762,
8770
+ "grad_norm": 3.672420024871826,
8771
+ "learning_rate": 1.3434458287362672e-06,
8772
+ "loss": 9.5771,
8773
+ "step": 1252
8774
+ },
8775
+ {
8776
+ "epoch": 0.9321182815696485,
8777
+ "grad_norm": 3.8784019947052,
8778
+ "learning_rate": 1.3145271663366932e-06,
8779
+ "loss": 9.5716,
8780
+ "step": 1253
8781
+ },
8782
+ {
8783
+ "epoch": 0.9328621908127208,
8784
+ "grad_norm": 3.6880033016204834,
8785
+ "learning_rate": 1.2859190014645305e-06,
8786
+ "loss": 9.5779,
8787
+ "step": 1254
8788
+ },
8789
+ {
8790
+ "epoch": 0.9336061000557931,
8791
+ "grad_norm": 3.767427682876587,
8792
+ "learning_rate": 1.257621516571822e-06,
8793
+ "loss": 9.5742,
8794
+ "step": 1255
8795
+ },
8796
+ {
8797
+ "epoch": 0.9343500092988656,
8798
+ "grad_norm": 3.878462314605713,
8799
+ "learning_rate": 1.2296348921292333e-06,
8800
+ "loss": 9.57,
8801
+ "step": 1256
8802
+ },
8803
+ {
8804
+ "epoch": 0.9350939185419379,
8805
+ "grad_norm": 3.6724817752838135,
8806
+ "learning_rate": 1.2019593066248614e-06,
8807
+ "loss": 9.5777,
8808
+ "step": 1257
8809
+ },
8810
+ {
8811
+ "epoch": 0.9358378277850102,
8812
+ "grad_norm": 3.5474514961242676,
8813
+ "learning_rate": 1.1745949365631115e-06,
8814
+ "loss": 9.5851,
8815
+ "step": 1258
8816
+ },
8817
+ {
8818
+ "epoch": 0.9365817370280826,
8819
+ "grad_norm": 4.006692409515381,
8820
+ "learning_rate": 1.1475419564636048e-06,
8821
+ "loss": 9.568,
8822
+ "step": 1259
8823
+ },
8824
+ {
8825
+ "epoch": 0.9373256462711549,
8826
+ "grad_norm": 3.672508478164673,
8827
+ "learning_rate": 1.120800538859995e-06,
8828
+ "loss": 9.5789,
8829
+ "step": 1260
8830
+ },
8831
+ {
8832
+ "epoch": 0.9380695555142272,
8833
+ "grad_norm": 3.5939109325408936,
8834
+ "learning_rate": 1.0943708542989372e-06,
8835
+ "loss": 9.5801,
8836
+ "step": 1261
8837
+ },
8838
+ {
8839
+ "epoch": 0.9388134647572997,
8840
+ "grad_norm": 3.7673163414001465,
8841
+ "learning_rate": 1.0682530713389482e-06,
8842
+ "loss": 9.5732,
8843
+ "step": 1262
8844
+ },
8845
+ {
8846
+ "epoch": 0.939557374000372,
8847
+ "grad_norm": 3.593905210494995,
8848
+ "learning_rate": 1.042447356549381e-06,
8849
+ "loss": 9.5818,
8850
+ "step": 1263
8851
+ },
8852
+ {
8853
+ "epoch": 0.9403012832434443,
8854
+ "grad_norm": 3.6882522106170654,
8855
+ "learning_rate": 1.0169538745093242e-06,
8856
+ "loss": 9.578,
8857
+ "step": 1264
8858
+ },
8859
+ {
8860
+ "epoch": 0.9410451924865166,
8861
+ "grad_norm": 3.8786656856536865,
8862
+ "learning_rate": 9.917727878065497e-07,
8863
+ "loss": 9.5709,
8864
+ "step": 1265
8865
+ },
8866
+ {
8867
+ "epoch": 0.941789101729589,
8868
+ "grad_norm": 3.5021424293518066,
8869
+ "learning_rate": 9.66904257036505e-07,
8870
+ "loss": 9.588,
8871
+ "step": 1266
8872
+ },
8873
+ {
8874
+ "epoch": 0.9425330109726613,
8875
+ "grad_norm": 3.609473466873169,
8876
+ "learning_rate": 9.423484408012717e-07,
8877
+ "loss": 9.581,
8878
+ "step": 1267
8879
+ },
8880
+ {
8881
+ "epoch": 0.9432769202157337,
8882
+ "grad_norm": 3.672401189804077,
8883
+ "learning_rate": 9.1810549570856e-07,
8884
+ "loss": 9.5778,
8885
+ "step": 1268
8886
+ },
8887
+ {
8888
+ "epoch": 0.944020829458806,
8889
+ "grad_norm": 3.672403573989868,
8890
+ "learning_rate": 8.941755763707038e-07,
8891
+ "loss": 9.5776,
8892
+ "step": 1269
8893
+ },
8894
+ {
8895
+ "epoch": 0.9447647387018784,
8896
+ "grad_norm": 3.5481207370758057,
8897
+ "learning_rate": 8.705588354036676e-07,
8898
+ "loss": 9.5856,
8899
+ "step": 1270
8900
+ },
8901
+ {
8902
+ "epoch": 0.9455086479449507,
8903
+ "grad_norm": 3.531853199005127,
8904
+ "learning_rate": 8.472554234260965e-07,
8905
+ "loss": 9.5838,
8906
+ "step": 1271
8907
+ },
8908
+ {
8909
+ "epoch": 0.946252557188023,
8910
+ "grad_norm": 3.502507209777832,
8911
+ "learning_rate": 8.24265489058329e-07,
8912
+ "loss": 9.5878,
8913
+ "step": 1272
8914
+ },
8915
+ {
8916
+ "epoch": 0.9469964664310954,
8917
+ "grad_norm": 3.8787922859191895,
8918
+ "learning_rate": 8.015891789214803e-07,
8919
+ "loss": 9.5708,
8920
+ "step": 1273
8921
+ },
8922
+ {
8923
+ "epoch": 0.9477403756741678,
8924
+ "grad_norm": 3.5023539066314697,
8925
+ "learning_rate": 7.79226637636471e-07,
8926
+ "loss": 9.5884,
8927
+ "step": 1274
8928
+ },
8929
+ {
8930
+ "epoch": 0.9484842849172401,
8931
+ "grad_norm": 3.7672739028930664,
8932
+ "learning_rate": 7.571780078231283e-07,
8933
+ "loss": 9.574,
8934
+ "step": 1275
8935
+ },
8936
+ {
8937
+ "epoch": 0.9492281941603125,
8938
+ "grad_norm": 3.879286289215088,
8939
+ "learning_rate": 7.354434300992752e-07,
8940
+ "loss": 9.5714,
8941
+ "step": 1276
8942
+ },
8943
+ {
8944
+ "epoch": 0.9499721034033848,
8945
+ "grad_norm": 3.501861810684204,
8946
+ "learning_rate": 7.140230430798256e-07,
8947
+ "loss": 9.5876,
8948
+ "step": 1277
8949
+ },
8950
+ {
8951
+ "epoch": 0.9507160126464571,
8952
+ "grad_norm": 3.7678983211517334,
8953
+ "learning_rate": 6.929169833759075e-07,
8954
+ "loss": 9.5754,
8955
+ "step": 1278
8956
+ },
8957
+ {
8958
+ "epoch": 0.9514599218895294,
8959
+ "grad_norm": 3.502337694168091,
8960
+ "learning_rate": 6.721253855939746e-07,
8961
+ "loss": 9.5883,
8962
+ "step": 1279
8963
+ },
8964
+ {
8965
+ "epoch": 0.9522038311326019,
8966
+ "grad_norm": 3.501981258392334,
8967
+ "learning_rate": 6.516483823349795e-07,
8968
+ "loss": 9.5874,
8969
+ "step": 1280
8970
+ },
8971
+ {
8972
+ "epoch": 0.9529477403756742,
8973
+ "grad_norm": 3.472780704498291,
8974
+ "learning_rate": 6.314861041935016e-07,
8975
+ "loss": 9.5914,
8976
+ "step": 1281
8977
+ },
8978
+ {
8979
+ "epoch": 0.9536916496187465,
8980
+ "grad_norm": 3.76802921295166,
8981
+ "learning_rate": 6.116386797569207e-07,
8982
+ "loss": 9.5749,
8983
+ "step": 1282
8984
+ },
8985
+ {
8986
+ "epoch": 0.9544355588618189,
8987
+ "grad_norm": 3.547377824783325,
8988
+ "learning_rate": 5.921062356046058e-07,
8989
+ "loss": 9.5844,
8990
+ "step": 1283
8991
+ },
8992
+ {
8993
+ "epoch": 0.9551794681048912,
8994
+ "grad_norm": 3.672638177871704,
8995
+ "learning_rate": 5.728888963070945e-07,
8996
+ "loss": 9.578,
8997
+ "step": 1284
8998
+ },
8999
+ {
9000
+ "epoch": 0.9559233773479635,
9001
+ "grad_norm": 3.672529458999634,
9002
+ "learning_rate": 5.539867844253033e-07,
9003
+ "loss": 9.5785,
9004
+ "step": 1285
9005
+ },
9006
+ {
9007
+ "epoch": 0.956667286591036,
9008
+ "grad_norm": 3.531773328781128,
9009
+ "learning_rate": 5.35400020509752e-07,
9010
+ "loss": 9.5858,
9011
+ "step": 1286
9012
+ },
9013
+ {
9014
+ "epoch": 0.9574111958341083,
9015
+ "grad_norm": 3.609574556350708,
9016
+ "learning_rate": 5.171287230997968e-07,
9017
+ "loss": 9.5813,
9018
+ "step": 1287
9019
+ },
9020
+ {
9021
+ "epoch": 0.9581551050771806,
9022
+ "grad_norm": 3.6094818115234375,
9023
+ "learning_rate": 4.99173008722853e-07,
9024
+ "loss": 9.5812,
9025
+ "step": 1288
9026
+ },
9027
+ {
9028
+ "epoch": 0.9588990143202529,
9029
+ "grad_norm": 3.7675580978393555,
9030
+ "learning_rate": 4.815329918936684e-07,
9031
+ "loss": 9.5752,
9032
+ "step": 1289
9033
+ },
9034
+ {
9035
+ "epoch": 0.9596429235633253,
9036
+ "grad_norm": 3.5020837783813477,
9037
+ "learning_rate": 4.642087851136123e-07,
9038
+ "loss": 9.5884,
9039
+ "step": 1290
9040
+ },
9041
+ {
9042
+ "epoch": 0.9603868328063976,
9043
+ "grad_norm": 3.593885660171509,
9044
+ "learning_rate": 4.472004988699041e-07,
9045
+ "loss": 9.5817,
9046
+ "step": 1291
9047
+ },
9048
+ {
9049
+ "epoch": 0.9611307420494699,
9050
+ "grad_norm": 3.547576665878296,
9051
+ "learning_rate": 4.305082416349804e-07,
9052
+ "loss": 9.5845,
9053
+ "step": 1292
9054
+ },
9055
+ {
9056
+ "epoch": 0.9618746512925423,
9057
+ "grad_norm": 3.6724321842193604,
9058
+ "learning_rate": 4.141321198657178e-07,
9059
+ "loss": 9.5779,
9060
+ "step": 1293
9061
+ },
9062
+ {
9063
+ "epoch": 0.9626185605356147,
9064
+ "grad_norm": 3.5321435928344727,
9065
+ "learning_rate": 3.9807223800284453e-07,
9066
+ "loss": 9.5845,
9067
+ "step": 1294
9068
+ },
9069
+ {
9070
+ "epoch": 0.963362469778687,
9071
+ "grad_norm": 3.594072103500366,
9072
+ "learning_rate": 3.823286984702079e-07,
9073
+ "loss": 9.5809,
9074
+ "step": 1295
9075
+ },
9076
+ {
9077
+ "epoch": 0.9641063790217593,
9078
+ "grad_norm": 3.4732275009155273,
9079
+ "learning_rate": 3.6690160167413554e-07,
9080
+ "loss": 9.5921,
9081
+ "step": 1296
9082
+ },
9083
+ {
9084
+ "epoch": 0.9648502882648317,
9085
+ "grad_norm": 3.473226547241211,
9086
+ "learning_rate": 3.5179104600283617e-07,
9087
+ "loss": 9.5916,
9088
+ "step": 1297
9089
+ },
9090
+ {
9091
+ "epoch": 0.965594197507904,
9092
+ "grad_norm": 3.5020735263824463,
9093
+ "learning_rate": 3.3699712782569446e-07,
9094
+ "loss": 9.5888,
9095
+ "step": 1298
9096
+ },
9097
+ {
9098
+ "epoch": 0.9663381067509764,
9099
+ "grad_norm": 3.6096696853637695,
9100
+ "learning_rate": 3.2251994149273823e-07,
9101
+ "loss": 9.5813,
9102
+ "step": 1299
9103
+ },
9104
+ {
9105
+ "epoch": 0.9670820159940487,
9106
+ "grad_norm": 3.5941600799560547,
9107
+ "learning_rate": 3.0835957933397773e-07,
9108
+ "loss": 9.5805,
9109
+ "step": 1300
9110
+ },
9111
+ {
9112
+ "epoch": 0.9678259252371211,
9113
+ "grad_norm": 3.672405481338501,
9114
+ "learning_rate": 2.945161316588563e-07,
9115
+ "loss": 9.5779,
9116
+ "step": 1301
9117
+ },
9118
+ {
9119
+ "epoch": 0.9685698344801934,
9120
+ "grad_norm": 4.005847930908203,
9121
+ "learning_rate": 2.809896867556283e-07,
9122
+ "loss": 9.567,
9123
+ "step": 1302
9124
+ },
9125
+ {
9126
+ "epoch": 0.9693137437232657,
9127
+ "grad_norm": 3.501986503601074,
9128
+ "learning_rate": 2.677803308908544e-07,
9129
+ "loss": 9.5874,
9130
+ "step": 1303
9131
+ },
9132
+ {
9133
+ "epoch": 0.9700576529663381,
9134
+ "grad_norm": 3.6726467609405518,
9135
+ "learning_rate": 2.548881483088128e-07,
9136
+ "loss": 9.5771,
9137
+ "step": 1304
9138
+ },
9139
+ {
9140
+ "epoch": 0.9708015622094105,
9141
+ "grad_norm": 4.005867958068848,
9142
+ "learning_rate": 2.423132212309609e-07,
9143
+ "loss": 9.5669,
9144
+ "step": 1305
9145
+ },
9146
+ {
9147
+ "epoch": 0.9715454714524828,
9148
+ "grad_norm": 3.609499931335449,
9149
+ "learning_rate": 2.3005562985542462e-07,
9150
+ "loss": 9.5804,
9151
+ "step": 1306
9152
+ },
9153
+ {
9154
+ "epoch": 0.9722893806955551,
9155
+ "grad_norm": 3.609724521636963,
9156
+ "learning_rate": 2.1811545235648212e-07,
9157
+ "loss": 9.581,
9158
+ "step": 1307
9159
+ },
9160
+ {
9161
+ "epoch": 0.9730332899386275,
9162
+ "grad_norm": 3.7671802043914795,
9163
+ "learning_rate": 2.0649276488408086e-07,
9164
+ "loss": 9.5743,
9165
+ "step": 1308
9166
+ },
9167
+ {
9168
+ "epoch": 0.9737771991816998,
9169
+ "grad_norm": 3.547499179840088,
9170
+ "learning_rate": 1.9518764156331027e-07,
9171
+ "loss": 9.5849,
9172
+ "step": 1309
9173
+ },
9174
+ {
9175
+ "epoch": 0.9745211084247721,
9176
+ "grad_norm": 3.6095187664031982,
9177
+ "learning_rate": 1.842001544939742e-07,
9178
+ "loss": 9.5813,
9179
+ "step": 1310
9180
+ },
9181
+ {
9182
+ "epoch": 0.9752650176678446,
9183
+ "grad_norm": 3.6724700927734375,
9184
+ "learning_rate": 1.735303737501137e-07,
9185
+ "loss": 9.5773,
9186
+ "step": 1311
9187
+ },
9188
+ {
9189
+ "epoch": 0.9760089269109169,
9190
+ "grad_norm": 3.609522581100464,
9191
+ "learning_rate": 1.6317836737955172e-07,
9192
+ "loss": 9.581,
9193
+ "step": 1312
9194
+ },
9195
+ {
9196
+ "epoch": 0.9767528361539892,
9197
+ "grad_norm": 3.767378330230713,
9198
+ "learning_rate": 1.5314420140346564e-07,
9199
+ "loss": 9.5747,
9200
+ "step": 1313
9201
+ },
9202
+ {
9203
+ "epoch": 0.9774967453970616,
9204
+ "grad_norm": 3.502347469329834,
9205
+ "learning_rate": 1.4342793981597103e-07,
9206
+ "loss": 9.5884,
9207
+ "step": 1314
9208
+ },
9209
+ {
9210
+ "epoch": 0.9782406546401339,
9211
+ "grad_norm": 3.6725053787231445,
9212
+ "learning_rate": 1.340296445837108e-07,
9213
+ "loss": 9.5769,
9214
+ "step": 1315
9215
+ },
9216
+ {
9217
+ "epoch": 0.9789845638832062,
9218
+ "grad_norm": 3.672431707382202,
9219
+ "learning_rate": 1.2494937564545562e-07,
9220
+ "loss": 9.5781,
9221
+ "step": 1316
9222
+ },
9223
+ {
9224
+ "epoch": 0.9797284731262786,
9225
+ "grad_norm": 3.5474746227264404,
9226
+ "learning_rate": 1.161871909117207e-07,
9227
+ "loss": 9.5839,
9228
+ "step": 1317
9229
+ },
9230
+ {
9231
+ "epoch": 0.980472382369351,
9232
+ "grad_norm": 3.6726107597351074,
9233
+ "learning_rate": 1.0774314626441628e-07,
9234
+ "loss": 9.5783,
9235
+ "step": 1318
9236
+ },
9237
+ {
9238
+ "epoch": 0.9812162916124233,
9239
+ "grad_norm": 3.4735405445098877,
9240
+ "learning_rate": 9.961729555645338e-08,
9241
+ "loss": 9.5917,
9242
+ "step": 1319
9243
+ },
9244
+ {
9245
+ "epoch": 0.9819602008554956,
9246
+ "grad_norm": 3.5478289127349854,
9247
+ "learning_rate": 9.180969061143852e-08,
9248
+ "loss": 9.5844,
9249
+ "step": 1320
9250
+ },
9251
+ {
9252
+ "epoch": 0.982704110098568,
9253
+ "grad_norm": 3.547591209411621,
9254
+ "learning_rate": 8.432038122331842e-08,
9255
+ "loss": 9.5842,
9256
+ "step": 1321
9257
+ },
9258
+ {
9259
+ "epoch": 0.9834480193416403,
9260
+ "grad_norm": 3.6724138259887695,
9261
+ "learning_rate": 7.714941515608587e-08,
9262
+ "loss": 9.5777,
9263
+ "step": 1322
9264
+ },
9265
+ {
9266
+ "epoch": 0.9841919285847127,
9267
+ "grad_norm": 3.7674906253814697,
9268
+ "learning_rate": 7.029683814343547e-08,
9269
+ "loss": 9.5748,
9270
+ "step": 1323
9271
+ },
9272
+ {
9273
+ "epoch": 0.984935837827785,
9274
+ "grad_norm": 3.767176866531372,
9275
+ "learning_rate": 6.376269388852496e-08,
9276
+ "loss": 9.5745,
9277
+ "step": 1324
9278
+ },
9279
+ {
9280
+ "epoch": 0.9856797470708574,
9281
+ "grad_norm": 3.593855857849121,
9282
+ "learning_rate": 5.7547024063642204e-08,
9283
+ "loss": 9.5817,
9284
+ "step": 1325
9285
+ },
9286
+ {
9287
+ "epoch": 0.9864236563139297,
9288
+ "grad_norm": 3.547398567199707,
9289
+ "learning_rate": 5.164986830998308e-08,
9290
+ "loss": 9.5846,
9291
+ "step": 1326
9292
+ },
9293
+ {
9294
+ "epoch": 0.987167565557002,
9295
+ "grad_norm": 3.4740869998931885,
9296
+ "learning_rate": 4.607126423737951e-08,
9297
+ "loss": 9.591,
9298
+ "step": 1327
9299
+ },
9300
+ {
9301
+ "epoch": 0.9879114748000744,
9302
+ "grad_norm": 3.5474886894226074,
9303
+ "learning_rate": 4.0811247424049625e-08,
9304
+ "loss": 9.5845,
9305
+ "step": 1328
9306
+ },
9307
+ {
9308
+ "epoch": 0.9886553840431468,
9309
+ "grad_norm": 3.67246150970459,
9310
+ "learning_rate": 3.586985141639798e-08,
9311
+ "loss": 9.5775,
9312
+ "step": 1329
9313
+ },
9314
+ {
9315
+ "epoch": 0.9893992932862191,
9316
+ "grad_norm": 3.4746315479278564,
9317
+ "learning_rate": 3.124710772877682e-08,
9318
+ "loss": 9.5916,
9319
+ "step": 1330
9320
+ },
9321
+ {
9322
+ "epoch": 0.9901432025292914,
9323
+ "grad_norm": 3.5024232864379883,
9324
+ "learning_rate": 2.694304584329732e-08,
9325
+ "loss": 9.5883,
9326
+ "step": 1331
9327
+ },
9328
+ {
9329
+ "epoch": 0.9908871117723638,
9330
+ "grad_norm": 3.8790781497955322,
9331
+ "learning_rate": 2.2957693209635368e-08,
9332
+ "loss": 9.5715,
9333
+ "step": 1332
9334
+ },
9335
+ {
9336
+ "epoch": 0.9916310210154361,
9337
+ "grad_norm": 3.6882379055023193,
9338
+ "learning_rate": 1.9291075244864954e-08,
9339
+ "loss": 9.5784,
9340
+ "step": 1333
9341
+ },
9342
+ {
9343
+ "epoch": 0.9923749302585084,
9344
+ "grad_norm": 3.7677388191223145,
9345
+ "learning_rate": 1.594321533328058e-08,
9346
+ "loss": 9.5748,
9347
+ "step": 1334
9348
+ },
9349
+ {
9350
+ "epoch": 0.9931188395015808,
9351
+ "grad_norm": 3.5019233226776123,
9352
+ "learning_rate": 1.2914134826280677e-08,
9353
+ "loss": 9.5891,
9354
+ "step": 1335
9355
+ },
9356
+ {
9357
+ "epoch": 0.9938627487446532,
9358
+ "grad_norm": 3.5015294551849365,
9359
+ "learning_rate": 1.0203853042184407e-08,
9360
+ "loss": 9.588,
9361
+ "step": 1336
9362
+ },
9363
+ {
9364
+ "epoch": 0.9946066579877255,
9365
+ "grad_norm": 3.5472798347473145,
9366
+ "learning_rate": 7.812387266142862e-09,
9367
+ "loss": 9.5846,
9368
+ "step": 1337
9369
+ },
9370
+ {
9371
+ "epoch": 0.9953505672307978,
9372
+ "grad_norm": 3.5023794174194336,
9373
+ "learning_rate": 5.7397527500224755e-09,
9374
+ "loss": 9.5886,
9375
+ "step": 1338
9376
+ },
9377
+ {
9378
+ "epoch": 0.9960944764738702,
9379
+ "grad_norm": 3.5475242137908936,
9380
+ "learning_rate": 3.985962712310665e-09,
9381
+ "loss": 9.5851,
9382
+ "step": 1339
9383
+ },
9384
+ {
9385
+ "epoch": 0.9968383857169425,
9386
+ "grad_norm": 3.5319745540618896,
9387
+ "learning_rate": 2.5510283379992504e-09,
9388
+ "loss": 9.5847,
9389
+ "step": 1340
9390
+ },
9391
+ {
9392
+ "epoch": 0.9975822949600148,
9393
+ "grad_norm": 3.4726641178131104,
9394
+ "learning_rate": 1.4349587785733586e-09,
9395
+ "loss": 9.5916,
9396
+ "step": 1341
9397
+ },
9398
+ {
9399
+ "epoch": 0.9983262042030873,
9400
+ "grad_norm": 3.768148422241211,
9401
+ "learning_rate": 6.377611518948446e-10,
9402
+ "loss": 9.5752,
9403
+ "step": 1342
9404
+ },
9405
+ {
9406
+ "epoch": 0.9990701134461596,
9407
+ "grad_norm": 3.688249111175537,
9408
+ "learning_rate": 1.594405421856404e-10,
9409
+ "loss": 9.578,
9410
+ "step": 1343
9411
+ },
9412
+ {
9413
+ "epoch": 0.9998140226892319,
9414
+ "grad_norm": 3.5478217601776123,
9415
+ "learning_rate": 0.0,
9416
+ "loss": 9.5859,
9417
+ "step": 1344
9418
  }
9419
  ],
9420
  "logging_steps": 1,
 
9429
  "should_evaluate": false,
9430
  "should_log": false,
9431
  "should_save": true,
9432
+ "should_training_stop": true
9433
  },
9434
  "attributes": {}
9435
  }
9436
  },
9437
+ "total_flos": 672079401713664.0,
9438
  "train_batch_size": 4,
9439
  "trial_name": null,
9440
  "trial_params": null