antonpolishko commited on
Commit
66e34d6
·
verified ·
1 Parent(s): 35ef4ce

Training in progress, epoch 3, checkpoint

Browse files
last-checkpoint/model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e7bd1801d0d4245d02fcc221540d160d9ca4436762abd40ab696436db37997
3
  size 4949453792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cb217517103aa1f9b67671200e1088c82e998f977765551951ecbd6cacb9fc3
3
  size 4949453792
last-checkpoint/model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fac833f220a4de94d80f1c6128e85dbd0534b37e1778d74f64d089406b9a9cd
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d306a9a8a0ee55b4dfde88bec95cd9d2efca13601a22479321364d39624178e8
3
  size 4999819336
last-checkpoint/model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5d6ebf26b088bbc6f219afaf49ff803ae69e0485761a48a6794d7aaaccc4eb7
3
  size 4546807800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9842fedf4985d9dd12c414781f60bc29e4a8029b6b63daa86ef5332b34e099e7
3
  size 4546807800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84bf6c04da13948bebd4d30d4b13cd682886c3f4762b4f85e343d6b5fe8ad40e
3
  size 28992348490
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09fbba3766d6cb31a87875e81304d345ce1b8197cb449721c0f15cd8e1db6251
3
  size 28992348490
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06fea830cf5ad73ec00d500ea6fb952740ac936f18e93fa2d32abde1ea3ead92
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb18ac8d6db3307b1c242f7cb069fc8b8dab957434ddfcafcac997cfd6a43abf
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be561d1df19be227394d8ea607c54262a06c9bf880af0aa5e04a52596a2a6cb0
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bdab708057b5f34a402d9a2b4443f5f93a8e8ee2ddb66d955f0a15ad394ecc5
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03f3e24417a59435f5a8450a4aeb0f09cc92734b5c3b45a0701b2c043c415c05
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:599882a30c163a5a2a000c4e74b320ecc4a55aa1b079882fd66aa3d2559d19e7
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bea02744c29f30024590ab1629a0e7b7dabbf1e8476456c2e7c5ce46dc35c28
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:567c3b482c209c2778fc017e39a38642c488edda20673ef29f571ef7177ad81e
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:041be966454b60c86af576fc1eb7f34189114689abff8f9622b947110f7334c8
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f9ffe9a916e778423aaed4ec842923c9ccfdd3d7a4fbad10dc6a3bfc278fb8e
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b85766f6596d15a810177d77dd259d9b50588cf100ec5f8ebff5fed881d57957
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ede8a81aa3c780fb9c3cb57537752a782c4aed1dcecb7aafd6ca5a7ea90252
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8be75d04b1ebe614241b88fd010a5dda1b7bf703c00c6ebe310ca07975830fe7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b1c5c0c0afa907d332467e631e6cee80ba476689aa0caa77689ca273d83b3e4
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4699833a7ab4cb692996ef7567f934c0bac79d6a067963a873f89a38e412bd48
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73025ac422abb13303ee974109cf39f6f848de7f7013e828d04aa4e2ec0e6757
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a23384b1a4df8f5cde36ecc73a9742d7fae5940c0f154d6cc580286c571d0dba
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dbc6caf624eba5c924d315b74568bd01ca1fccdd670b0ff8efdc24821d15151
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
  "eval_steps": 300,
6
- "global_step": 962,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1382,6 +1382,686 @@
1382
  "learning_rate": 5.050432566228552e-07,
1383
  "loss": 1.1831,
1384
  "step": 960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1385
  }
1386
  ],
1387
  "logging_steps": 5,
@@ -1396,12 +2076,12 @@
1396
  "should_evaluate": false,
1397
  "should_log": false,
1398
  "should_save": true,
1399
- "should_training_stop": false
1400
  },
1401
  "attributes": {}
1402
  }
1403
  },
1404
- "total_flos": 5.381932813203276e+18,
1405
  "train_batch_size": 8,
1406
  "trial_name": null,
1407
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 300,
6
+ "global_step": 1443,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1382
  "learning_rate": 5.050432566228552e-07,
1383
  "loss": 1.1831,
1384
  "step": 960
1385
+ },
1386
+ {
1387
+ "epoch": 2.006237006237006,
1388
+ "grad_norm": 3.609375,
1389
+ "learning_rate": 4.956010076471065e-07,
1390
+ "loss": 1.1592,
1391
+ "step": 965
1392
+ },
1393
+ {
1394
+ "epoch": 2.0166320166320166,
1395
+ "grad_norm": 3.53125,
1396
+ "learning_rate": 4.862186938359441e-07,
1397
+ "loss": 1.1667,
1398
+ "step": 970
1399
+ },
1400
+ {
1401
+ "epoch": 2.027027027027027,
1402
+ "grad_norm": 3.5625,
1403
+ "learning_rate": 4.768974300419573e-07,
1404
+ "loss": 1.1678,
1405
+ "step": 975
1406
+ },
1407
+ {
1408
+ "epoch": 2.0374220374220373,
1409
+ "grad_norm": 3.625,
1410
+ "learning_rate": 4.6763832386347214e-07,
1411
+ "loss": 1.1723,
1412
+ "step": 980
1413
+ },
1414
+ {
1415
+ "epoch": 2.047817047817048,
1416
+ "grad_norm": 3.578125,
1417
+ "learning_rate": 4.5844247551294224e-07,
1418
+ "loss": 1.1642,
1419
+ "step": 985
1420
+ },
1421
+ {
1422
+ "epoch": 2.0582120582120584,
1423
+ "grad_norm": 3.609375,
1424
+ "learning_rate": 4.493109776862143e-07,
1425
+ "loss": 1.17,
1426
+ "step": 990
1427
+ },
1428
+ {
1429
+ "epoch": 2.0686070686070686,
1430
+ "grad_norm": 3.640625,
1431
+ "learning_rate": 4.402449154326913e-07,
1432
+ "loss": 1.1651,
1433
+ "step": 995
1434
+ },
1435
+ {
1436
+ "epoch": 2.079002079002079,
1437
+ "grad_norm": 3.640625,
1438
+ "learning_rate": 4.312453660263987e-07,
1439
+ "loss": 1.1783,
1440
+ "step": 1000
1441
+ },
1442
+ {
1443
+ "epoch": 2.0893970893970892,
1444
+ "grad_norm": 3.578125,
1445
+ "learning_rate": 4.2231339883798025e-07,
1446
+ "loss": 1.1711,
1447
+ "step": 1005
1448
+ },
1449
+ {
1450
+ "epoch": 2.0997920997921,
1451
+ "grad_norm": 3.5625,
1452
+ "learning_rate": 4.13450075207628e-07,
1453
+ "loss": 1.163,
1454
+ "step": 1010
1455
+ },
1456
+ {
1457
+ "epoch": 2.1101871101871104,
1458
+ "grad_norm": 3.515625,
1459
+ "learning_rate": 4.0465644831897006e-07,
1460
+ "loss": 1.17,
1461
+ "step": 1015
1462
+ },
1463
+ {
1464
+ "epoch": 2.1205821205821205,
1465
+ "grad_norm": 3.5625,
1466
+ "learning_rate": 3.9593356307392436e-07,
1467
+ "loss": 1.1733,
1468
+ "step": 1020
1469
+ },
1470
+ {
1471
+ "epoch": 2.130977130977131,
1472
+ "grad_norm": 3.59375,
1473
+ "learning_rate": 3.872824559685409e-07,
1474
+ "loss": 1.1762,
1475
+ "step": 1025
1476
+ },
1477
+ {
1478
+ "epoch": 2.141372141372141,
1479
+ "grad_norm": 3.671875,
1480
+ "learning_rate": 3.7870415496983743e-07,
1481
+ "loss": 1.1734,
1482
+ "step": 1030
1483
+ },
1484
+ {
1485
+ "epoch": 2.1517671517671517,
1486
+ "grad_norm": 3.578125,
1487
+ "learning_rate": 3.701996793936535e-07,
1488
+ "loss": 1.1724,
1489
+ "step": 1035
1490
+ },
1491
+ {
1492
+ "epoch": 2.1621621621621623,
1493
+ "grad_norm": 3.5625,
1494
+ "learning_rate": 3.6177003978352917e-07,
1495
+ "loss": 1.1718,
1496
+ "step": 1040
1497
+ },
1498
+ {
1499
+ "epoch": 2.1725571725571724,
1500
+ "grad_norm": 3.5625,
1501
+ "learning_rate": 3.5341623779062813e-07,
1502
+ "loss": 1.1688,
1503
+ "step": 1045
1504
+ },
1505
+ {
1506
+ "epoch": 2.182952182952183,
1507
+ "grad_norm": 3.609375,
1508
+ "learning_rate": 3.45139266054715e-07,
1509
+ "loss": 1.1732,
1510
+ "step": 1050
1511
+ },
1512
+ {
1513
+ "epoch": 2.1933471933471935,
1514
+ "grad_norm": 3.59375,
1515
+ "learning_rate": 3.3694010808620733e-07,
1516
+ "loss": 1.1619,
1517
+ "step": 1055
1518
+ },
1519
+ {
1520
+ "epoch": 2.2037422037422036,
1521
+ "grad_norm": 3.625,
1522
+ "learning_rate": 3.288197381493075e-07,
1523
+ "loss": 1.1673,
1524
+ "step": 1060
1525
+ },
1526
+ {
1527
+ "epoch": 2.214137214137214,
1528
+ "grad_norm": 3.609375,
1529
+ "learning_rate": 3.207791211462383e-07,
1530
+ "loss": 1.1725,
1531
+ "step": 1065
1532
+ },
1533
+ {
1534
+ "epoch": 2.2245322245322248,
1535
+ "grad_norm": 3.546875,
1536
+ "learning_rate": 3.128192125025869e-07,
1537
+ "loss": 1.1673,
1538
+ "step": 1070
1539
+ },
1540
+ {
1541
+ "epoch": 2.234927234927235,
1542
+ "grad_norm": 3.578125,
1543
+ "learning_rate": 3.049409580537773e-07,
1544
+ "loss": 1.1735,
1545
+ "step": 1075
1546
+ },
1547
+ {
1548
+ "epoch": 2.2453222453222454,
1549
+ "grad_norm": 3.5625,
1550
+ "learning_rate": 2.9714529393268016e-07,
1551
+ "loss": 1.1583,
1552
+ "step": 1080
1553
+ },
1554
+ {
1555
+ "epoch": 2.2557172557172556,
1556
+ "grad_norm": 3.484375,
1557
+ "learning_rate": 2.8943314645837955e-07,
1558
+ "loss": 1.1715,
1559
+ "step": 1085
1560
+ },
1561
+ {
1562
+ "epoch": 2.266112266112266,
1563
+ "grad_norm": 3.609375,
1564
+ "learning_rate": 2.8180543202609984e-07,
1565
+ "loss": 1.164,
1566
+ "step": 1090
1567
+ },
1568
+ {
1569
+ "epoch": 2.2765072765072767,
1570
+ "grad_norm": 3.53125,
1571
+ "learning_rate": 2.742630569983182e-07,
1572
+ "loss": 1.1695,
1573
+ "step": 1095
1574
+ },
1575
+ {
1576
+ "epoch": 2.286902286902287,
1577
+ "grad_norm": 3.578125,
1578
+ "learning_rate": 2.66806917597064e-07,
1579
+ "loss": 1.169,
1580
+ "step": 1100
1581
+ },
1582
+ {
1583
+ "epoch": 2.2972972972972974,
1584
+ "grad_norm": 3.578125,
1585
+ "learning_rate": 2.594378997974267e-07,
1586
+ "loss": 1.1615,
1587
+ "step": 1105
1588
+ },
1589
+ {
1590
+ "epoch": 2.3076923076923075,
1591
+ "grad_norm": 3.6875,
1592
+ "learning_rate": 2.5215687922227845e-07,
1593
+ "loss": 1.1712,
1594
+ "step": 1110
1595
+ },
1596
+ {
1597
+ "epoch": 2.318087318087318,
1598
+ "grad_norm": 3.515625,
1599
+ "learning_rate": 2.4496472103823027e-07,
1600
+ "loss": 1.1688,
1601
+ "step": 1115
1602
+ },
1603
+ {
1604
+ "epoch": 2.3284823284823286,
1605
+ "grad_norm": 3.609375,
1606
+ "learning_rate": 2.378622798528266e-07,
1607
+ "loss": 1.1631,
1608
+ "step": 1120
1609
+ },
1610
+ {
1611
+ "epoch": 2.3388773388773387,
1612
+ "grad_norm": 3.6875,
1613
+ "learning_rate": 2.3085039961299814e-07,
1614
+ "loss": 1.1671,
1615
+ "step": 1125
1616
+ },
1617
+ {
1618
+ "epoch": 2.3492723492723493,
1619
+ "grad_norm": 3.640625,
1620
+ "learning_rate": 2.239299135047794e-07,
1621
+ "loss": 1.1623,
1622
+ "step": 1130
1623
+ },
1624
+ {
1625
+ "epoch": 2.35966735966736,
1626
+ "grad_norm": 3.578125,
1627
+ "learning_rate": 2.1710164385430585e-07,
1628
+ "loss": 1.1716,
1629
+ "step": 1135
1630
+ },
1631
+ {
1632
+ "epoch": 2.37006237006237,
1633
+ "grad_norm": 3.546875,
1634
+ "learning_rate": 2.103664020300997e-07,
1635
+ "loss": 1.1674,
1636
+ "step": 1140
1637
+ },
1638
+ {
1639
+ "epoch": 2.3804573804573805,
1640
+ "grad_norm": 3.453125,
1641
+ "learning_rate": 2.037249883466614e-07,
1642
+ "loss": 1.1623,
1643
+ "step": 1145
1644
+ },
1645
+ {
1646
+ "epoch": 2.390852390852391,
1647
+ "grad_norm": 3.65625,
1648
+ "learning_rate": 1.971781919693697e-07,
1649
+ "loss": 1.1808,
1650
+ "step": 1150
1651
+ },
1652
+ {
1653
+ "epoch": 2.401247401247401,
1654
+ "grad_norm": 3.625,
1655
+ "learning_rate": 1.9072679082071163e-07,
1656
+ "loss": 1.169,
1657
+ "step": 1155
1658
+ },
1659
+ {
1660
+ "epoch": 2.4116424116424118,
1661
+ "grad_norm": 3.671875,
1662
+ "learning_rate": 1.8437155148784433e-07,
1663
+ "loss": 1.1717,
1664
+ "step": 1160
1665
+ },
1666
+ {
1667
+ "epoch": 2.422037422037422,
1668
+ "grad_norm": 3.59375,
1669
+ "learning_rate": 1.781132291315064e-07,
1670
+ "loss": 1.1706,
1671
+ "step": 1165
1672
+ },
1673
+ {
1674
+ "epoch": 2.4324324324324325,
1675
+ "grad_norm": 3.625,
1676
+ "learning_rate": 1.7195256739628439e-07,
1677
+ "loss": 1.1722,
1678
+ "step": 1170
1679
+ },
1680
+ {
1681
+ "epoch": 2.442827442827443,
1682
+ "grad_norm": 3.71875,
1683
+ "learning_rate": 1.6589029832225155e-07,
1684
+ "loss": 1.1615,
1685
+ "step": 1175
1686
+ },
1687
+ {
1688
+ "epoch": 2.453222453222453,
1689
+ "grad_norm": 3.625,
1690
+ "learning_rate": 1.599271422579812e-07,
1691
+ "loss": 1.1691,
1692
+ "step": 1180
1693
+ },
1694
+ {
1695
+ "epoch": 2.4636174636174637,
1696
+ "grad_norm": 3.640625,
1697
+ "learning_rate": 1.5406380777495297e-07,
1698
+ "loss": 1.1647,
1699
+ "step": 1185
1700
+ },
1701
+ {
1702
+ "epoch": 2.474012474012474,
1703
+ "grad_norm": 3.59375,
1704
+ "learning_rate": 1.4830099158335563e-07,
1705
+ "loss": 1.1707,
1706
+ "step": 1190
1707
+ },
1708
+ {
1709
+ "epoch": 2.4844074844074844,
1710
+ "grad_norm": 3.578125,
1711
+ "learning_rate": 1.426393784493015e-07,
1712
+ "loss": 1.1564,
1713
+ "step": 1195
1714
+ },
1715
+ {
1716
+ "epoch": 2.494802494802495,
1717
+ "grad_norm": 3.59375,
1718
+ "learning_rate": 1.3707964111345805e-07,
1719
+ "loss": 1.1721,
1720
+ "step": 1200
1721
+ },
1722
+ {
1723
+ "epoch": 2.494802494802495,
1724
+ "eval_loss": 1.1930803060531616,
1725
+ "eval_runtime": 10.8355,
1726
+ "eval_samples_per_second": 85.829,
1727
+ "eval_steps_per_second": 2.769,
1728
+ "step": 1200
1729
+ },
1730
+ {
1731
+ "epoch": 2.505197505197505,
1732
+ "grad_norm": 3.65625,
1733
+ "learning_rate": 1.3162244021111123e-07,
1734
+ "loss": 1.1677,
1735
+ "step": 1205
1736
+ },
1737
+ {
1738
+ "epoch": 2.5155925155925156,
1739
+ "grad_norm": 3.59375,
1740
+ "learning_rate": 1.2626842419366369e-07,
1741
+ "loss": 1.1551,
1742
+ "step": 1210
1743
+ },
1744
+ {
1745
+ "epoch": 2.525987525987526,
1746
+ "grad_norm": 3.640625,
1747
+ "learning_rate": 1.2101822925158378e-07,
1748
+ "loss": 1.1678,
1749
+ "step": 1215
1750
+ },
1751
+ {
1752
+ "epoch": 2.5363825363825363,
1753
+ "grad_norm": 3.65625,
1754
+ "learning_rate": 1.1587247923881016e-07,
1755
+ "loss": 1.1731,
1756
+ "step": 1220
1757
+ },
1758
+ {
1759
+ "epoch": 2.546777546777547,
1760
+ "grad_norm": 3.6875,
1761
+ "learning_rate": 1.1083178559862227e-07,
1762
+ "loss": 1.1707,
1763
+ "step": 1225
1764
+ },
1765
+ {
1766
+ "epoch": 2.5571725571725574,
1767
+ "grad_norm": 3.671875,
1768
+ "learning_rate": 1.0589674729098507e-07,
1769
+ "loss": 1.1733,
1770
+ "step": 1230
1771
+ },
1772
+ {
1773
+ "epoch": 2.5675675675675675,
1774
+ "grad_norm": 3.578125,
1775
+ "learning_rate": 1.0106795072137896e-07,
1776
+ "loss": 1.1741,
1777
+ "step": 1235
1778
+ },
1779
+ {
1780
+ "epoch": 2.577962577962578,
1781
+ "grad_norm": 3.625,
1782
+ "learning_rate": 9.634596967111853e-08,
1783
+ "loss": 1.1704,
1784
+ "step": 1240
1785
+ },
1786
+ {
1787
+ "epoch": 2.5883575883575882,
1788
+ "grad_norm": 3.640625,
1789
+ "learning_rate": 9.173136522917457e-08,
1790
+ "loss": 1.1679,
1791
+ "step": 1245
1792
+ },
1793
+ {
1794
+ "epoch": 2.598752598752599,
1795
+ "grad_norm": 3.546875,
1796
+ "learning_rate": 8.722468572550213e-08,
1797
+ "loss": 1.1682,
1798
+ "step": 1250
1799
+ },
1800
+ {
1801
+ "epoch": 2.609147609147609,
1802
+ "grad_norm": 3.640625,
1803
+ "learning_rate": 8.28264666658851e-08,
1804
+ "loss": 1.1653,
1805
+ "step": 1255
1806
+ },
1807
+ {
1808
+ "epoch": 2.6195426195426195,
1809
+ "grad_norm": 3.734375,
1810
+ "learning_rate": 7.853723066830486e-08,
1811
+ "loss": 1.1672,
1812
+ "step": 1260
1813
+ },
1814
+ {
1815
+ "epoch": 2.62993762993763,
1816
+ "grad_norm": 3.640625,
1817
+ "learning_rate": 7.435748740084046e-08,
1818
+ "loss": 1.1606,
1819
+ "step": 1265
1820
+ },
1821
+ {
1822
+ "epoch": 2.64033264033264,
1823
+ "grad_norm": 3.65625,
1824
+ "learning_rate": 7.028773352110684e-08,
1825
+ "loss": 1.1634,
1826
+ "step": 1270
1827
+ },
1828
+ {
1829
+ "epoch": 2.6507276507276507,
1830
+ "grad_norm": 3.515625,
1831
+ "learning_rate": 6.632845261724051e-08,
1832
+ "loss": 1.1635,
1833
+ "step": 1275
1834
+ },
1835
+ {
1836
+ "epoch": 2.6611226611226613,
1837
+ "grad_norm": 4.1875,
1838
+ "learning_rate": 6.248011515043617e-08,
1839
+ "loss": 1.1641,
1840
+ "step": 1280
1841
+ },
1842
+ {
1843
+ "epoch": 2.6715176715176714,
1844
+ "grad_norm": 3.65625,
1845
+ "learning_rate": 5.8743178399044966e-08,
1846
+ "loss": 1.1642,
1847
+ "step": 1285
1848
+ },
1849
+ {
1850
+ "epoch": 2.681912681912682,
1851
+ "grad_norm": 3.625,
1852
+ "learning_rate": 5.511808640423765e-08,
1853
+ "loss": 1.1727,
1854
+ "step": 1290
1855
+ },
1856
+ {
1857
+ "epoch": 2.6923076923076925,
1858
+ "grad_norm": 3.59375,
1859
+ "learning_rate": 5.160526991724246e-08,
1860
+ "loss": 1.1732,
1861
+ "step": 1295
1862
+ },
1863
+ {
1864
+ "epoch": 2.7027027027027026,
1865
+ "grad_norm": 3.609375,
1866
+ "learning_rate": 4.8205146348160195e-08,
1867
+ "loss": 1.1699,
1868
+ "step": 1300
1869
+ },
1870
+ {
1871
+ "epoch": 2.713097713097713,
1872
+ "grad_norm": 3.6875,
1873
+ "learning_rate": 4.491811971636605e-08,
1874
+ "loss": 1.166,
1875
+ "step": 1305
1876
+ },
1877
+ {
1878
+ "epoch": 2.7234927234927238,
1879
+ "grad_norm": 3.625,
1880
+ "learning_rate": 4.174458060250208e-08,
1881
+ "loss": 1.1712,
1882
+ "step": 1310
1883
+ },
1884
+ {
1885
+ "epoch": 2.733887733887734,
1886
+ "grad_norm": 3.59375,
1887
+ "learning_rate": 3.868490610206565e-08,
1888
+ "loss": 1.1595,
1889
+ "step": 1315
1890
+ },
1891
+ {
1892
+ "epoch": 2.7442827442827444,
1893
+ "grad_norm": 3.578125,
1894
+ "learning_rate": 3.5739459780602665e-08,
1895
+ "loss": 1.1684,
1896
+ "step": 1320
1897
+ },
1898
+ {
1899
+ "epoch": 2.7546777546777546,
1900
+ "grad_norm": 3.578125,
1901
+ "learning_rate": 3.290859163050508e-08,
1902
+ "loss": 1.1744,
1903
+ "step": 1325
1904
+ },
1905
+ {
1906
+ "epoch": 2.765072765072765,
1907
+ "grad_norm": 3.671875,
1908
+ "learning_rate": 3.0192638029424735e-08,
1909
+ "loss": 1.1664,
1910
+ "step": 1330
1911
+ },
1912
+ {
1913
+ "epoch": 2.7754677754677752,
1914
+ "grad_norm": 3.53125,
1915
+ "learning_rate": 2.7591921700302222e-08,
1916
+ "loss": 1.1612,
1917
+ "step": 1335
1918
+ },
1919
+ {
1920
+ "epoch": 2.785862785862786,
1921
+ "grad_norm": 3.609375,
1922
+ "learning_rate": 2.5106751673020012e-08,
1923
+ "loss": 1.174,
1924
+ "step": 1340
1925
+ },
1926
+ {
1927
+ "epoch": 2.7962577962577964,
1928
+ "grad_norm": 3.59375,
1929
+ "learning_rate": 2.273742324768124e-08,
1930
+ "loss": 1.1602,
1931
+ "step": 1345
1932
+ },
1933
+ {
1934
+ "epoch": 2.8066528066528065,
1935
+ "grad_norm": 3.546875,
1936
+ "learning_rate": 2.048421795952171e-08,
1937
+ "loss": 1.1708,
1938
+ "step": 1350
1939
+ },
1940
+ {
1941
+ "epoch": 2.817047817047817,
1942
+ "grad_norm": 3.609375,
1943
+ "learning_rate": 1.8347403545455497e-08,
1944
+ "loss": 1.1622,
1945
+ "step": 1355
1946
+ },
1947
+ {
1948
+ "epoch": 2.8274428274428276,
1949
+ "grad_norm": 3.640625,
1950
+ "learning_rate": 1.6327233912261984e-08,
1951
+ "loss": 1.1668,
1952
+ "step": 1360
1953
+ },
1954
+ {
1955
+ "epoch": 2.8378378378378377,
1956
+ "grad_norm": 3.609375,
1957
+ "learning_rate": 1.4423949106414868e-08,
1958
+ "loss": 1.1708,
1959
+ "step": 1365
1960
+ },
1961
+ {
1962
+ "epoch": 2.8482328482328483,
1963
+ "grad_norm": 3.640625,
1964
+ "learning_rate": 1.2637775285558983e-08,
1965
+ "loss": 1.1663,
1966
+ "step": 1370
1967
+ },
1968
+ {
1969
+ "epoch": 2.858627858627859,
1970
+ "grad_norm": 3.5625,
1971
+ "learning_rate": 1.0968924691636572e-08,
1972
+ "loss": 1.1621,
1973
+ "step": 1375
1974
+ },
1975
+ {
1976
+ "epoch": 2.869022869022869,
1977
+ "grad_norm": 3.671875,
1978
+ "learning_rate": 9.417595625668462e-09,
1979
+ "loss": 1.1769,
1980
+ "step": 1380
1981
+ },
1982
+ {
1983
+ "epoch": 2.8794178794178795,
1984
+ "grad_norm": 3.71875,
1985
+ "learning_rate": 7.983972424190354e-09,
1986
+ "loss": 1.1784,
1987
+ "step": 1385
1988
+ },
1989
+ {
1990
+ "epoch": 2.88981288981289,
1991
+ "grad_norm": 3.65625,
1992
+ "learning_rate": 6.668225437349351e-09,
1993
+ "loss": 1.1734,
1994
+ "step": 1390
1995
+ },
1996
+ {
1997
+ "epoch": 2.9002079002079,
1998
+ "grad_norm": 3.75,
1999
+ "learning_rate": 5.470511008662026e-09,
2000
+ "loss": 1.1747,
2001
+ "step": 1395
2002
+ },
2003
+ {
2004
+ "epoch": 2.9106029106029108,
2005
+ "grad_norm": 3.671875,
2006
+ "learning_rate": 4.390971456437076e-09,
2007
+ "loss": 1.171,
2008
+ "step": 1400
2009
+ },
2010
+ {
2011
+ "epoch": 2.920997920997921,
2012
+ "grad_norm": 3.671875,
2013
+ "learning_rate": 3.429735056863725e-09,
2014
+ "loss": 1.1667,
2015
+ "step": 1405
2016
+ },
2017
+ {
2018
+ "epoch": 2.9313929313929314,
2019
+ "grad_norm": 3.671875,
2020
+ "learning_rate": 2.5869160287702586e-09,
2021
+ "loss": 1.1683,
2022
+ "step": 1410
2023
+ },
2024
+ {
2025
+ "epoch": 2.9417879417879416,
2026
+ "grad_norm": 3.625,
2027
+ "learning_rate": 1.8626145200513199e-09,
2028
+ "loss": 1.1702,
2029
+ "step": 1415
2030
+ },
2031
+ {
2032
+ "epoch": 2.952182952182952,
2033
+ "grad_norm": 3.5,
2034
+ "learning_rate": 1.2569165957680983e-09,
2035
+ "loss": 1.1712,
2036
+ "step": 1420
2037
+ },
2038
+ {
2039
+ "epoch": 2.9625779625779627,
2040
+ "grad_norm": 3.640625,
2041
+ "learning_rate": 7.698942279216192e-10,
2042
+ "loss": 1.1725,
2043
+ "step": 1425
2044
+ },
2045
+ {
2046
+ "epoch": 2.972972972972973,
2047
+ "grad_norm": 3.5,
2048
+ "learning_rate": 4.016052869005859e-10,
2049
+ "loss": 1.1693,
2050
+ "step": 1430
2051
+ },
2052
+ {
2053
+ "epoch": 2.9833679833679834,
2054
+ "grad_norm": 3.578125,
2055
+ "learning_rate": 1.520935346051022e-10,
2056
+ "loss": 1.1682,
2057
+ "step": 1435
2058
+ },
2059
+ {
2060
+ "epoch": 2.993762993762994,
2061
+ "grad_norm": 3.609375,
2062
+ "learning_rate": 2.1388619246498486e-11,
2063
+ "loss": 1.1627,
2064
+ "step": 1440
2065
  }
2066
  ],
2067
  "logging_steps": 5,
 
2076
  "should_evaluate": false,
2077
  "should_log": false,
2078
  "should_save": true,
2079
+ "should_training_stop": true
2080
  },
2081
  "attributes": {}
2082
  }
2083
  },
2084
+ "total_flos": 8.072899219804914e+18,
2085
  "train_batch_size": 8,
2086
  "trial_name": null,
2087
  "trial_params": null