Training in progress, step 17700, checkpoint
Browse files
.gitattributes
CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-17700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
checkpoint-17700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3541119728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f673fbf3d293bdfb7083302f95a085b1c3809362d73ae1feb5db6ae0ec6a3e7e
|
3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 778374186
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a085a3464ee030b3211908f350dc686f7d0f79f38846fc7e1d9c5bc49537c610
|
3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:390ed221cdfff0759c038f9f9dc672e97ebfcc32b38cca4add3a81bbe314fc8b
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 500,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -15668,6 +15668,276 @@
|
|
15668 |
"mean_token_accuracy": 0.8778254583477973,
|
15669 |
"num_tokens": 28906983.0,
|
15670 |
"step": 17400
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15671 |
}
|
15672 |
],
|
15673 |
"logging_steps": 10,
|
@@ -15687,7 +15957,7 @@
|
|
15687 |
"attributes": {}
|
15688 |
}
|
15689 |
},
|
15690 |
-
"total_flos": 6.
|
15691 |
"train_batch_size": 2,
|
15692 |
"trial_name": null,
|
15693 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.844677221155108,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 17700,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
15668 |
"mean_token_accuracy": 0.8778254583477973,
|
15669 |
"num_tokens": 28906983.0,
|
15670 |
"step": 17400
|
15671 |
+
},
|
15672 |
+
{
|
15673 |
+
"epoch": 0.8308378768536967,
|
15674 |
+
"grad_norm": 0.4296000599861145,
|
15675 |
+
"learning_rate": 1.1692197566213315e-05,
|
15676 |
+
"loss": 0.7167,
|
15677 |
+
"mean_token_accuracy": 0.8584795027971268,
|
15678 |
+
"num_tokens": 28924631.0,
|
15679 |
+
"step": 17410
|
15680 |
+
},
|
15681 |
+
{
|
15682 |
+
"epoch": 0.8313150956227109,
|
15683 |
+
"grad_norm": 0.35790425539016724,
|
15684 |
+
"learning_rate": 1.1687425435456932e-05,
|
15685 |
+
"loss": 0.6852,
|
15686 |
+
"mean_token_accuracy": 0.8608928889036178,
|
15687 |
+
"num_tokens": 28943848.0,
|
15688 |
+
"step": 17420
|
15689 |
+
},
|
15690 |
+
{
|
15691 |
+
"epoch": 0.8317923143917251,
|
15692 |
+
"grad_norm": 0.35815557837486267,
|
15693 |
+
"learning_rate": 1.168265330470055e-05,
|
15694 |
+
"loss": 0.5988,
|
15695 |
+
"mean_token_accuracy": 0.8806958973407746,
|
15696 |
+
"num_tokens": 28960033.0,
|
15697 |
+
"step": 17430
|
15698 |
+
},
|
15699 |
+
{
|
15700 |
+
"epoch": 0.8322695331607393,
|
15701 |
+
"grad_norm": 0.41242972016334534,
|
15702 |
+
"learning_rate": 1.1677881173944167e-05,
|
15703 |
+
"loss": 0.7573,
|
15704 |
+
"mean_token_accuracy": 0.8531021654605866,
|
15705 |
+
"num_tokens": 28978291.0,
|
15706 |
+
"step": 17440
|
15707 |
+
},
|
15708 |
+
{
|
15709 |
+
"epoch": 0.8327467519297534,
|
15710 |
+
"grad_norm": 0.3900233805179596,
|
15711 |
+
"learning_rate": 1.1673109043187785e-05,
|
15712 |
+
"loss": 0.6684,
|
15713 |
+
"mean_token_accuracy": 0.8653559580445289,
|
15714 |
+
"num_tokens": 28995166.0,
|
15715 |
+
"step": 17450
|
15716 |
+
},
|
15717 |
+
{
|
15718 |
+
"epoch": 0.8332239706987676,
|
15719 |
+
"grad_norm": 0.3421511948108673,
|
15720 |
+
"learning_rate": 1.1668336912431402e-05,
|
15721 |
+
"loss": 0.5933,
|
15722 |
+
"mean_token_accuracy": 0.8817808702588081,
|
15723 |
+
"num_tokens": 29011537.0,
|
15724 |
+
"step": 17460
|
15725 |
+
},
|
15726 |
+
{
|
15727 |
+
"epoch": 0.8337011894677818,
|
15728 |
+
"grad_norm": 0.3471275269985199,
|
15729 |
+
"learning_rate": 1.1663564781675019e-05,
|
15730 |
+
"loss": 0.6444,
|
15731 |
+
"mean_token_accuracy": 0.8728347107768059,
|
15732 |
+
"num_tokens": 29029323.0,
|
15733 |
+
"step": 17470
|
15734 |
+
},
|
15735 |
+
{
|
15736 |
+
"epoch": 0.8341784082367959,
|
15737 |
+
"grad_norm": 0.4366353750228882,
|
15738 |
+
"learning_rate": 1.1658792650918635e-05,
|
15739 |
+
"loss": 0.6836,
|
15740 |
+
"mean_token_accuracy": 0.8620174437761307,
|
15741 |
+
"num_tokens": 29048157.0,
|
15742 |
+
"step": 17480
|
15743 |
+
},
|
15744 |
+
{
|
15745 |
+
"epoch": 0.8346556270058101,
|
15746 |
+
"grad_norm": 0.3484688997268677,
|
15747 |
+
"learning_rate": 1.1654020520162252e-05,
|
15748 |
+
"loss": 0.6891,
|
15749 |
+
"mean_token_accuracy": 0.8628757908940315,
|
15750 |
+
"num_tokens": 29064710.0,
|
15751 |
+
"step": 17490
|
15752 |
+
},
|
15753 |
+
{
|
15754 |
+
"epoch": 0.8351328457748243,
|
15755 |
+
"grad_norm": 0.3586483597755432,
|
15756 |
+
"learning_rate": 1.1649248389405872e-05,
|
15757 |
+
"loss": 0.6487,
|
15758 |
+
"mean_token_accuracy": 0.8712560385465622,
|
15759 |
+
"num_tokens": 29081864.0,
|
15760 |
+
"step": 17500
|
15761 |
+
},
|
15762 |
+
{
|
15763 |
+
"epoch": 0.8356100645438385,
|
15764 |
+
"grad_norm": 0.35695043206214905,
|
15765 |
+
"learning_rate": 1.1644476258649489e-05,
|
15766 |
+
"loss": 0.652,
|
15767 |
+
"mean_token_accuracy": 0.8665311306715011,
|
15768 |
+
"num_tokens": 29098008.0,
|
15769 |
+
"step": 17510
|
15770 |
+
},
|
15771 |
+
{
|
15772 |
+
"epoch": 0.8360872833128526,
|
15773 |
+
"grad_norm": 0.35384443402290344,
|
15774 |
+
"learning_rate": 1.1639704127893106e-05,
|
15775 |
+
"loss": 0.6137,
|
15776 |
+
"mean_token_accuracy": 0.8707799568772316,
|
15777 |
+
"num_tokens": 29114025.0,
|
15778 |
+
"step": 17520
|
15779 |
+
},
|
15780 |
+
{
|
15781 |
+
"epoch": 0.8365645020818668,
|
15782 |
+
"grad_norm": 0.4258424639701843,
|
15783 |
+
"learning_rate": 1.1634931997136722e-05,
|
15784 |
+
"loss": 0.446,
|
15785 |
+
"mean_token_accuracy": 0.9021103799343109,
|
15786 |
+
"num_tokens": 29127980.0,
|
15787 |
+
"step": 17530
|
15788 |
+
},
|
15789 |
+
{
|
15790 |
+
"epoch": 0.837041720850881,
|
15791 |
+
"grad_norm": 0.4185291826725006,
|
15792 |
+
"learning_rate": 1.1630159866380339e-05,
|
15793 |
+
"loss": 0.7187,
|
15794 |
+
"mean_token_accuracy": 0.8666620507836342,
|
15795 |
+
"num_tokens": 29145781.0,
|
15796 |
+
"step": 17540
|
15797 |
+
},
|
15798 |
+
{
|
15799 |
+
"epoch": 0.8375189396198952,
|
15800 |
+
"grad_norm": 0.3698226511478424,
|
15801 |
+
"learning_rate": 1.1625387735623956e-05,
|
15802 |
+
"loss": 0.8038,
|
15803 |
+
"mean_token_accuracy": 0.83167435079813,
|
15804 |
+
"num_tokens": 29164539.0,
|
15805 |
+
"step": 17550
|
15806 |
+
},
|
15807 |
+
{
|
15808 |
+
"epoch": 0.8379961583889094,
|
15809 |
+
"grad_norm": 0.5082905888557434,
|
15810 |
+
"learning_rate": 1.1620615604867574e-05,
|
15811 |
+
"loss": 0.6794,
|
15812 |
+
"mean_token_accuracy": 0.8646394088864326,
|
15813 |
+
"num_tokens": 29181392.0,
|
15814 |
+
"step": 17560
|
15815 |
+
},
|
15816 |
+
{
|
15817 |
+
"epoch": 0.8384733771579236,
|
15818 |
+
"grad_norm": 0.419879287481308,
|
15819 |
+
"learning_rate": 1.1615843474111192e-05,
|
15820 |
+
"loss": 0.748,
|
15821 |
+
"mean_token_accuracy": 0.8569721296429634,
|
15822 |
+
"num_tokens": 29198880.0,
|
15823 |
+
"step": 17570
|
15824 |
+
},
|
15825 |
+
{
|
15826 |
+
"epoch": 0.8389505959269378,
|
15827 |
+
"grad_norm": 0.3323199450969696,
|
15828 |
+
"learning_rate": 1.1611071343354809e-05,
|
15829 |
+
"loss": 0.6364,
|
15830 |
+
"mean_token_accuracy": 0.8711786776781082,
|
15831 |
+
"num_tokens": 29215341.0,
|
15832 |
+
"step": 17580
|
15833 |
+
},
|
15834 |
+
{
|
15835 |
+
"epoch": 0.839427814695952,
|
15836 |
+
"grad_norm": 0.35589149594306946,
|
15837 |
+
"learning_rate": 1.1606299212598426e-05,
|
15838 |
+
"loss": 0.5241,
|
15839 |
+
"mean_token_accuracy": 0.886940547823906,
|
15840 |
+
"num_tokens": 29230599.0,
|
15841 |
+
"step": 17590
|
15842 |
+
},
|
15843 |
+
{
|
15844 |
+
"epoch": 0.8399050334649661,
|
15845 |
+
"grad_norm": 0.3645700216293335,
|
15846 |
+
"learning_rate": 1.1601527081842044e-05,
|
15847 |
+
"loss": 0.6459,
|
15848 |
+
"mean_token_accuracy": 0.8628365308046341,
|
15849 |
+
"num_tokens": 29247686.0,
|
15850 |
+
"step": 17600
|
15851 |
+
},
|
15852 |
+
{
|
15853 |
+
"epoch": 0.8403822522339803,
|
15854 |
+
"grad_norm": 0.4367329180240631,
|
15855 |
+
"learning_rate": 1.159675495108566e-05,
|
15856 |
+
"loss": 0.5824,
|
15857 |
+
"mean_token_accuracy": 0.8813767299056053,
|
15858 |
+
"num_tokens": 29263208.0,
|
15859 |
+
"step": 17610
|
15860 |
+
},
|
15861 |
+
{
|
15862 |
+
"epoch": 0.8408594710029945,
|
15863 |
+
"grad_norm": 0.4404272139072418,
|
15864 |
+
"learning_rate": 1.1591982820329277e-05,
|
15865 |
+
"loss": 0.7341,
|
15866 |
+
"mean_token_accuracy": 0.861807630956173,
|
15867 |
+
"num_tokens": 29282037.0,
|
15868 |
+
"step": 17620
|
15869 |
+
},
|
15870 |
+
{
|
15871 |
+
"epoch": 0.8413366897720087,
|
15872 |
+
"grad_norm": 0.32958847284317017,
|
15873 |
+
"learning_rate": 1.1587210689572894e-05,
|
15874 |
+
"loss": 0.5831,
|
15875 |
+
"mean_token_accuracy": 0.8764279022812843,
|
15876 |
+
"num_tokens": 29297330.0,
|
15877 |
+
"step": 17630
|
15878 |
+
},
|
15879 |
+
{
|
15880 |
+
"epoch": 0.8418139085410229,
|
15881 |
+
"grad_norm": 0.3422182500362396,
|
15882 |
+
"learning_rate": 1.1582438558816514e-05,
|
15883 |
+
"loss": 0.7288,
|
15884 |
+
"mean_token_accuracy": 0.8610541269183158,
|
15885 |
+
"num_tokens": 29314275.0,
|
15886 |
+
"step": 17640
|
15887 |
+
},
|
15888 |
+
{
|
15889 |
+
"epoch": 0.8422911273100371,
|
15890 |
+
"grad_norm": 0.3618062138557434,
|
15891 |
+
"learning_rate": 1.157766642806013e-05,
|
15892 |
+
"loss": 0.8217,
|
15893 |
+
"mean_token_accuracy": 0.8382393896579743,
|
15894 |
+
"num_tokens": 29334784.0,
|
15895 |
+
"step": 17650
|
15896 |
+
},
|
15897 |
+
{
|
15898 |
+
"epoch": 0.8427683460790513,
|
15899 |
+
"grad_norm": 0.3208582103252411,
|
15900 |
+
"learning_rate": 1.1572894297303747e-05,
|
15901 |
+
"loss": 0.6157,
|
15902 |
+
"mean_token_accuracy": 0.8792721211910248,
|
15903 |
+
"num_tokens": 29351815.0,
|
15904 |
+
"step": 17660
|
15905 |
+
},
|
15906 |
+
{
|
15907 |
+
"epoch": 0.8432455648480655,
|
15908 |
+
"grad_norm": 0.38699203729629517,
|
15909 |
+
"learning_rate": 1.1568122166547364e-05,
|
15910 |
+
"loss": 0.5618,
|
15911 |
+
"mean_token_accuracy": 0.8911767050623893,
|
15912 |
+
"num_tokens": 29366991.0,
|
15913 |
+
"step": 17670
|
15914 |
+
},
|
15915 |
+
{
|
15916 |
+
"epoch": 0.8437227836170796,
|
15917 |
+
"grad_norm": 0.36298489570617676,
|
15918 |
+
"learning_rate": 1.156335003579098e-05,
|
15919 |
+
"loss": 0.7076,
|
15920 |
+
"mean_token_accuracy": 0.8556164249777793,
|
15921 |
+
"num_tokens": 29384444.0,
|
15922 |
+
"step": 17680
|
15923 |
+
},
|
15924 |
+
{
|
15925 |
+
"epoch": 0.8442000023860938,
|
15926 |
+
"grad_norm": 0.32522013783454895,
|
15927 |
+
"learning_rate": 1.1558577905034597e-05,
|
15928 |
+
"loss": 0.6395,
|
15929 |
+
"mean_token_accuracy": 0.87257649153471,
|
15930 |
+
"num_tokens": 29401037.0,
|
15931 |
+
"step": 17690
|
15932 |
+
},
|
15933 |
+
{
|
15934 |
+
"epoch": 0.844677221155108,
|
15935 |
+
"grad_norm": 0.36789947748184204,
|
15936 |
+
"learning_rate": 1.1553805774278218e-05,
|
15937 |
+
"loss": 0.6614,
|
15938 |
+
"mean_token_accuracy": 0.8719203874468804,
|
15939 |
+
"num_tokens": 29418205.0,
|
15940 |
+
"step": 17700
|
15941 |
}
|
15942 |
],
|
15943 |
"logging_steps": 10,
|
|
|
15957 |
"attributes": {}
|
15958 |
}
|
15959 |
},
|
15960 |
+
"total_flos": 6.62507884939518e+17,
|
15961 |
"train_batch_size": 2,
|
15962 |
"trial_name": null,
|
15963 |
"trial_params": null
|