Training in progress, step 15600, checkpoint
Browse files
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3541119728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70537360c9daddf04205b6fbd293c0d4965ec40c67ef261daf546af624afd98f
|
3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 778374186
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ca05e267a448d87fef33633929234240f69ebde46a8d89d8a7bbe11cbc11f6c
|
3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8955cd9d24ecd092d5a24dfa8ee9d34839e14159c86f280833a6a8e4cb640de6
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 0.
|
6 |
"eval_steps": 500,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -13778,6 +13778,276 @@
|
|
13778 |
"mean_token_accuracy": 0.8891868680715561,
|
13779 |
"num_tokens": 25427005.0,
|
13780 |
"step": 15300
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13781 |
}
|
13782 |
],
|
13783 |
"logging_steps": 10,
|
@@ -13797,7 +14067,7 @@
|
|
13797 |
"attributes": {}
|
13798 |
}
|
13799 |
},
|
13800 |
-
"total_flos": 5.
|
13801 |
"train_batch_size": 2,
|
13802 |
"trial_name": null,
|
13803 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 0.7444612796621292,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 15600,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
13778 |
"mean_token_accuracy": 0.8891868680715561,
|
13779 |
"num_tokens": 25427005.0,
|
13780 |
"step": 15300
|
13781 |
+
},
|
13782 |
+
{
|
13783 |
+
"epoch": 0.7306219353607177,
|
13784 |
+
"grad_norm": 0.3419613242149353,
|
13785 |
+
"learning_rate": 1.2694345025053687e-05,
|
13786 |
+
"loss": 0.6515,
|
13787 |
+
"mean_token_accuracy": 0.8683709055185318,
|
13788 |
+
"num_tokens": 25442614.0,
|
13789 |
+
"step": 15310
|
13790 |
+
},
|
13791 |
+
{
|
13792 |
+
"epoch": 0.7310991541297319,
|
13793 |
+
"grad_norm": 0.35376232862472534,
|
13794 |
+
"learning_rate": 1.2689572894297304e-05,
|
13795 |
+
"loss": 0.6847,
|
13796 |
+
"mean_token_accuracy": 0.8579560115933418,
|
13797 |
+
"num_tokens": 25460244.0,
|
13798 |
+
"step": 15320
|
13799 |
+
},
|
13800 |
+
{
|
13801 |
+
"epoch": 0.7315763728987461,
|
13802 |
+
"grad_norm": 0.4027968943119049,
|
13803 |
+
"learning_rate": 1.2684800763540922e-05,
|
13804 |
+
"loss": 0.5904,
|
13805 |
+
"mean_token_accuracy": 0.8813899368047714,
|
13806 |
+
"num_tokens": 25477478.0,
|
13807 |
+
"step": 15330
|
13808 |
+
},
|
13809 |
+
{
|
13810 |
+
"epoch": 0.7320535916677603,
|
13811 |
+
"grad_norm": 0.33690837025642395,
|
13812 |
+
"learning_rate": 1.268002863278454e-05,
|
13813 |
+
"loss": 0.6076,
|
13814 |
+
"mean_token_accuracy": 0.8772727206349373,
|
13815 |
+
"num_tokens": 25494023.0,
|
13816 |
+
"step": 15340
|
13817 |
+
},
|
13818 |
+
{
|
13819 |
+
"epoch": 0.7325308104367745,
|
13820 |
+
"grad_norm": 0.3748989999294281,
|
13821 |
+
"learning_rate": 1.2675256502028157e-05,
|
13822 |
+
"loss": 0.6861,
|
13823 |
+
"mean_token_accuracy": 0.8666884452104568,
|
13824 |
+
"num_tokens": 25510507.0,
|
13825 |
+
"step": 15350
|
13826 |
+
},
|
13827 |
+
{
|
13828 |
+
"epoch": 0.7330080292057887,
|
13829 |
+
"grad_norm": 0.3961426317691803,
|
13830 |
+
"learning_rate": 1.2670484371271774e-05,
|
13831 |
+
"loss": 0.5477,
|
13832 |
+
"mean_token_accuracy": 0.8997573867440224,
|
13833 |
+
"num_tokens": 25525876.0,
|
13834 |
+
"step": 15360
|
13835 |
+
},
|
13836 |
+
{
|
13837 |
+
"epoch": 0.7334852479748029,
|
13838 |
+
"grad_norm": 0.32060977816581726,
|
13839 |
+
"learning_rate": 1.266571224051539e-05,
|
13840 |
+
"loss": 0.6086,
|
13841 |
+
"mean_token_accuracy": 0.8730768218636513,
|
13842 |
+
"num_tokens": 25542626.0,
|
13843 |
+
"step": 15370
|
13844 |
+
},
|
13845 |
+
{
|
13846 |
+
"epoch": 0.733962466743817,
|
13847 |
+
"grad_norm": 0.4424884617328644,
|
13848 |
+
"learning_rate": 1.2660940109759007e-05,
|
13849 |
+
"loss": 0.6637,
|
13850 |
+
"mean_token_accuracy": 0.8715329870581627,
|
13851 |
+
"num_tokens": 25559552.0,
|
13852 |
+
"step": 15380
|
13853 |
+
},
|
13854 |
+
{
|
13855 |
+
"epoch": 0.7344396855128312,
|
13856 |
+
"grad_norm": 0.2700168192386627,
|
13857 |
+
"learning_rate": 1.2656167979002624e-05,
|
13858 |
+
"loss": 0.6507,
|
13859 |
+
"mean_token_accuracy": 0.8775774970650673,
|
13860 |
+
"num_tokens": 25575310.0,
|
13861 |
+
"step": 15390
|
13862 |
+
},
|
13863 |
+
{
|
13864 |
+
"epoch": 0.7349169042818454,
|
13865 |
+
"grad_norm": 0.34019699692726135,
|
13866 |
+
"learning_rate": 1.2651395848246244e-05,
|
13867 |
+
"loss": 0.6687,
|
13868 |
+
"mean_token_accuracy": 0.8798381179571152,
|
13869 |
+
"num_tokens": 25590414.0,
|
13870 |
+
"step": 15400
|
13871 |
+
},
|
13872 |
+
{
|
13873 |
+
"epoch": 0.7353941230508596,
|
13874 |
+
"grad_norm": 0.41453129053115845,
|
13875 |
+
"learning_rate": 1.2646623717489861e-05,
|
13876 |
+
"loss": 0.692,
|
13877 |
+
"mean_token_accuracy": 0.8671859934926033,
|
13878 |
+
"num_tokens": 25608759.0,
|
13879 |
+
"step": 15410
|
13880 |
+
},
|
13881 |
+
{
|
13882 |
+
"epoch": 0.7358713418198738,
|
13883 |
+
"grad_norm": 0.37873607873916626,
|
13884 |
+
"learning_rate": 1.2641851586733478e-05,
|
13885 |
+
"loss": 0.7208,
|
13886 |
+
"mean_token_accuracy": 0.8635074034333229,
|
13887 |
+
"num_tokens": 25626985.0,
|
13888 |
+
"step": 15420
|
13889 |
+
},
|
13890 |
+
{
|
13891 |
+
"epoch": 0.736348560588888,
|
13892 |
+
"grad_norm": 0.3016092777252197,
|
13893 |
+
"learning_rate": 1.2637079455977094e-05,
|
13894 |
+
"loss": 0.6058,
|
13895 |
+
"mean_token_accuracy": 0.8779332295060158,
|
13896 |
+
"num_tokens": 25642693.0,
|
13897 |
+
"step": 15430
|
13898 |
+
},
|
13899 |
+
{
|
13900 |
+
"epoch": 0.7368257793579022,
|
13901 |
+
"grad_norm": 0.3086267411708832,
|
13902 |
+
"learning_rate": 1.2632307325220713e-05,
|
13903 |
+
"loss": 0.6249,
|
13904 |
+
"mean_token_accuracy": 0.8778651550412178,
|
13905 |
+
"num_tokens": 25659131.0,
|
13906 |
+
"step": 15440
|
13907 |
+
},
|
13908 |
+
{
|
13909 |
+
"epoch": 0.7373029981269164,
|
13910 |
+
"grad_norm": 0.3954660892486572,
|
13911 |
+
"learning_rate": 1.262753519446433e-05,
|
13912 |
+
"loss": 0.6073,
|
13913 |
+
"mean_token_accuracy": 0.8788123086094857,
|
13914 |
+
"num_tokens": 25675282.0,
|
13915 |
+
"step": 15450
|
13916 |
+
},
|
13917 |
+
{
|
13918 |
+
"epoch": 0.7377802168959305,
|
13919 |
+
"grad_norm": 0.3375210165977478,
|
13920 |
+
"learning_rate": 1.2622763063707946e-05,
|
13921 |
+
"loss": 0.5759,
|
13922 |
+
"mean_token_accuracy": 0.8832358941435814,
|
13923 |
+
"num_tokens": 25690284.0,
|
13924 |
+
"step": 15460
|
13925 |
+
},
|
13926 |
+
{
|
13927 |
+
"epoch": 0.7382574356649447,
|
13928 |
+
"grad_norm": 0.429108202457428,
|
13929 |
+
"learning_rate": 1.2617990932951564e-05,
|
13930 |
+
"loss": 0.5676,
|
13931 |
+
"mean_token_accuracy": 0.8806645110249519,
|
13932 |
+
"num_tokens": 25705824.0,
|
13933 |
+
"step": 15470
|
13934 |
+
},
|
13935 |
+
{
|
13936 |
+
"epoch": 0.7387346544339589,
|
13937 |
+
"grad_norm": 0.3869950771331787,
|
13938 |
+
"learning_rate": 1.2613218802195181e-05,
|
13939 |
+
"loss": 0.56,
|
13940 |
+
"mean_token_accuracy": 0.8864919826388359,
|
13941 |
+
"num_tokens": 25721448.0,
|
13942 |
+
"step": 15480
|
13943 |
+
},
|
13944 |
+
{
|
13945 |
+
"epoch": 0.7392118732029731,
|
13946 |
+
"grad_norm": 0.2914048731327057,
|
13947 |
+
"learning_rate": 1.26084466714388e-05,
|
13948 |
+
"loss": 0.6239,
|
13949 |
+
"mean_token_accuracy": 0.880633682012558,
|
13950 |
+
"num_tokens": 25736900.0,
|
13951 |
+
"step": 15490
|
13952 |
+
},
|
13953 |
+
{
|
13954 |
+
"epoch": 0.7396890919719873,
|
13955 |
+
"grad_norm": 0.3728204667568207,
|
13956 |
+
"learning_rate": 1.2603674540682416e-05,
|
13957 |
+
"loss": 0.6358,
|
13958 |
+
"mean_token_accuracy": 0.8782069548964501,
|
13959 |
+
"num_tokens": 25753432.0,
|
13960 |
+
"step": 15500
|
13961 |
+
},
|
13962 |
+
{
|
13963 |
+
"epoch": 0.7401663107410015,
|
13964 |
+
"grad_norm": 0.3584674596786499,
|
13965 |
+
"learning_rate": 1.2598902409926033e-05,
|
13966 |
+
"loss": 0.5266,
|
13967 |
+
"mean_token_accuracy": 0.887596707046032,
|
13968 |
+
"num_tokens": 25769120.0,
|
13969 |
+
"step": 15510
|
13970 |
+
},
|
13971 |
+
{
|
13972 |
+
"epoch": 0.7406435295100157,
|
13973 |
+
"grad_norm": 0.4318288564682007,
|
13974 |
+
"learning_rate": 1.259413027916965e-05,
|
13975 |
+
"loss": 0.5954,
|
13976 |
+
"mean_token_accuracy": 0.8761422768235206,
|
13977 |
+
"num_tokens": 25785477.0,
|
13978 |
+
"step": 15520
|
13979 |
+
},
|
13980 |
+
{
|
13981 |
+
"epoch": 0.7411207482790299,
|
13982 |
+
"grad_norm": 0.3693118989467621,
|
13983 |
+
"learning_rate": 1.2589358148413266e-05,
|
13984 |
+
"loss": 0.6766,
|
13985 |
+
"mean_token_accuracy": 0.8750508233904839,
|
13986 |
+
"num_tokens": 25803338.0,
|
13987 |
+
"step": 15530
|
13988 |
+
},
|
13989 |
+
{
|
13990 |
+
"epoch": 0.741597967048044,
|
13991 |
+
"grad_norm": 0.30119234323501587,
|
13992 |
+
"learning_rate": 1.2584586017656886e-05,
|
13993 |
+
"loss": 0.6606,
|
13994 |
+
"mean_token_accuracy": 0.8698265522718429,
|
13995 |
+
"num_tokens": 25819899.0,
|
13996 |
+
"step": 15540
|
13997 |
+
},
|
13998 |
+
{
|
13999 |
+
"epoch": 0.7420751858170582,
|
14000 |
+
"grad_norm": 0.702343761920929,
|
14001 |
+
"learning_rate": 1.2579813886900503e-05,
|
14002 |
+
"loss": 0.7339,
|
14003 |
+
"mean_token_accuracy": 0.869141760468483,
|
14004 |
+
"num_tokens": 25837531.0,
|
14005 |
+
"step": 15550
|
14006 |
+
},
|
14007 |
+
{
|
14008 |
+
"epoch": 0.7425524045860724,
|
14009 |
+
"grad_norm": 0.35476893186569214,
|
14010 |
+
"learning_rate": 1.257504175614412e-05,
|
14011 |
+
"loss": 0.6443,
|
14012 |
+
"mean_token_accuracy": 0.8738871991634369,
|
14013 |
+
"num_tokens": 25854586.0,
|
14014 |
+
"step": 15560
|
14015 |
+
},
|
14016 |
+
{
|
14017 |
+
"epoch": 0.7430296233550866,
|
14018 |
+
"grad_norm": 0.4192853569984436,
|
14019 |
+
"learning_rate": 1.2570269625387736e-05,
|
14020 |
+
"loss": 0.6971,
|
14021 |
+
"mean_token_accuracy": 0.8595242589712143,
|
14022 |
+
"num_tokens": 25871022.0,
|
14023 |
+
"step": 15570
|
14024 |
+
},
|
14025 |
+
{
|
14026 |
+
"epoch": 0.7435068421241008,
|
14027 |
+
"grad_norm": 0.3494696319103241,
|
14028 |
+
"learning_rate": 1.2565497494631353e-05,
|
14029 |
+
"loss": 0.6859,
|
14030 |
+
"mean_token_accuracy": 0.853353051841259,
|
14031 |
+
"num_tokens": 25888407.0,
|
14032 |
+
"step": 15580
|
14033 |
+
},
|
14034 |
+
{
|
14035 |
+
"epoch": 0.743984060893115,
|
14036 |
+
"grad_norm": 0.3698543906211853,
|
14037 |
+
"learning_rate": 1.2560725363874971e-05,
|
14038 |
+
"loss": 0.7607,
|
14039 |
+
"mean_token_accuracy": 0.8620315045118332,
|
14040 |
+
"num_tokens": 25906131.0,
|
14041 |
+
"step": 15590
|
14042 |
+
},
|
14043 |
+
{
|
14044 |
+
"epoch": 0.7444612796621292,
|
14045 |
+
"grad_norm": 0.3582072854042053,
|
14046 |
+
"learning_rate": 1.2555953233118588e-05,
|
14047 |
+
"loss": 0.5744,
|
14048 |
+
"mean_token_accuracy": 0.8893922328948974,
|
14049 |
+
"num_tokens": 25920582.0,
|
14050 |
+
"step": 15600
|
14051 |
}
|
14052 |
],
|
14053 |
"logging_steps": 10,
|
|
|
14067 |
"attributes": {}
|
14068 |
}
|
14069 |
},
|
14070 |
+
"total_flos": 5.837477802884506e+17,
|
14071 |
"train_batch_size": 2,
|
14072 |
"trial_name": null,
|
14073 |
"trial_params": null
|