Training in progress, step 23400, checkpoint
Browse files
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3541119728
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2e9f8625686c6ce45944c3c7217221b98c12500fd510b180766cf3eb372f4d89
|
3 |
size 3541119728
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 778374186
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac87b5de42a674db5c2561770c65f3648c44d74fcf8a74096cb07a141a2d371d
|
3 |
size 778374186
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ff46be677348ca53ea6c6e13cd256f6d0b6ea2f784c9f0f6c43756f323fd9351
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
-
"epoch": 1.
|
6 |
"eval_steps": 500,
|
7 |
-
"global_step":
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
@@ -20798,6 +20798,276 @@
|
|
20798 |
"mean_token_accuracy": 0.8887743890285492,
|
20799 |
"num_tokens": 38365222.0,
|
20800 |
"step": 23100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20801 |
}
|
20802 |
],
|
20803 |
"logging_steps": 10,
|
@@ -20817,7 +21087,7 @@
|
|
20817 |
"attributes": {}
|
20818 |
}
|
20819 |
},
|
20820 |
-
"total_flos": 8.
|
20821 |
"train_batch_size": 2,
|
20822 |
"trial_name": null,
|
20823 |
"trial_params": null
|
|
|
2 |
"best_global_step": null,
|
3 |
"best_metric": null,
|
4 |
"best_model_checkpoint": null,
|
5 |
+
"epoch": 1.1166799890239683,
|
6 |
"eval_steps": 500,
|
7 |
+
"global_step": 23400,
|
8 |
"is_hyper_param_search": false,
|
9 |
"is_local_process_zero": true,
|
10 |
"is_world_process_zero": true,
|
|
|
20798 |
"mean_token_accuracy": 0.8887743890285492,
|
20799 |
"num_tokens": 38365222.0,
|
20800 |
"step": 23100
|
20801 |
+
},
|
20802 |
+
{
|
20803 |
+
"epoch": 1.102840644722557,
|
20804 |
+
"grad_norm": 0.37605273723602295,
|
20805 |
+
"learning_rate": 8.972083035075162e-06,
|
20806 |
+
"loss": 0.6587,
|
20807 |
+
"mean_token_accuracy": 0.8744294509291649,
|
20808 |
+
"num_tokens": 38381762.0,
|
20809 |
+
"step": 23110
|
20810 |
+
},
|
20811 |
+
{
|
20812 |
+
"epoch": 1.1033178634915712,
|
20813 |
+
"grad_norm": 0.36323273181915283,
|
20814 |
+
"learning_rate": 8.967310904318779e-06,
|
20815 |
+
"loss": 0.5949,
|
20816 |
+
"mean_token_accuracy": 0.883763587474823,
|
20817 |
+
"num_tokens": 38397297.0,
|
20818 |
+
"step": 23120
|
20819 |
+
},
|
20820 |
+
{
|
20821 |
+
"epoch": 1.1037950822605853,
|
20822 |
+
"grad_norm": 0.36709028482437134,
|
20823 |
+
"learning_rate": 8.962538773562395e-06,
|
20824 |
+
"loss": 0.6753,
|
20825 |
+
"mean_token_accuracy": 0.8767507761716843,
|
20826 |
+
"num_tokens": 38413052.0,
|
20827 |
+
"step": 23130
|
20828 |
+
},
|
20829 |
+
{
|
20830 |
+
"epoch": 1.1042723010295994,
|
20831 |
+
"grad_norm": 0.36097216606140137,
|
20832 |
+
"learning_rate": 8.957766642806014e-06,
|
20833 |
+
"loss": 0.56,
|
20834 |
+
"mean_token_accuracy": 0.8822339043021202,
|
20835 |
+
"num_tokens": 38429063.0,
|
20836 |
+
"step": 23140
|
20837 |
+
},
|
20838 |
+
{
|
20839 |
+
"epoch": 1.1047495197986137,
|
20840 |
+
"grad_norm": 0.3489522635936737,
|
20841 |
+
"learning_rate": 8.95299451204963e-06,
|
20842 |
+
"loss": 0.6324,
|
20843 |
+
"mean_token_accuracy": 0.8755771458148957,
|
20844 |
+
"num_tokens": 38446171.0,
|
20845 |
+
"step": 23150
|
20846 |
+
},
|
20847 |
+
{
|
20848 |
+
"epoch": 1.1052267385676278,
|
20849 |
+
"grad_norm": 0.31116101145744324,
|
20850 |
+
"learning_rate": 8.948222381293249e-06,
|
20851 |
+
"loss": 0.7654,
|
20852 |
+
"mean_token_accuracy": 0.8550684407353402,
|
20853 |
+
"num_tokens": 38465354.0,
|
20854 |
+
"step": 23160
|
20855 |
+
},
|
20856 |
+
{
|
20857 |
+
"epoch": 1.105703957336642,
|
20858 |
+
"grad_norm": 0.43474653363227844,
|
20859 |
+
"learning_rate": 8.943450250536865e-06,
|
20860 |
+
"loss": 0.7257,
|
20861 |
+
"mean_token_accuracy": 0.8584754586219787,
|
20862 |
+
"num_tokens": 38483004.0,
|
20863 |
+
"step": 23170
|
20864 |
+
},
|
20865 |
+
{
|
20866 |
+
"epoch": 1.1061811761056561,
|
20867 |
+
"grad_norm": 0.330024391412735,
|
20868 |
+
"learning_rate": 8.938678119780482e-06,
|
20869 |
+
"loss": 0.6111,
|
20870 |
+
"mean_token_accuracy": 0.8786431089043617,
|
20871 |
+
"num_tokens": 38500479.0,
|
20872 |
+
"step": 23180
|
20873 |
+
},
|
20874 |
+
{
|
20875 |
+
"epoch": 1.1066583948746704,
|
20876 |
+
"grad_norm": 0.3605097830295563,
|
20877 |
+
"learning_rate": 8.9339059890241e-06,
|
20878 |
+
"loss": 0.6118,
|
20879 |
+
"mean_token_accuracy": 0.8718374699354172,
|
20880 |
+
"num_tokens": 38516219.0,
|
20881 |
+
"step": 23190
|
20882 |
+
},
|
20883 |
+
{
|
20884 |
+
"epoch": 1.1071356136436845,
|
20885 |
+
"grad_norm": 0.37876948714256287,
|
20886 |
+
"learning_rate": 8.929133858267717e-06,
|
20887 |
+
"loss": 0.5201,
|
20888 |
+
"mean_token_accuracy": 0.8916581928730011,
|
20889 |
+
"num_tokens": 38530884.0,
|
20890 |
+
"step": 23200
|
20891 |
+
},
|
20892 |
+
{
|
20893 |
+
"epoch": 1.1076128324126988,
|
20894 |
+
"grad_norm": 0.41296443343162537,
|
20895 |
+
"learning_rate": 8.924361727511335e-06,
|
20896 |
+
"loss": 0.6266,
|
20897 |
+
"mean_token_accuracy": 0.8670860260725022,
|
20898 |
+
"num_tokens": 38547224.0,
|
20899 |
+
"step": 23210
|
20900 |
+
},
|
20901 |
+
{
|
20902 |
+
"epoch": 1.1080900511817129,
|
20903 |
+
"grad_norm": 0.3719196021556854,
|
20904 |
+
"learning_rate": 8.919589596754952e-06,
|
20905 |
+
"loss": 0.5483,
|
20906 |
+
"mean_token_accuracy": 0.8841874286532402,
|
20907 |
+
"num_tokens": 38563138.0,
|
20908 |
+
"step": 23220
|
20909 |
+
},
|
20910 |
+
{
|
20911 |
+
"epoch": 1.1085672699507272,
|
20912 |
+
"grad_norm": 0.34505724906921387,
|
20913 |
+
"learning_rate": 8.914817465998569e-06,
|
20914 |
+
"loss": 0.5962,
|
20915 |
+
"mean_token_accuracy": 0.8823557212948799,
|
20916 |
+
"num_tokens": 38579159.0,
|
20917 |
+
"step": 23230
|
20918 |
+
},
|
20919 |
+
{
|
20920 |
+
"epoch": 1.1090444887197413,
|
20921 |
+
"grad_norm": 0.41858744621276855,
|
20922 |
+
"learning_rate": 8.910045335242187e-06,
|
20923 |
+
"loss": 0.6115,
|
20924 |
+
"mean_token_accuracy": 0.8636498123407363,
|
20925 |
+
"num_tokens": 38595874.0,
|
20926 |
+
"step": 23240
|
20927 |
+
},
|
20928 |
+
{
|
20929 |
+
"epoch": 1.1095217074887556,
|
20930 |
+
"grad_norm": 0.37642261385917664,
|
20931 |
+
"learning_rate": 8.905273204485804e-06,
|
20932 |
+
"loss": 0.5512,
|
20933 |
+
"mean_token_accuracy": 0.8865195542573929,
|
20934 |
+
"num_tokens": 38613431.0,
|
20935 |
+
"step": 23250
|
20936 |
+
},
|
20937 |
+
{
|
20938 |
+
"epoch": 1.1099989262577696,
|
20939 |
+
"grad_norm": 0.3944489061832428,
|
20940 |
+
"learning_rate": 8.90050107372942e-06,
|
20941 |
+
"loss": 0.5622,
|
20942 |
+
"mean_token_accuracy": 0.8907116547226905,
|
20943 |
+
"num_tokens": 38628954.0,
|
20944 |
+
"step": 23260
|
20945 |
+
},
|
20946 |
+
{
|
20947 |
+
"epoch": 1.110476145026784,
|
20948 |
+
"grad_norm": 0.4231228232383728,
|
20949 |
+
"learning_rate": 8.895728942973039e-06,
|
20950 |
+
"loss": 0.5735,
|
20951 |
+
"mean_token_accuracy": 0.883218166232109,
|
20952 |
+
"num_tokens": 38644926.0,
|
20953 |
+
"step": 23270
|
20954 |
+
},
|
20955 |
+
{
|
20956 |
+
"epoch": 1.110953363795798,
|
20957 |
+
"grad_norm": 0.4161708652973175,
|
20958 |
+
"learning_rate": 8.890956812216655e-06,
|
20959 |
+
"loss": 0.6235,
|
20960 |
+
"mean_token_accuracy": 0.8809412658214569,
|
20961 |
+
"num_tokens": 38661566.0,
|
20962 |
+
"step": 23280
|
20963 |
+
},
|
20964 |
+
{
|
20965 |
+
"epoch": 1.1114305825648123,
|
20966 |
+
"grad_norm": 0.37831827998161316,
|
20967 |
+
"learning_rate": 8.886184681460272e-06,
|
20968 |
+
"loss": 0.6968,
|
20969 |
+
"mean_token_accuracy": 0.8654580265283585,
|
20970 |
+
"num_tokens": 38679443.0,
|
20971 |
+
"step": 23290
|
20972 |
+
},
|
20973 |
+
{
|
20974 |
+
"epoch": 1.1119078013338264,
|
20975 |
+
"grad_norm": 0.42068058252334595,
|
20976 |
+
"learning_rate": 8.881412550703889e-06,
|
20977 |
+
"loss": 0.5489,
|
20978 |
+
"mean_token_accuracy": 0.8919402092695237,
|
20979 |
+
"num_tokens": 38696241.0,
|
20980 |
+
"step": 23300
|
20981 |
+
},
|
20982 |
+
{
|
20983 |
+
"epoch": 1.1123850201028407,
|
20984 |
+
"grad_norm": 0.4387883245944977,
|
20985 |
+
"learning_rate": 8.876640419947507e-06,
|
20986 |
+
"loss": 0.5565,
|
20987 |
+
"mean_token_accuracy": 0.8957258448004722,
|
20988 |
+
"num_tokens": 38712179.0,
|
20989 |
+
"step": 23310
|
20990 |
+
},
|
20991 |
+
{
|
20992 |
+
"epoch": 1.1128622388718548,
|
20993 |
+
"grad_norm": 0.43475064635276794,
|
20994 |
+
"learning_rate": 8.871868289191124e-06,
|
20995 |
+
"loss": 0.5857,
|
20996 |
+
"mean_token_accuracy": 0.8815463319420814,
|
20997 |
+
"num_tokens": 38728828.0,
|
20998 |
+
"step": 23320
|
20999 |
+
},
|
21000 |
+
{
|
21001 |
+
"epoch": 1.113339457640869,
|
21002 |
+
"grad_norm": 0.3661762773990631,
|
21003 |
+
"learning_rate": 8.867096158434742e-06,
|
21004 |
+
"loss": 0.611,
|
21005 |
+
"mean_token_accuracy": 0.8724078252911568,
|
21006 |
+
"num_tokens": 38746266.0,
|
21007 |
+
"step": 23330
|
21008 |
+
},
|
21009 |
+
{
|
21010 |
+
"epoch": 1.1138166764098831,
|
21011 |
+
"grad_norm": 0.38337260484695435,
|
21012 |
+
"learning_rate": 8.862324027678359e-06,
|
21013 |
+
"loss": 0.6629,
|
21014 |
+
"mean_token_accuracy": 0.8663832738995552,
|
21015 |
+
"num_tokens": 38764971.0,
|
21016 |
+
"step": 23340
|
21017 |
+
},
|
21018 |
+
{
|
21019 |
+
"epoch": 1.1142938951788974,
|
21020 |
+
"grad_norm": 0.41741085052490234,
|
21021 |
+
"learning_rate": 8.857551896921977e-06,
|
21022 |
+
"loss": 0.6566,
|
21023 |
+
"mean_token_accuracy": 0.8759171679615975,
|
21024 |
+
"num_tokens": 38782014.0,
|
21025 |
+
"step": 23350
|
21026 |
+
},
|
21027 |
+
{
|
21028 |
+
"epoch": 1.1147711139479115,
|
21029 |
+
"grad_norm": 0.373674601316452,
|
21030 |
+
"learning_rate": 8.852779766165594e-06,
|
21031 |
+
"loss": 0.6178,
|
21032 |
+
"mean_token_accuracy": 0.8715434208512306,
|
21033 |
+
"num_tokens": 38796852.0,
|
21034 |
+
"step": 23360
|
21035 |
+
},
|
21036 |
+
{
|
21037 |
+
"epoch": 1.1152483327169258,
|
21038 |
+
"grad_norm": 0.364255428314209,
|
21039 |
+
"learning_rate": 8.848007635409212e-06,
|
21040 |
+
"loss": 0.6312,
|
21041 |
+
"mean_token_accuracy": 0.8757534250617027,
|
21042 |
+
"num_tokens": 38813756.0,
|
21043 |
+
"step": 23370
|
21044 |
+
},
|
21045 |
+
{
|
21046 |
+
"epoch": 1.1157255514859399,
|
21047 |
+
"grad_norm": 0.4289257526397705,
|
21048 |
+
"learning_rate": 8.843235504652829e-06,
|
21049 |
+
"loss": 0.5704,
|
21050 |
+
"mean_token_accuracy": 0.8797232627868652,
|
21051 |
+
"num_tokens": 38830297.0,
|
21052 |
+
"step": 23380
|
21053 |
+
},
|
21054 |
+
{
|
21055 |
+
"epoch": 1.1162027702549542,
|
21056 |
+
"grad_norm": 0.3604724407196045,
|
21057 |
+
"learning_rate": 8.838463373896446e-06,
|
21058 |
+
"loss": 0.6343,
|
21059 |
+
"mean_token_accuracy": 0.8824995398521424,
|
21060 |
+
"num_tokens": 38846122.0,
|
21061 |
+
"step": 23390
|
21062 |
+
},
|
21063 |
+
{
|
21064 |
+
"epoch": 1.1166799890239683,
|
21065 |
+
"grad_norm": 0.4442988336086273,
|
21066 |
+
"learning_rate": 8.833691243140062e-06,
|
21067 |
+
"loss": 0.6179,
|
21068 |
+
"mean_token_accuracy": 0.8732976973056793,
|
21069 |
+
"num_tokens": 38862694.0,
|
21070 |
+
"step": 23400
|
21071 |
}
|
21072 |
],
|
21073 |
"logging_steps": 10,
|
|
|
21087 |
"attributes": {}
|
21088 |
}
|
21089 |
},
|
21090 |
+
"total_flos": 8.752667987219743e+17,
|
21091 |
"train_batch_size": 2,
|
21092 |
"trial_name": null,
|
21093 |
"trial_params": null
|