Stuti103 committed on
Commit
87cb183
·
verified ·
1 Parent(s): 80ece63

Training in progress, step 23400, checkpoint

Browse files
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9686c12fcd5e45c28b4b28976fa75f8def31281ef98c64003bfbb5e3fa400952
3
  size 3541119728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e9f8625686c6ce45944c3c7217221b98c12500fd510b180766cf3eb372f4d89
3
  size 3541119728
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2273236a215f1def6ae0d5527d0137d3d1c96315b946a27b75ad2e2e2c59c12e
3
  size 778374186
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac87b5de42a674db5c2561770c65f3648c44d74fcf8a74096cb07a141a2d371d
3
  size 778374186
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af919d8c4330d21092d0ab400e160dd9669e9eba0e00f29bd30c8ffd5b00b8cf
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff46be677348ca53ea6c6e13cd256f6d0b6ea2f784c9f0f6c43756f323fd9351
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1023634259535426,
6
  "eval_steps": 500,
7
- "global_step": 23100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20798,6 +20798,276 @@
20798
  "mean_token_accuracy": 0.8887743890285492,
20799
  "num_tokens": 38365222.0,
20800
  "step": 23100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20801
  }
20802
  ],
20803
  "logging_steps": 10,
@@ -20817,7 +21087,7 @@
20817
  "attributes": {}
20818
  }
20819
  },
20820
- "total_flos": 8.640620345734717e+17,
20821
  "train_batch_size": 2,
20822
  "trial_name": null,
20823
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1166799890239683,
6
  "eval_steps": 500,
7
+ "global_step": 23400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20798
  "mean_token_accuracy": 0.8887743890285492,
20799
  "num_tokens": 38365222.0,
20800
  "step": 23100
20801
+ },
20802
+ {
20803
+ "epoch": 1.102840644722557,
20804
+ "grad_norm": 0.37605273723602295,
20805
+ "learning_rate": 8.972083035075162e-06,
20806
+ "loss": 0.6587,
20807
+ "mean_token_accuracy": 0.8744294509291649,
20808
+ "num_tokens": 38381762.0,
20809
+ "step": 23110
20810
+ },
20811
+ {
20812
+ "epoch": 1.1033178634915712,
20813
+ "grad_norm": 0.36323273181915283,
20814
+ "learning_rate": 8.967310904318779e-06,
20815
+ "loss": 0.5949,
20816
+ "mean_token_accuracy": 0.883763587474823,
20817
+ "num_tokens": 38397297.0,
20818
+ "step": 23120
20819
+ },
20820
+ {
20821
+ "epoch": 1.1037950822605853,
20822
+ "grad_norm": 0.36709028482437134,
20823
+ "learning_rate": 8.962538773562395e-06,
20824
+ "loss": 0.6753,
20825
+ "mean_token_accuracy": 0.8767507761716843,
20826
+ "num_tokens": 38413052.0,
20827
+ "step": 23130
20828
+ },
20829
+ {
20830
+ "epoch": 1.1042723010295994,
20831
+ "grad_norm": 0.36097216606140137,
20832
+ "learning_rate": 8.957766642806014e-06,
20833
+ "loss": 0.56,
20834
+ "mean_token_accuracy": 0.8822339043021202,
20835
+ "num_tokens": 38429063.0,
20836
+ "step": 23140
20837
+ },
20838
+ {
20839
+ "epoch": 1.1047495197986137,
20840
+ "grad_norm": 0.3489522635936737,
20841
+ "learning_rate": 8.95299451204963e-06,
20842
+ "loss": 0.6324,
20843
+ "mean_token_accuracy": 0.8755771458148957,
20844
+ "num_tokens": 38446171.0,
20845
+ "step": 23150
20846
+ },
20847
+ {
20848
+ "epoch": 1.1052267385676278,
20849
+ "grad_norm": 0.31116101145744324,
20850
+ "learning_rate": 8.948222381293249e-06,
20851
+ "loss": 0.7654,
20852
+ "mean_token_accuracy": 0.8550684407353402,
20853
+ "num_tokens": 38465354.0,
20854
+ "step": 23160
20855
+ },
20856
+ {
20857
+ "epoch": 1.105703957336642,
20858
+ "grad_norm": 0.43474653363227844,
20859
+ "learning_rate": 8.943450250536865e-06,
20860
+ "loss": 0.7257,
20861
+ "mean_token_accuracy": 0.8584754586219787,
20862
+ "num_tokens": 38483004.0,
20863
+ "step": 23170
20864
+ },
20865
+ {
20866
+ "epoch": 1.1061811761056561,
20867
+ "grad_norm": 0.330024391412735,
20868
+ "learning_rate": 8.938678119780482e-06,
20869
+ "loss": 0.6111,
20870
+ "mean_token_accuracy": 0.8786431089043617,
20871
+ "num_tokens": 38500479.0,
20872
+ "step": 23180
20873
+ },
20874
+ {
20875
+ "epoch": 1.1066583948746704,
20876
+ "grad_norm": 0.3605097830295563,
20877
+ "learning_rate": 8.9339059890241e-06,
20878
+ "loss": 0.6118,
20879
+ "mean_token_accuracy": 0.8718374699354172,
20880
+ "num_tokens": 38516219.0,
20881
+ "step": 23190
20882
+ },
20883
+ {
20884
+ "epoch": 1.1071356136436845,
20885
+ "grad_norm": 0.37876948714256287,
20886
+ "learning_rate": 8.929133858267717e-06,
20887
+ "loss": 0.5201,
20888
+ "mean_token_accuracy": 0.8916581928730011,
20889
+ "num_tokens": 38530884.0,
20890
+ "step": 23200
20891
+ },
20892
+ {
20893
+ "epoch": 1.1076128324126988,
20894
+ "grad_norm": 0.41296443343162537,
20895
+ "learning_rate": 8.924361727511335e-06,
20896
+ "loss": 0.6266,
20897
+ "mean_token_accuracy": 0.8670860260725022,
20898
+ "num_tokens": 38547224.0,
20899
+ "step": 23210
20900
+ },
20901
+ {
20902
+ "epoch": 1.1080900511817129,
20903
+ "grad_norm": 0.3719196021556854,
20904
+ "learning_rate": 8.919589596754952e-06,
20905
+ "loss": 0.5483,
20906
+ "mean_token_accuracy": 0.8841874286532402,
20907
+ "num_tokens": 38563138.0,
20908
+ "step": 23220
20909
+ },
20910
+ {
20911
+ "epoch": 1.1085672699507272,
20912
+ "grad_norm": 0.34505724906921387,
20913
+ "learning_rate": 8.914817465998569e-06,
20914
+ "loss": 0.5962,
20915
+ "mean_token_accuracy": 0.8823557212948799,
20916
+ "num_tokens": 38579159.0,
20917
+ "step": 23230
20918
+ },
20919
+ {
20920
+ "epoch": 1.1090444887197413,
20921
+ "grad_norm": 0.41858744621276855,
20922
+ "learning_rate": 8.910045335242187e-06,
20923
+ "loss": 0.6115,
20924
+ "mean_token_accuracy": 0.8636498123407363,
20925
+ "num_tokens": 38595874.0,
20926
+ "step": 23240
20927
+ },
20928
+ {
20929
+ "epoch": 1.1095217074887556,
20930
+ "grad_norm": 0.37642261385917664,
20931
+ "learning_rate": 8.905273204485804e-06,
20932
+ "loss": 0.5512,
20933
+ "mean_token_accuracy": 0.8865195542573929,
20934
+ "num_tokens": 38613431.0,
20935
+ "step": 23250
20936
+ },
20937
+ {
20938
+ "epoch": 1.1099989262577696,
20939
+ "grad_norm": 0.3944489061832428,
20940
+ "learning_rate": 8.90050107372942e-06,
20941
+ "loss": 0.5622,
20942
+ "mean_token_accuracy": 0.8907116547226905,
20943
+ "num_tokens": 38628954.0,
20944
+ "step": 23260
20945
+ },
20946
+ {
20947
+ "epoch": 1.110476145026784,
20948
+ "grad_norm": 0.4231228232383728,
20949
+ "learning_rate": 8.895728942973039e-06,
20950
+ "loss": 0.5735,
20951
+ "mean_token_accuracy": 0.883218166232109,
20952
+ "num_tokens": 38644926.0,
20953
+ "step": 23270
20954
+ },
20955
+ {
20956
+ "epoch": 1.110953363795798,
20957
+ "grad_norm": 0.4161708652973175,
20958
+ "learning_rate": 8.890956812216655e-06,
20959
+ "loss": 0.6235,
20960
+ "mean_token_accuracy": 0.8809412658214569,
20961
+ "num_tokens": 38661566.0,
20962
+ "step": 23280
20963
+ },
20964
+ {
20965
+ "epoch": 1.1114305825648123,
20966
+ "grad_norm": 0.37831827998161316,
20967
+ "learning_rate": 8.886184681460272e-06,
20968
+ "loss": 0.6968,
20969
+ "mean_token_accuracy": 0.8654580265283585,
20970
+ "num_tokens": 38679443.0,
20971
+ "step": 23290
20972
+ },
20973
+ {
20974
+ "epoch": 1.1119078013338264,
20975
+ "grad_norm": 0.42068058252334595,
20976
+ "learning_rate": 8.881412550703889e-06,
20977
+ "loss": 0.5489,
20978
+ "mean_token_accuracy": 0.8919402092695237,
20979
+ "num_tokens": 38696241.0,
20980
+ "step": 23300
20981
+ },
20982
+ {
20983
+ "epoch": 1.1123850201028407,
20984
+ "grad_norm": 0.4387883245944977,
20985
+ "learning_rate": 8.876640419947507e-06,
20986
+ "loss": 0.5565,
20987
+ "mean_token_accuracy": 0.8957258448004722,
20988
+ "num_tokens": 38712179.0,
20989
+ "step": 23310
20990
+ },
20991
+ {
20992
+ "epoch": 1.1128622388718548,
20993
+ "grad_norm": 0.43475064635276794,
20994
+ "learning_rate": 8.871868289191124e-06,
20995
+ "loss": 0.5857,
20996
+ "mean_token_accuracy": 0.8815463319420814,
20997
+ "num_tokens": 38728828.0,
20998
+ "step": 23320
20999
+ },
21000
+ {
21001
+ "epoch": 1.113339457640869,
21002
+ "grad_norm": 0.3661762773990631,
21003
+ "learning_rate": 8.867096158434742e-06,
21004
+ "loss": 0.611,
21005
+ "mean_token_accuracy": 0.8724078252911568,
21006
+ "num_tokens": 38746266.0,
21007
+ "step": 23330
21008
+ },
21009
+ {
21010
+ "epoch": 1.1138166764098831,
21011
+ "grad_norm": 0.38337260484695435,
21012
+ "learning_rate": 8.862324027678359e-06,
21013
+ "loss": 0.6629,
21014
+ "mean_token_accuracy": 0.8663832738995552,
21015
+ "num_tokens": 38764971.0,
21016
+ "step": 23340
21017
+ },
21018
+ {
21019
+ "epoch": 1.1142938951788974,
21020
+ "grad_norm": 0.41741085052490234,
21021
+ "learning_rate": 8.857551896921977e-06,
21022
+ "loss": 0.6566,
21023
+ "mean_token_accuracy": 0.8759171679615975,
21024
+ "num_tokens": 38782014.0,
21025
+ "step": 23350
21026
+ },
21027
+ {
21028
+ "epoch": 1.1147711139479115,
21029
+ "grad_norm": 0.373674601316452,
21030
+ "learning_rate": 8.852779766165594e-06,
21031
+ "loss": 0.6178,
21032
+ "mean_token_accuracy": 0.8715434208512306,
21033
+ "num_tokens": 38796852.0,
21034
+ "step": 23360
21035
+ },
21036
+ {
21037
+ "epoch": 1.1152483327169258,
21038
+ "grad_norm": 0.364255428314209,
21039
+ "learning_rate": 8.848007635409212e-06,
21040
+ "loss": 0.6312,
21041
+ "mean_token_accuracy": 0.8757534250617027,
21042
+ "num_tokens": 38813756.0,
21043
+ "step": 23370
21044
+ },
21045
+ {
21046
+ "epoch": 1.1157255514859399,
21047
+ "grad_norm": 0.4289257526397705,
21048
+ "learning_rate": 8.843235504652829e-06,
21049
+ "loss": 0.5704,
21050
+ "mean_token_accuracy": 0.8797232627868652,
21051
+ "num_tokens": 38830297.0,
21052
+ "step": 23380
21053
+ },
21054
+ {
21055
+ "epoch": 1.1162027702549542,
21056
+ "grad_norm": 0.3604724407196045,
21057
+ "learning_rate": 8.838463373896446e-06,
21058
+ "loss": 0.6343,
21059
+ "mean_token_accuracy": 0.8824995398521424,
21060
+ "num_tokens": 38846122.0,
21061
+ "step": 23390
21062
+ },
21063
+ {
21064
+ "epoch": 1.1166799890239683,
21065
+ "grad_norm": 0.4442988336086273,
21066
+ "learning_rate": 8.833691243140062e-06,
21067
+ "loss": 0.6179,
21068
+ "mean_token_accuracy": 0.8732976973056793,
21069
+ "num_tokens": 38862694.0,
21070
+ "step": 23400
21071
  }
21072
  ],
21073
  "logging_steps": 10,
 
21087
  "attributes": {}
21088
  }
21089
  },
21090
+ "total_flos": 8.752667987219743e+17,
21091
  "train_batch_size": 2,
21092
  "trial_name": null,
21093
  "trial_params": null