arcwarden46 commited on
Commit
e8f287c
·
verified ·
1 Parent(s): f5adf73

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2b891c33fc429c0cb28a6877c37fee7b9ad88339efdbefdc17f8648514cafb8
3
  size 289452128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36670a5cb07570b526cc68b5b9d0ebaa892ac926af726fbdbe84eaef96121c65
3
  size 289452128
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04f5aface321fef6ac3514607279d714a80837692202b1e5c7688000c93315d8
3
  size 147359892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5c90e3905bbef6d07cf5a823555596fd9e072abb4be8bdc0ec4434b20e757e
3
  size 147359892
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1213594a86bdd0e226c0613d392942ef0951aab7e498f1cfd891e9a0d4d395d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:630e2711af6547215ca84e215db44507617e11099bbc440a92e4d380f49e9a73
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7e9f0c2a27af03f3c1874438820d046de94b36aaec3b0cc778f96def4616314
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:530505d607699f384741067a5f9139d72f043713adb680898a3f1b5714170c97
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7203736305236816,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-100",
4
- "epoch": 0.019923295313044778,
5
  "eval_steps": 100,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -723,6 +723,714 @@
723
  "eval_samples_per_second": 31.902,
724
  "eval_steps_per_second": 7.977,
725
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  }
727
  ],
728
  "logging_steps": 1,
@@ -751,7 +1459,7 @@
751
  "attributes": {}
752
  }
753
  },
754
- "total_flos": 3.325585305423053e+16,
755
  "train_batch_size": 8,
756
  "trial_name": null,
757
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7008334398269653,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
+ "epoch": 0.039846590626089556,
5
  "eval_steps": 100,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
723
  "eval_samples_per_second": 31.902,
724
  "eval_steps_per_second": 7.977,
725
  "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.020122528266175225,
729
+ "grad_norm": 0.21482613682746887,
730
+ "learning_rate": 9.31367192988896e-05,
731
+ "loss": 0.5356,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.02032176121930567,
736
+ "grad_norm": 0.28820475935935974,
737
+ "learning_rate": 9.297032057507264e-05,
738
+ "loss": 0.6606,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.02052099417243612,
743
+ "grad_norm": 0.22919978201389313,
744
+ "learning_rate": 9.280208114573859e-05,
745
+ "loss": 0.6681,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.02072022712556657,
750
+ "grad_norm": 0.20591995120048523,
751
+ "learning_rate": 9.263200821770461e-05,
752
+ "loss": 0.6523,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.020919460078697015,
757
+ "grad_norm": 0.20084571838378906,
758
+ "learning_rate": 9.246010907632895e-05,
759
+ "loss": 0.629,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.021118693031827465,
764
+ "grad_norm": 0.25101473927497864,
765
+ "learning_rate": 9.228639108519868e-05,
766
+ "loss": 0.7384,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.021317925984957912,
771
+ "grad_norm": 0.23734250664710999,
772
+ "learning_rate": 9.211086168581433e-05,
773
+ "loss": 0.6895,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.02151715893808836,
778
+ "grad_norm": 0.22536040842533112,
779
+ "learning_rate": 9.193352839727121e-05,
780
+ "loss": 0.7137,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.02171639189121881,
785
+ "grad_norm": 0.2355283945798874,
786
+ "learning_rate": 9.175439881593716e-05,
787
+ "loss": 0.7662,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.021915624844349255,
792
+ "grad_norm": 0.2228918820619583,
793
+ "learning_rate": 9.157348061512727e-05,
794
+ "loss": 0.6534,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.022114857797479702,
799
+ "grad_norm": 0.2184191644191742,
800
+ "learning_rate": 9.139078154477512e-05,
801
+ "loss": 0.7054,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.022314090750610152,
806
+ "grad_norm": 0.21408338844776154,
807
+ "learning_rate": 9.120630943110077e-05,
808
+ "loss": 0.6103,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.0225133237037406,
813
+ "grad_norm": 0.25427061319351196,
814
+ "learning_rate": 9.102007217627568e-05,
815
+ "loss": 0.8366,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.022712556656871045,
820
+ "grad_norm": 0.21583066880702972,
821
+ "learning_rate": 9.083207775808396e-05,
822
+ "loss": 0.7165,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.022911789610001496,
827
+ "grad_norm": 0.2396174967288971,
828
+ "learning_rate": 9.064233422958077e-05,
829
+ "loss": 0.6794,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.023111022563131942,
834
+ "grad_norm": 0.2334282398223877,
835
+ "learning_rate": 9.045084971874738e-05,
836
+ "loss": 0.7945,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.02331025551626239,
841
+ "grad_norm": 0.20607277750968933,
842
+ "learning_rate": 9.025763242814291e-05,
843
+ "loss": 0.5739,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.02350948846939284,
848
+ "grad_norm": 0.2125677615404129,
849
+ "learning_rate": 9.006269063455304e-05,
850
+ "loss": 0.6976,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.023708721422523286,
855
+ "grad_norm": 0.24233980476856232,
856
+ "learning_rate": 8.986603268863536e-05,
857
+ "loss": 0.7768,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.023907954375653732,
862
+ "grad_norm": 0.23903363943099976,
863
+ "learning_rate": 8.966766701456177e-05,
864
+ "loss": 0.7925,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.024107187328784183,
869
+ "grad_norm": 0.2270466685295105,
870
+ "learning_rate": 8.94676021096575e-05,
871
+ "loss": 0.679,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.02430642028191463,
876
+ "grad_norm": 0.22134599089622498,
877
+ "learning_rate": 8.926584654403724e-05,
878
+ "loss": 0.7155,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.024505653235045076,
883
+ "grad_norm": 0.2539938986301422,
884
+ "learning_rate": 8.906240896023794e-05,
885
+ "loss": 0.7156,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.024704886188175523,
890
+ "grad_norm": 0.24332107603549957,
891
+ "learning_rate": 8.885729807284856e-05,
892
+ "loss": 0.6988,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.024904119141305973,
897
+ "grad_norm": 0.23084275424480438,
898
+ "learning_rate": 8.865052266813685e-05,
899
+ "loss": 0.7142,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.02510335209443642,
904
+ "grad_norm": 0.24665014445781708,
905
+ "learning_rate": 8.844209160367299e-05,
906
+ "loss": 0.7471,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.025302585047566866,
911
+ "grad_norm": 0.23389537632465363,
912
+ "learning_rate": 8.823201380795001e-05,
913
+ "loss": 0.676,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.025501818000697316,
918
+ "grad_norm": 0.2225302904844284,
919
+ "learning_rate": 8.802029828000156e-05,
920
+ "loss": 0.6702,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.025701050953827763,
925
+ "grad_norm": 0.2194124162197113,
926
+ "learning_rate": 8.780695408901613e-05,
927
+ "loss": 0.7173,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.02590028390695821,
932
+ "grad_norm": 0.22123487293720245,
933
+ "learning_rate": 8.759199037394887e-05,
934
+ "loss": 0.679,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.02609951686008866,
939
+ "grad_norm": 0.2465553730726242,
940
+ "learning_rate": 8.737541634312985e-05,
941
+ "loss": 0.7151,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.026298749813219106,
946
+ "grad_norm": 0.2150459736585617,
947
+ "learning_rate": 8.715724127386972e-05,
948
+ "loss": 0.7106,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.026497982766349553,
953
+ "grad_norm": 0.1966014802455902,
954
+ "learning_rate": 8.693747451206232e-05,
955
+ "loss": 0.7174,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.026697215719480003,
960
+ "grad_norm": 0.25692620873451233,
961
+ "learning_rate": 8.671612547178428e-05,
962
+ "loss": 0.6709,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.02689644867261045,
967
+ "grad_norm": 0.19534507393836975,
968
+ "learning_rate": 8.649320363489179e-05,
969
+ "loss": 0.5871,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.027095681625740897,
974
+ "grad_norm": 0.25263822078704834,
975
+ "learning_rate": 8.626871855061438e-05,
976
+ "loss": 0.7815,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.027294914578871347,
981
+ "grad_norm": 0.24761663377285004,
982
+ "learning_rate": 8.604267983514594e-05,
983
+ "loss": 0.7253,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.027494147532001793,
988
+ "grad_norm": 0.2336164116859436,
989
+ "learning_rate": 8.581509717123273e-05,
990
+ "loss": 0.6871,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.02769338048513224,
995
+ "grad_norm": 0.2253238558769226,
996
+ "learning_rate": 8.558598030775857e-05,
997
+ "loss": 0.6806,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.02789261343826269,
1002
+ "grad_norm": 0.2473248988389969,
1003
+ "learning_rate": 8.535533905932738e-05,
1004
+ "loss": 0.8147,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.028091846391393137,
1009
+ "grad_norm": 0.2515900731086731,
1010
+ "learning_rate": 8.51231833058426e-05,
1011
+ "loss": 0.8055,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.028291079344523583,
1016
+ "grad_norm": 0.2372109442949295,
1017
+ "learning_rate": 8.488952299208401e-05,
1018
+ "loss": 0.6404,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.028490312297654034,
1023
+ "grad_norm": 0.24354617297649384,
1024
+ "learning_rate": 8.46543681272818e-05,
1025
+ "loss": 0.7355,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.02868954525078448,
1030
+ "grad_norm": 0.2426522821187973,
1031
+ "learning_rate": 8.44177287846877e-05,
1032
+ "loss": 0.7561,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.028888778203914927,
1037
+ "grad_norm": 0.24764999747276306,
1038
+ "learning_rate": 8.417961510114356e-05,
1039
+ "loss": 0.7203,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.029088011157045374,
1044
+ "grad_norm": 0.2602037191390991,
1045
+ "learning_rate": 8.39400372766471e-05,
1046
+ "loss": 0.7111,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.029287244110175824,
1051
+ "grad_norm": 0.25291872024536133,
1052
+ "learning_rate": 8.36990055739149e-05,
1053
+ "loss": 0.7011,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.02948647706330627,
1058
+ "grad_norm": 0.2726818323135376,
1059
+ "learning_rate": 8.345653031794292e-05,
1060
+ "loss": 0.8132,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.029685710016436717,
1065
+ "grad_norm": 0.2830042839050293,
1066
+ "learning_rate": 8.321262189556409e-05,
1067
+ "loss": 0.7577,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.029884942969567167,
1072
+ "grad_norm": 0.2831919193267822,
1073
+ "learning_rate": 8.296729075500344e-05,
1074
+ "loss": 0.7566,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.030084175922697614,
1079
+ "grad_norm": 0.33581486344337463,
1080
+ "learning_rate": 8.272054740543052e-05,
1081
+ "loss": 0.7664,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.03028340887582806,
1086
+ "grad_norm": 0.3191780149936676,
1087
+ "learning_rate": 8.247240241650918e-05,
1088
+ "loss": 0.6583,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.03048264182895851,
1093
+ "grad_norm": 0.22133736312389374,
1094
+ "learning_rate": 8.222286641794488e-05,
1095
+ "loss": 0.6328,
1096
+ "step": 153
1097
+ },
1098
+ {
1099
+ "epoch": 0.030681874782088957,
1100
+ "grad_norm": 0.21831941604614258,
1101
+ "learning_rate": 8.197195009902924e-05,
1102
+ "loss": 0.698,
1103
+ "step": 154
1104
+ },
1105
+ {
1106
+ "epoch": 0.030881107735219404,
1107
+ "grad_norm": 0.1761123090982437,
1108
+ "learning_rate": 8.171966420818228e-05,
1109
+ "loss": 0.5441,
1110
+ "step": 155
1111
+ },
1112
+ {
1113
+ "epoch": 0.031080340688349854,
1114
+ "grad_norm": 0.20375514030456543,
1115
+ "learning_rate": 8.146601955249188e-05,
1116
+ "loss": 0.6309,
1117
+ "step": 156
1118
+ },
1119
+ {
1120
+ "epoch": 0.0312795736414803,
1121
+ "grad_norm": 0.2410995066165924,
1122
+ "learning_rate": 8.121102699725089e-05,
1123
+ "loss": 0.6337,
1124
+ "step": 157
1125
+ },
1126
+ {
1127
+ "epoch": 0.03147880659461075,
1128
+ "grad_norm": 0.2229624092578888,
1129
+ "learning_rate": 8.095469746549172e-05,
1130
+ "loss": 0.7014,
1131
+ "step": 158
1132
+ },
1133
+ {
1134
+ "epoch": 0.0316780395477412,
1135
+ "grad_norm": 0.23791897296905518,
1136
+ "learning_rate": 8.069704193751832e-05,
1137
+ "loss": 0.6582,
1138
+ "step": 159
1139
+ },
1140
+ {
1141
+ "epoch": 0.03187727250087164,
1142
+ "grad_norm": 0.22524884343147278,
1143
+ "learning_rate": 8.043807145043604e-05,
1144
+ "loss": 0.7178,
1145
+ "step": 160
1146
+ },
1147
+ {
1148
+ "epoch": 0.03207650545400209,
1149
+ "grad_norm": 0.21897757053375244,
1150
+ "learning_rate": 8.017779709767858e-05,
1151
+ "loss": 0.7032,
1152
+ "step": 161
1153
+ },
1154
+ {
1155
+ "epoch": 0.03227573840713254,
1156
+ "grad_norm": 0.19019927084445953,
1157
+ "learning_rate": 7.991623002853296e-05,
1158
+ "loss": 0.5842,
1159
+ "step": 162
1160
+ },
1161
+ {
1162
+ "epoch": 0.032474971360262984,
1163
+ "grad_norm": 0.21591834723949432,
1164
+ "learning_rate": 7.965338144766186e-05,
1165
+ "loss": 0.7243,
1166
+ "step": 163
1167
+ },
1168
+ {
1169
+ "epoch": 0.032674204313393435,
1170
+ "grad_norm": 0.20649899542331696,
1171
+ "learning_rate": 7.938926261462366e-05,
1172
+ "loss": 0.6814,
1173
+ "step": 164
1174
+ },
1175
+ {
1176
+ "epoch": 0.032873437266523885,
1177
+ "grad_norm": 0.24540702998638153,
1178
+ "learning_rate": 7.912388484339012e-05,
1179
+ "loss": 0.7356,
1180
+ "step": 165
1181
+ },
1182
+ {
1183
+ "epoch": 0.03307267021965433,
1184
+ "grad_norm": 0.22577622532844543,
1185
+ "learning_rate": 7.88572595018617e-05,
1186
+ "loss": 0.6468,
1187
+ "step": 166
1188
+ },
1189
+ {
1190
+ "epoch": 0.03327190317278478,
1191
+ "grad_norm": 0.2168670892715454,
1192
+ "learning_rate": 7.858939801138061e-05,
1193
+ "loss": 0.6448,
1194
+ "step": 167
1195
+ },
1196
+ {
1197
+ "epoch": 0.03347113612591523,
1198
+ "grad_norm": 0.22892935574054718,
1199
+ "learning_rate": 7.832031184624164e-05,
1200
+ "loss": 0.6375,
1201
+ "step": 168
1202
+ },
1203
+ {
1204
+ "epoch": 0.03367036907904567,
1205
+ "grad_norm": 0.2086174190044403,
1206
+ "learning_rate": 7.80500125332005e-05,
1207
+ "loss": 0.6993,
1208
+ "step": 169
1209
+ },
1210
+ {
1211
+ "epoch": 0.03386960203217612,
1212
+ "grad_norm": 0.19050797820091248,
1213
+ "learning_rate": 7.777851165098012e-05,
1214
+ "loss": 0.5909,
1215
+ "step": 170
1216
+ },
1217
+ {
1218
+ "epoch": 0.03406883498530657,
1219
+ "grad_norm": 0.22635716199874878,
1220
+ "learning_rate": 7.750582082977467e-05,
1221
+ "loss": 0.6799,
1222
+ "step": 171
1223
+ },
1224
+ {
1225
+ "epoch": 0.034268067938437015,
1226
+ "grad_norm": 0.2369690090417862,
1227
+ "learning_rate": 7.723195175075136e-05,
1228
+ "loss": 0.6823,
1229
+ "step": 172
1230
+ },
1231
+ {
1232
+ "epoch": 0.034467300891567465,
1233
+ "grad_norm": 0.2258961796760559,
1234
+ "learning_rate": 7.695691614555003e-05,
1235
+ "loss": 0.6768,
1236
+ "step": 173
1237
+ },
1238
+ {
1239
+ "epoch": 0.034666533844697915,
1240
+ "grad_norm": 0.2175053060054779,
1241
+ "learning_rate": 7.668072579578058e-05,
1242
+ "loss": 0.6501,
1243
+ "step": 174
1244
+ },
1245
+ {
1246
+ "epoch": 0.03486576679782836,
1247
+ "grad_norm": 0.23859256505966187,
1248
+ "learning_rate": 7.64033925325184e-05,
1249
+ "loss": 0.7198,
1250
+ "step": 175
1251
+ },
1252
+ {
1253
+ "epoch": 0.03506499975095881,
1254
+ "grad_norm": 0.2090621143579483,
1255
+ "learning_rate": 7.612492823579745e-05,
1256
+ "loss": 0.6075,
1257
+ "step": 176
1258
+ },
1259
+ {
1260
+ "epoch": 0.03526423270408926,
1261
+ "grad_norm": 0.23009976744651794,
1262
+ "learning_rate": 7.584534483410137e-05,
1263
+ "loss": 0.6993,
1264
+ "step": 177
1265
+ },
1266
+ {
1267
+ "epoch": 0.0354634656572197,
1268
+ "grad_norm": 0.19365736842155457,
1269
+ "learning_rate": 7.55646543038526e-05,
1270
+ "loss": 0.61,
1271
+ "step": 178
1272
+ },
1273
+ {
1274
+ "epoch": 0.03566269861035015,
1275
+ "grad_norm": 0.24506577849388123,
1276
+ "learning_rate": 7.528286866889924e-05,
1277
+ "loss": 0.68,
1278
+ "step": 179
1279
+ },
1280
+ {
1281
+ "epoch": 0.0358619315634806,
1282
+ "grad_norm": 0.2776792049407959,
1283
+ "learning_rate": 7.500000000000001e-05,
1284
+ "loss": 0.7224,
1285
+ "step": 180
1286
+ },
1287
+ {
1288
+ "epoch": 0.036061164516611045,
1289
+ "grad_norm": 0.21556320786476135,
1290
+ "learning_rate": 7.471606041430723e-05,
1291
+ "loss": 0.6742,
1292
+ "step": 181
1293
+ },
1294
+ {
1295
+ "epoch": 0.036260397469741495,
1296
+ "grad_norm": 0.2549598217010498,
1297
+ "learning_rate": 7.443106207484776e-05,
1298
+ "loss": 0.7866,
1299
+ "step": 182
1300
+ },
1301
+ {
1302
+ "epoch": 0.036459630422871946,
1303
+ "grad_norm": 0.2427287995815277,
1304
+ "learning_rate": 7.414501719000187e-05,
1305
+ "loss": 0.7006,
1306
+ "step": 183
1307
+ },
1308
+ {
1309
+ "epoch": 0.03665886337600239,
1310
+ "grad_norm": 0.2485671192407608,
1311
+ "learning_rate": 7.385793801298042e-05,
1312
+ "loss": 0.678,
1313
+ "step": 184
1314
+ },
1315
+ {
1316
+ "epoch": 0.03685809632913284,
1317
+ "grad_norm": 0.23628251254558563,
1318
+ "learning_rate": 7.35698368412999e-05,
1319
+ "loss": 0.7176,
1320
+ "step": 185
1321
+ },
1322
+ {
1323
+ "epoch": 0.03705732928226329,
1324
+ "grad_norm": 0.245437353849411,
1325
+ "learning_rate": 7.328072601625557e-05,
1326
+ "loss": 0.6784,
1327
+ "step": 186
1328
+ },
1329
+ {
1330
+ "epoch": 0.03725656223539373,
1331
+ "grad_norm": 0.26534438133239746,
1332
+ "learning_rate": 7.2990617922393e-05,
1333
+ "loss": 0.7389,
1334
+ "step": 187
1335
+ },
1336
+ {
1337
+ "epoch": 0.03745579518852418,
1338
+ "grad_norm": 0.26061758399009705,
1339
+ "learning_rate": 7.269952498697734e-05,
1340
+ "loss": 0.637,
1341
+ "step": 188
1342
+ },
1343
+ {
1344
+ "epoch": 0.03765502814165463,
1345
+ "grad_norm": 0.247264102101326,
1346
+ "learning_rate": 7.240745967946113e-05,
1347
+ "loss": 0.7007,
1348
+ "step": 189
1349
+ },
1350
+ {
1351
+ "epoch": 0.037854261094785076,
1352
+ "grad_norm": 0.2501027584075928,
1353
+ "learning_rate": 7.211443451095007e-05,
1354
+ "loss": 0.7552,
1355
+ "step": 190
1356
+ },
1357
+ {
1358
+ "epoch": 0.038053494047915526,
1359
+ "grad_norm": 0.24374301731586456,
1360
+ "learning_rate": 7.18204620336671e-05,
1361
+ "loss": 0.7168,
1362
+ "step": 191
1363
+ },
1364
+ {
1365
+ "epoch": 0.038252727001045976,
1366
+ "grad_norm": 0.2584417760372162,
1367
+ "learning_rate": 7.152555484041476e-05,
1368
+ "loss": 0.6993,
1369
+ "step": 192
1370
+ },
1371
+ {
1372
+ "epoch": 0.03845195995417642,
1373
+ "grad_norm": 0.2678215205669403,
1374
+ "learning_rate": 7.122972556403567e-05,
1375
+ "loss": 0.7069,
1376
+ "step": 193
1377
+ },
1378
+ {
1379
+ "epoch": 0.03865119290730687,
1380
+ "grad_norm": 0.27493688464164734,
1381
+ "learning_rate": 7.09329868768714e-05,
1382
+ "loss": 0.7271,
1383
+ "step": 194
1384
+ },
1385
+ {
1386
+ "epoch": 0.03885042586043732,
1387
+ "grad_norm": 0.28842246532440186,
1388
+ "learning_rate": 7.063535149021973e-05,
1389
+ "loss": 0.7254,
1390
+ "step": 195
1391
+ },
1392
+ {
1393
+ "epoch": 0.03904965881356776,
1394
+ "grad_norm": 0.26705753803253174,
1395
+ "learning_rate": 7.033683215379002e-05,
1396
+ "loss": 0.7115,
1397
+ "step": 196
1398
+ },
1399
+ {
1400
+ "epoch": 0.03924889176669821,
1401
+ "grad_norm": 0.25876060128211975,
1402
+ "learning_rate": 7.003744165515705e-05,
1403
+ "loss": 0.7228,
1404
+ "step": 197
1405
+ },
1406
+ {
1407
+ "epoch": 0.03944812471982866,
1408
+ "grad_norm": 0.30024516582489014,
1409
+ "learning_rate": 6.973719281921335e-05,
1410
+ "loss": 0.7615,
1411
+ "step": 198
1412
+ },
1413
+ {
1414
+ "epoch": 0.039647357672959106,
1415
+ "grad_norm": 0.27740225195884705,
1416
+ "learning_rate": 6.943609850761979e-05,
1417
+ "loss": 0.7928,
1418
+ "step": 199
1419
+ },
1420
+ {
1421
+ "epoch": 0.039846590626089556,
1422
+ "grad_norm": 0.3024348020553589,
1423
+ "learning_rate": 6.91341716182545e-05,
1424
+ "loss": 0.7479,
1425
+ "step": 200
1426
+ },
1427
+ {
1428
+ "epoch": 0.039846590626089556,
1429
+ "eval_loss": 0.7008334398269653,
1430
+ "eval_runtime": 265.1826,
1431
+ "eval_samples_per_second": 31.88,
1432
+ "eval_steps_per_second": 7.972,
1433
+ "step": 200
1434
  }
1435
  ],
1436
  "logging_steps": 1,
 
1459
  "attributes": {}
1460
  }
1461
  },
1462
+ "total_flos": 6.651170610846106e+16,
1463
  "train_batch_size": 8,
1464
  "trial_name": null,
1465
  "trial_params": null