quynguyen1704 committed on
Commit 331b73c · verified · 1 Parent(s): f44f203

add best checkpoint

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,50 @@
+ {
+ "_name_or_path": "dangvantuan/vietnamese-embedding-LongContext",
+ "architectures": [
+ "VietnameseForMaskedLM"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "auto_map": {
+ "AutoConfig": "dangvantuan/Vietnamese_impl--configuration.VietnameseConfig",
+ "AutoModel": "dangvantuan/Vietnamese_impl--modeling.VietnameseModel",
+ "AutoModelForMaskedLM": "dangvantuan/Vietnamese_impl--modeling.VietnameseForMaskedLM",
+ "AutoModelForMultipleChoice": "dangvantuan/Vietnamese_impl--modeling.VietnameseForMultipleChoice",
+ "AutoModelForQuestionAnswering": "dangvantuan/Vietnamese_impl--modeling.VietnameseForQuestionAnswering",
+ "AutoModelForSequenceClassification": "dangvantuan/Vietnamese_impl--modeling.VietnameseForSequenceClassification",
+ "AutoModelForTokenClassification": "dangvantuan/Vietnamese_impl--modeling.VietnameseForTokenClassification"
+ },
+ "classifier_dropout": 0.0,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "id2label": {
+ "0": "LABEL_0"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "label2id": {
+ "LABEL_0": 0
+ },
+ "layer_norm_eps": 1e-12,
+ "layer_norm_type": "layer_norm",
+ "logn_attention_clip1": false,
+ "logn_attention_scale": false,
+ "max_position_embeddings": 8192,
+ "model_type": "Vietnamese",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pack_qkv": true,
+ "pad_token_id": 1,
+ "position_embedding_type": "rope",
+ "rope_scaling": {
+ "factor": 8.0,
+ "type": "ntk"
+ },
+ "rope_theta": 20000,
+ "torch_dtype": "float32",
+ "transformers_version": "4.45.1",
+ "type_vocab_size": 1,
+ "unpad_inputs": false,
+ "use_memory_efficient_attention": false,
+ "vocab_size": 250002
+ }
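
Note: config.json routes every Auto* class through auto_map to remote code in dangvantuan/Vietnamese_impl, so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch, assuming the files in this commit sit in a local directory; the path below is illustrative, not part of the commit:

```python
# Minimal loading sketch; "path/to/checkpoint" is a placeholder for wherever
# the files in this commit are downloaded, not a path from the commit itself.
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer

ckpt = "path/to/checkpoint"

# auto_map in config.json points the Auto* classes at code hosted in
# dangvantuan/Vietnamese_impl, so trust_remote_code=True is required.
config = AutoConfig.from_pretrained(ckpt, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForMaskedLM.from_pretrained(ckpt, trust_remote_code=True)

print(config.max_position_embeddings)  # 8192 (RoPE with NTK scaling, per config.json)
```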
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57184ce961ad61ed3525edfa549b08b91858e8b437d72da3cfe3ab98aa6d6e22
+ size 1224717040
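
Note: this file (like the other binaries below) is committed as a Git LFS pointer: a three-line stub recording the spec version, the SHA-256 of the payload, and its size in bytes. A sketch of how one might verify a downloaded model.safetensors against the pointer above; the local filename is an assumption about where the file ends up:

```python
# Sketch: verify a locally downloaded file against the LFS pointer shown above.
import hashlib

expected_oid = "57184ce961ad61ed3525edfa549b08b91858e8b437d72da3cfe3ab98aa6d6e22"
expected_size = 1224717040  # bytes, from the pointer's "size" line

sha = hashlib.sha256()
size = 0
with open("model.safetensors", "rb") as f:            # assumed local path
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
        size += len(chunk)

assert size == expected_size, f"size mismatch: {size} != {expected_size}"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches its LFS pointer")
```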
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe7c0a591d7470cfef6141b8ab50c9d73128f8f1212f832a8da087976bb5e4b8
+ size 2449519034
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:357bb9cb4eafbd51257eb3538d782f0d9715c2fd8692ee322772a1cfd4aba7e9
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9cde91977c4ecd9ac899d17836f5b9b93e91041f60c5049deda9f805d66a03a2
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6f4bfef0fd9457498b433fe0b94a27c210f982888370dc0f4ed95c100267ea5
+ size 17082833
tokenizer_config.json ADDED
@@ -0,0 +1,61 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "250001": {
+ "content": "<mask>",
+ "lstrip": true,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "mask_token": "<mask>",
+ "max_length": 8192,
+ "model_max_length": 32768,
+ "pad_to_multiple_of": null,
+ "pad_token": "<pad>",
+ "pad_token_type_id": 0,
+ "padding_side": "right",
+ "sep_token": "</s>",
+ "stride": 0,
+ "tokenizer_class": "XLMRobertaTokenizer",
+ "truncation_side": "right",
+ "truncation_strategy": "longest_first",
+ "unk_token": "<unk>"
+ }
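
Note: tokenizer_config.json declares an XLM-R style tokenizer (XLMRobertaTokenizer) with the special tokens from special_tokens_map.json, <mask> at id 250001, and model_max_length 32768; the LFS-tracked tokenizer.json above holds the actual vocabulary. A quick sanity-check sketch, again with an illustrative checkpoint path:

```python
# Sketch: load the tokenizer shipped in this commit and inspect the fields
# declared in tokenizer_config.json / special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/checkpoint")  # placeholder path

print(tok.model_max_length)               # 32768, from tokenizer_config.json
print(tok.mask_token, tok.mask_token_id)  # "<mask>", 250001 per added_tokens_decoder
print(tok("Xin chào Việt Nam")["input_ids"])  # wrapped in <s> ... </s> (ids 0 and 2)
```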
trainer_state.json ADDED
@@ -0,0 +1,749 @@
+ {
+ "best_metric": 1.091291069984436,
+ "best_model_checkpoint": "vietnamese-emb-long-mlm/checkpoint-8000",
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 8400,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.05952380952380952,
+ "grad_norm": 7.383795261383057,
+ "learning_rate": 4.940476190476191e-05,
+ "loss": 6.9284,
+ "step": 100
+ },
+ {
+ "epoch": 0.11904761904761904,
+ "grad_norm": 11.806465148925781,
+ "learning_rate": 4.880952380952381e-05,
+ "loss": 5.4103,
+ "step": 200
+ },
+ {
+ "epoch": 0.17857142857142858,
+ "grad_norm": 10.51978588104248,
+ "learning_rate": 4.8214285714285716e-05,
+ "loss": 4.45,
+ "step": 300
+ },
+ {
+ "epoch": 0.23809523809523808,
+ "grad_norm": 10.803540229797363,
+ "learning_rate": 4.761904761904762e-05,
+ "loss": 3.7495,
+ "step": 400
+ },
+ {
+ "epoch": 0.2976190476190476,
+ "grad_norm": 10.87264347076416,
+ "learning_rate": 4.7023809523809525e-05,
+ "loss": 3.3683,
+ "step": 500
+ },
+ {
+ "epoch": 0.2976190476190476,
+ "eval_loss": 2.989553928375244,
+ "eval_runtime": 105.8686,
+ "eval_samples_per_second": 112.838,
+ "eval_steps_per_second": 1.766,
+ "step": 500
+ },
+ {
+ "epoch": 0.35714285714285715,
+ "grad_norm": 9.709155082702637,
+ "learning_rate": 4.642857142857143e-05,
+ "loss": 3.0939,
+ "step": 600
+ },
+ {
+ "epoch": 0.4166666666666667,
+ "grad_norm": 9.323984146118164,
+ "learning_rate": 4.5833333333333334e-05,
+ "loss": 2.8534,
+ "step": 700
+ },
+ {
+ "epoch": 0.47619047619047616,
+ "grad_norm": 10.167367935180664,
+ "learning_rate": 4.523809523809524e-05,
+ "loss": 2.7091,
+ "step": 800
+ },
+ {
+ "epoch": 0.5357142857142857,
+ "grad_norm": 10.095072746276855,
+ "learning_rate": 4.464285714285715e-05,
+ "loss": 2.5711,
+ "step": 900
+ },
+ {
+ "epoch": 0.5952380952380952,
+ "grad_norm": 9.4666109085083,
+ "learning_rate": 4.404761904761905e-05,
+ "loss": 2.4663,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5952380952380952,
+ "eval_loss": 2.2190322875976562,
+ "eval_runtime": 106.1834,
+ "eval_samples_per_second": 112.504,
+ "eval_steps_per_second": 1.761,
+ "step": 1000
+ },
+ {
+ "epoch": 0.6547619047619048,
+ "grad_norm": 9.085387229919434,
+ "learning_rate": 4.345238095238096e-05,
+ "loss": 2.4001,
+ "step": 1100
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 9.016592979431152,
+ "learning_rate": 4.2857142857142856e-05,
+ "loss": 2.3036,
+ "step": 1200
+ },
+ {
+ "epoch": 0.7738095238095238,
+ "grad_norm": 10.062590599060059,
+ "learning_rate": 4.226190476190476e-05,
+ "loss": 2.2139,
+ "step": 1300
+ },
+ {
+ "epoch": 0.8333333333333334,
+ "grad_norm": 9.198400497436523,
+ "learning_rate": 4.166666666666667e-05,
+ "loss": 2.1477,
+ "step": 1400
+ },
+ {
+ "epoch": 0.8928571428571429,
+ "grad_norm": 9.555102348327637,
+ "learning_rate": 4.107142857142857e-05,
+ "loss": 2.078,
+ "step": 1500
+ },
+ {
+ "epoch": 0.8928571428571429,
+ "eval_loss": 1.8398191928863525,
+ "eval_runtime": 106.2078,
+ "eval_samples_per_second": 112.478,
+ "eval_steps_per_second": 1.761,
+ "step": 1500
+ },
+ {
+ "epoch": 0.9523809523809523,
+ "grad_norm": 9.729957580566406,
+ "learning_rate": 4.047619047619048e-05,
+ "loss": 2.0146,
+ "step": 1600
+ },
+ {
+ "epoch": 1.0119047619047619,
+ "grad_norm": 9.68885326385498,
+ "learning_rate": 3.9880952380952386e-05,
+ "loss": 1.9381,
+ "step": 1700
+ },
+ {
+ "epoch": 1.0714285714285714,
+ "grad_norm": 8.330371856689453,
+ "learning_rate": 3.928571428571429e-05,
+ "loss": 1.8773,
+ "step": 1800
+ },
+ {
+ "epoch": 1.130952380952381,
+ "grad_norm": 9.066886901855469,
+ "learning_rate": 3.8690476190476195e-05,
+ "loss": 1.8749,
+ "step": 1900
+ },
+ {
+ "epoch": 1.1904761904761905,
+ "grad_norm": 10.275750160217285,
+ "learning_rate": 3.809523809523809e-05,
+ "loss": 1.8204,
+ "step": 2000
+ },
+ {
+ "epoch": 1.1904761904761905,
+ "eval_loss": 1.667816400527954,
+ "eval_runtime": 106.1061,
+ "eval_samples_per_second": 112.585,
+ "eval_steps_per_second": 1.762,
+ "step": 2000
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 9.68263053894043,
+ "learning_rate": 3.7500000000000003e-05,
+ "loss": 1.8137,
+ "step": 2100
+ },
+ {
+ "epoch": 1.3095238095238095,
+ "grad_norm": 10.254537582397461,
+ "learning_rate": 3.690476190476191e-05,
+ "loss": 1.8096,
+ "step": 2200
+ },
+ {
+ "epoch": 1.369047619047619,
+ "grad_norm": 8.361231803894043,
+ "learning_rate": 3.630952380952381e-05,
+ "loss": 1.7509,
+ "step": 2300
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 9.450984954833984,
+ "learning_rate": 3.571428571428572e-05,
+ "loss": 1.7277,
+ "step": 2400
+ },
+ {
+ "epoch": 1.4880952380952381,
+ "grad_norm": 8.3367280960083,
+ "learning_rate": 3.511904761904762e-05,
+ "loss": 1.7063,
+ "step": 2500
+ },
+ {
+ "epoch": 1.4880952380952381,
+ "eval_loss": 1.55353844165802,
+ "eval_runtime": 106.2169,
+ "eval_samples_per_second": 112.468,
+ "eval_steps_per_second": 1.761,
+ "step": 2500
+ },
+ {
+ "epoch": 1.5476190476190477,
+ "grad_norm": 8.55213451385498,
+ "learning_rate": 3.4523809523809526e-05,
+ "loss": 1.6952,
+ "step": 2600
+ },
+ {
+ "epoch": 1.6071428571428572,
+ "grad_norm": 8.386811256408691,
+ "learning_rate": 3.392857142857143e-05,
+ "loss": 1.6654,
+ "step": 2700
+ },
+ {
+ "epoch": 1.6666666666666665,
+ "grad_norm": 9.841432571411133,
+ "learning_rate": 3.3333333333333335e-05,
+ "loss": 1.6662,
+ "step": 2800
+ },
+ {
+ "epoch": 1.7261904761904763,
+ "grad_norm": 8.69044303894043,
+ "learning_rate": 3.273809523809524e-05,
+ "loss": 1.6291,
+ "step": 2900
+ },
+ {
+ "epoch": 1.7857142857142856,
+ "grad_norm": 9.23985481262207,
+ "learning_rate": 3.2142857142857144e-05,
+ "loss": 1.5943,
+ "step": 3000
+ },
+ {
+ "epoch": 1.7857142857142856,
+ "eval_loss": 1.464246392250061,
+ "eval_runtime": 106.2444,
+ "eval_samples_per_second": 112.439,
+ "eval_steps_per_second": 1.76,
+ "step": 3000
+ },
+ {
+ "epoch": 1.8452380952380953,
+ "grad_norm": 8.489648818969727,
+ "learning_rate": 3.154761904761905e-05,
+ "loss": 1.6024,
+ "step": 3100
+ },
+ {
+ "epoch": 1.9047619047619047,
+ "grad_norm": 8.592900276184082,
+ "learning_rate": 3.095238095238095e-05,
+ "loss": 1.5711,
+ "step": 3200
+ },
+ {
+ "epoch": 1.9642857142857144,
+ "grad_norm": 8.73353099822998,
+ "learning_rate": 3.0357142857142857e-05,
+ "loss": 1.561,
+ "step": 3300
+ },
+ {
+ "epoch": 2.0238095238095237,
+ "grad_norm": 7.832152843475342,
+ "learning_rate": 2.9761904761904762e-05,
+ "loss": 1.4996,
+ "step": 3400
+ },
+ {
+ "epoch": 2.0833333333333335,
+ "grad_norm": 8.37968921661377,
+ "learning_rate": 2.916666666666667e-05,
+ "loss": 1.5235,
+ "step": 3500
+ },
+ {
+ "epoch": 2.0833333333333335,
+ "eval_loss": 1.3883955478668213,
+ "eval_runtime": 106.3163,
+ "eval_samples_per_second": 112.363,
+ "eval_steps_per_second": 1.759,
+ "step": 3500
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 9.074413299560547,
+ "learning_rate": 2.857142857142857e-05,
+ "loss": 1.4867,
+ "step": 3600
+ },
+ {
+ "epoch": 2.2023809523809526,
+ "grad_norm": 8.715752601623535,
+ "learning_rate": 2.797619047619048e-05,
+ "loss": 1.481,
+ "step": 3700
+ },
+ {
+ "epoch": 2.261904761904762,
+ "grad_norm": 8.675792694091797,
+ "learning_rate": 2.7380952380952383e-05,
+ "loss": 1.4462,
+ "step": 3800
+ },
+ {
+ "epoch": 2.3214285714285716,
+ "grad_norm": 8.826884269714355,
+ "learning_rate": 2.6785714285714288e-05,
+ "loss": 1.449,
+ "step": 3900
+ },
+ {
+ "epoch": 2.380952380952381,
+ "grad_norm": 8.927966117858887,
+ "learning_rate": 2.6190476190476192e-05,
+ "loss": 1.4307,
+ "step": 4000
+ },
+ {
+ "epoch": 2.380952380952381,
+ "eval_loss": 1.3017064332962036,
+ "eval_runtime": 106.1765,
+ "eval_samples_per_second": 112.511,
+ "eval_steps_per_second": 1.761,
+ "step": 4000
+ },
+ {
+ "epoch": 2.4404761904761907,
+ "grad_norm": 7.1491804122924805,
+ "learning_rate": 2.5595238095238093e-05,
+ "loss": 1.4789,
+ "step": 4100
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 7.50277853012085,
+ "learning_rate": 2.5e-05,
+ "loss": 1.4296,
+ "step": 4200
+ },
+ {
+ "epoch": 2.5595238095238093,
+ "grad_norm": 8.132680892944336,
+ "learning_rate": 2.4404761904761906e-05,
+ "loss": 1.4177,
+ "step": 4300
+ },
+ {
+ "epoch": 2.619047619047619,
+ "grad_norm": 8.233142852783203,
+ "learning_rate": 2.380952380952381e-05,
+ "loss": 1.3953,
+ "step": 4400
+ },
+ {
+ "epoch": 2.678571428571429,
+ "grad_norm": 7.6400628089904785,
+ "learning_rate": 2.3214285714285715e-05,
+ "loss": 1.4063,
+ "step": 4500
+ },
+ {
+ "epoch": 2.678571428571429,
+ "eval_loss": 1.2739932537078857,
+ "eval_runtime": 106.1766,
+ "eval_samples_per_second": 112.511,
+ "eval_steps_per_second": 1.761,
+ "step": 4500
+ },
+ {
+ "epoch": 2.738095238095238,
+ "grad_norm": 7.416648864746094,
+ "learning_rate": 2.261904761904762e-05,
+ "loss": 1.4008,
+ "step": 4600
+ },
+ {
+ "epoch": 2.7976190476190474,
+ "grad_norm": 7.283276081085205,
+ "learning_rate": 2.2023809523809524e-05,
+ "loss": 1.3818,
+ "step": 4700
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 8.360782623291016,
+ "learning_rate": 2.1428571428571428e-05,
+ "loss": 1.379,
+ "step": 4800
+ },
+ {
+ "epoch": 2.9166666666666665,
+ "grad_norm": 9.960053443908691,
+ "learning_rate": 2.0833333333333336e-05,
+ "loss": 1.3479,
+ "step": 4900
+ },
+ {
+ "epoch": 2.9761904761904763,
+ "grad_norm": 8.903584480285645,
+ "learning_rate": 2.023809523809524e-05,
+ "loss": 1.3389,
+ "step": 5000
+ },
+ {
+ "epoch": 2.9761904761904763,
+ "eval_loss": 1.2187227010726929,
+ "eval_runtime": 106.0745,
+ "eval_samples_per_second": 112.619,
+ "eval_steps_per_second": 1.763,
+ "step": 5000
+ },
+ {
+ "epoch": 3.0357142857142856,
+ "grad_norm": 8.299568176269531,
+ "learning_rate": 1.9642857142857145e-05,
+ "loss": 1.3336,
+ "step": 5100
+ },
+ {
+ "epoch": 3.0952380952380953,
+ "grad_norm": 8.194002151489258,
+ "learning_rate": 1.9047619047619046e-05,
+ "loss": 1.3102,
+ "step": 5200
+ },
+ {
+ "epoch": 3.1547619047619047,
+ "grad_norm": 7.804888725280762,
+ "learning_rate": 1.8452380952380954e-05,
+ "loss": 1.3354,
+ "step": 5300
+ },
+ {
+ "epoch": 3.2142857142857144,
+ "grad_norm": 7.510008811950684,
+ "learning_rate": 1.785714285714286e-05,
+ "loss": 1.3382,
+ "step": 5400
+ },
+ {
+ "epoch": 3.2738095238095237,
+ "grad_norm": 7.9406023025512695,
+ "learning_rate": 1.7261904761904763e-05,
+ "loss": 1.2996,
+ "step": 5500
+ },
+ {
+ "epoch": 3.2738095238095237,
+ "eval_loss": 1.2163845300674438,
+ "eval_runtime": 106.2374,
+ "eval_samples_per_second": 112.446,
+ "eval_steps_per_second": 1.76,
+ "step": 5500
+ },
+ {
+ "epoch": 3.3333333333333335,
+ "grad_norm": 7.935986042022705,
+ "learning_rate": 1.6666666666666667e-05,
+ "loss": 1.2889,
+ "step": 5600
+ },
+ {
+ "epoch": 3.392857142857143,
+ "grad_norm": 7.514946937561035,
+ "learning_rate": 1.6071428571428572e-05,
+ "loss": 1.3353,
+ "step": 5700
+ },
+ {
+ "epoch": 3.4523809523809526,
+ "grad_norm": 8.081690788269043,
+ "learning_rate": 1.5476190476190476e-05,
+ "loss": 1.2959,
+ "step": 5800
+ },
+ {
+ "epoch": 3.511904761904762,
+ "grad_norm": 8.02477741241455,
+ "learning_rate": 1.4880952380952381e-05,
+ "loss": 1.2467,
+ "step": 5900
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 8.462018013000488,
+ "learning_rate": 1.4285714285714285e-05,
+ "loss": 1.2899,
+ "step": 6000
+ },
+ {
+ "epoch": 3.571428571428571,
+ "eval_loss": 1.1731750965118408,
+ "eval_runtime": 106.1002,
+ "eval_samples_per_second": 112.592,
+ "eval_steps_per_second": 1.762,
+ "step": 6000
+ },
+ {
+ "epoch": 3.630952380952381,
+ "grad_norm": 7.650486469268799,
+ "learning_rate": 1.3690476190476192e-05,
+ "loss": 1.2878,
+ "step": 6100
+ },
+ {
+ "epoch": 3.6904761904761907,
+ "grad_norm": 8.889339447021484,
+ "learning_rate": 1.3095238095238096e-05,
+ "loss": 1.2704,
+ "step": 6200
+ },
+ {
+ "epoch": 3.75,
+ "grad_norm": 9.502975463867188,
+ "learning_rate": 1.25e-05,
+ "loss": 1.2406,
+ "step": 6300
+ },
+ {
+ "epoch": 3.8095238095238093,
+ "grad_norm": 9.480072975158691,
+ "learning_rate": 1.1904761904761905e-05,
+ "loss": 1.2707,
+ "step": 6400
+ },
+ {
+ "epoch": 3.869047619047619,
+ "grad_norm": 7.378728866577148,
+ "learning_rate": 1.130952380952381e-05,
+ "loss": 1.2343,
+ "step": 6500
+ },
+ {
+ "epoch": 3.869047619047619,
+ "eval_loss": 1.1631048917770386,
+ "eval_runtime": 106.023,
+ "eval_samples_per_second": 112.674,
+ "eval_steps_per_second": 1.764,
+ "step": 6500
+ },
+ {
+ "epoch": 3.928571428571429,
+ "grad_norm": 8.64068603515625,
+ "learning_rate": 1.0714285714285714e-05,
+ "loss": 1.2478,
+ "step": 6600
+ },
+ {
+ "epoch": 3.988095238095238,
+ "grad_norm": 8.00733470916748,
+ "learning_rate": 1.011904761904762e-05,
+ "loss": 1.2414,
+ "step": 6700
+ },
+ {
+ "epoch": 4.0476190476190474,
+ "grad_norm": 7.568075180053711,
+ "learning_rate": 9.523809523809523e-06,
+ "loss": 1.2533,
+ "step": 6800
+ },
+ {
+ "epoch": 4.107142857142857,
+ "grad_norm": 7.190471649169922,
+ "learning_rate": 8.92857142857143e-06,
+ "loss": 1.2495,
+ "step": 6900
+ },
+ {
+ "epoch": 4.166666666666667,
+ "grad_norm": 8.588766098022461,
+ "learning_rate": 8.333333333333334e-06,
+ "loss": 1.2095,
+ "step": 7000
+ },
+ {
+ "epoch": 4.166666666666667,
+ "eval_loss": 1.0942662954330444,
+ "eval_runtime": 106.0638,
+ "eval_samples_per_second": 112.63,
+ "eval_steps_per_second": 1.763,
+ "step": 7000
+ },
+ {
+ "epoch": 4.226190476190476,
+ "grad_norm": 8.47437858581543,
+ "learning_rate": 7.738095238095238e-06,
+ "loss": 1.2134,
+ "step": 7100
+ },
+ {
+ "epoch": 4.285714285714286,
+ "grad_norm": 7.481777191162109,
+ "learning_rate": 7.142857142857143e-06,
+ "loss": 1.1873,
+ "step": 7200
+ },
+ {
+ "epoch": 4.345238095238095,
+ "grad_norm": 7.533321380615234,
+ "learning_rate": 6.547619047619048e-06,
+ "loss": 1.2022,
+ "step": 7300
+ },
+ {
+ "epoch": 4.404761904761905,
+ "grad_norm": 8.106829643249512,
+ "learning_rate": 5.9523809523809525e-06,
+ "loss": 1.2162,
+ "step": 7400
+ },
+ {
+ "epoch": 4.464285714285714,
+ "grad_norm": 7.893404483795166,
+ "learning_rate": 5.357142857142857e-06,
+ "loss": 1.2078,
+ "step": 7500
+ },
+ {
+ "epoch": 4.464285714285714,
+ "eval_loss": 1.1191558837890625,
+ "eval_runtime": 106.3099,
+ "eval_samples_per_second": 112.37,
+ "eval_steps_per_second": 1.759,
+ "step": 7500
+ },
+ {
+ "epoch": 4.523809523809524,
+ "grad_norm": 8.90477466583252,
+ "learning_rate": 4.7619047619047615e-06,
+ "loss": 1.219,
+ "step": 7600
+ },
+ {
+ "epoch": 4.583333333333333,
+ "grad_norm": 7.603702545166016,
+ "learning_rate": 4.166666666666667e-06,
+ "loss": 1.2,
+ "step": 7700
+ },
+ {
+ "epoch": 4.642857142857143,
+ "grad_norm": 8.275321006774902,
+ "learning_rate": 3.5714285714285714e-06,
+ "loss": 1.1985,
+ "step": 7800
+ },
+ {
+ "epoch": 4.7023809523809526,
+ "grad_norm": 8.652661323547363,
+ "learning_rate": 2.9761904761904763e-06,
+ "loss": 1.2421,
+ "step": 7900
+ },
+ {
+ "epoch": 4.761904761904762,
+ "grad_norm": 8.569392204284668,
+ "learning_rate": 2.3809523809523808e-06,
+ "loss": 1.208,
+ "step": 8000
+ },
+ {
+ "epoch": 4.761904761904762,
+ "eval_loss": 1.091291069984436,
+ "eval_runtime": 106.5156,
+ "eval_samples_per_second": 112.153,
+ "eval_steps_per_second": 1.756,
+ "step": 8000
+ },
+ {
+ "epoch": 4.821428571428571,
+ "grad_norm": 8.579488754272461,
+ "learning_rate": 1.7857142857142857e-06,
+ "loss": 1.2274,
+ "step": 8100
+ },
+ {
+ "epoch": 4.880952380952381,
+ "grad_norm": 9.347880363464355,
+ "learning_rate": 1.1904761904761904e-06,
+ "loss": 1.1833,
+ "step": 8200
+ },
+ {
+ "epoch": 4.940476190476191,
+ "grad_norm": 7.7748894691467285,
+ "learning_rate": 5.952380952380952e-07,
+ "loss": 1.1919,
+ "step": 8300
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 8.465712547302246,
+ "learning_rate": 0.0,
+ "loss": 1.2069,
+ "step": 8400
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 8400,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 2000,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.5633072459126792e+16,
+ "train_batch_size": 64,
+ "trial_name": null,
+ "trial_params": null
+ }
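
Note: trainer_state.json records the full MLM training run behind this "best checkpoint": 8,400 steps over 5 epochs with evaluation every 500 steps, and the best eval_loss of 1.0913 reached at step 8000 (best_model_checkpoint = vietnamese-emb-long-mlm/checkpoint-8000). A small sketch for pulling the eval-loss curve back out of log_history:

```python
# Sketch: extract the eval-loss curve and the best step from trainer_state.json.
import json

with open("trainer_state.json") as f:  # assumed local copy of the file above
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for step, loss in evals:
    print(f"step {step:5d}  eval_loss {loss:.4f}")

best_step, best_loss = min(evals, key=lambda x: x[1])
print(f"best: step {best_step} (eval_loss {best_loss:.4f})")
# Should match best_metric / best_model_checkpoint above (checkpoint-8000, 1.0913).
```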
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f8c45d08af0ee44560afc58d52049dd77ef5fe6673cec5f873bc5fe795169a2
+ size 5176