Sanjib Narzary committed on
Commit
a393828
·
1 Parent(s): ee21de7

Pretrained Bodo LM from scratch, line by line, using the RoBERTa base configuration

Browse files
README.md CHANGED
@@ -1,23 +1,22 @@
1
  ---
2
- license: mit
3
  tags:
4
  - generated_from_trainer
5
  metrics:
6
  - accuracy
7
  model-index:
8
- - name: brx-roberta-base-mlm
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # brx-roberta-base-mlm
16
 
17
- This model is a fine-tuned version of [roberta-base](https://huggingface.co/roberta-base) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.3316
20
- - Accuracy: 0.9092
21
 
22
  ## Model description
23
 
 
1
  ---
 
2
  tags:
3
  - generated_from_trainer
4
  metrics:
5
  - accuracy
6
  model-index:
7
+ - name: brx-roberta-base-mlm-scratch-v1
8
  results: []
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
  should probably proofread and complete it, then remove this comment. -->
13
 
14
+ # brx-roberta-base-mlm-scratch-v1
15
 
16
+ This model was trained from scratch on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: nan
19
+ - Accuracy: 0.1685
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.9092124024644979,
4
- "eval_loss": 0.33163896203041077,
5
- "eval_runtime": 92.5715,
6
  "eval_samples": 8796,
7
- "eval_samples_per_second": 95.018,
8
- "eval_steps_per_second": 47.509,
9
- "perplexity": 1.3932497416397873,
10
- "train_loss": 0.4256299230643896,
11
- "train_runtime": 18063.8373,
12
  "train_samples": 112168,
13
- "train_samples_per_second": 18.629,
14
- "train_steps_per_second": 9.314
15
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.16853486319505737,
4
+ "eval_loss": NaN,
5
+ "eval_runtime": 84.0751,
6
  "eval_samples": 8796,
7
+ "eval_samples_per_second": 104.621,
8
+ "eval_steps_per_second": 52.31,
9
+ "perplexity": NaN,
10
+ "train_loss": 4.168116396368988,
11
+ "train_runtime": 19036.7524,
12
  "train_samples": 112168,
13
+ "train_samples_per_second": 17.677,
14
+ "train_steps_per_second": 8.838
15
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "roberta-base",
3
  "architectures": [
4
  "RobertaForMaskedLM"
5
  ],
@@ -23,5 +23,5 @@
23
  "transformers_version": "4.26.0.dev0",
24
  "type_vocab_size": 1,
25
  "use_cache": true,
26
- "vocab_size": 50265
27
  }
 
1
  {
2
+ "_name_or_path": "/home/sn/pretrained_models/huggingface_pretrained/brx-roberta-base-mlm-scratch",
3
  "architectures": [
4
  "RobertaForMaskedLM"
5
  ],
 
23
  "transformers_version": "4.26.0.dev0",
24
  "type_vocab_size": 1,
25
  "use_cache": true,
26
+ "vocab_size": 20000
27
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.9092124024644979,
4
- "eval_loss": 0.33163896203041077,
5
- "eval_runtime": 92.5715,
6
  "eval_samples": 8796,
7
- "eval_samples_per_second": 95.018,
8
- "eval_steps_per_second": 47.509,
9
- "perplexity": 1.3932497416397873
10
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.16853486319505737,
4
+ "eval_loss": NaN,
5
+ "eval_runtime": 84.0751,
6
  "eval_samples": 8796,
7
+ "eval_samples_per_second": 104.621,
8
+ "eval_steps_per_second": 52.31,
9
+ "perplexity": NaN
10
  }
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac728efa6be2ccb9e62828bd8c1d7bf35a0cd2162dd68a701f772311ba981d5c
3
- size 498861675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a797a26b0c9c83ac2216c436d3f706c401816906de2c6ccff43303317fc1f595
3
+ size 405766571
special_tokens_map.json CHANGED
@@ -1,15 +1,51 @@
1
  {
2
- "bos_token": "<s>",
3
- "cls_token": "<s>",
4
- "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "mask_token": {
6
  "content": "<mask>",
7
  "lstrip": true,
8
- "normalized": false,
9
  "rstrip": false,
10
  "single_word": false
11
  },
12
- "pad_token": "<pad>",
13
- "sep_token": "</s>",
14
- "unk_token": "<unk>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
  "mask_token": {
24
  "content": "<mask>",
25
  "lstrip": true,
26
+ "normalized": true,
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,16 +1,65 @@
1
  {
2
  "add_prefix_space": false,
3
- "bos_token": "<s>",
4
- "cls_token": "<s>",
5
- "eos_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "errors": "replace",
7
- "mask_token": "<mask>",
8
- "model_max_length": 512,
9
- "name_or_path": "roberta-base",
10
- "pad_token": "<pad>",
11
- "sep_token": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "special_tokens_map_file": null,
13
  "tokenizer_class": "RobertaTokenizer",
14
  "trim_offsets": true,
15
- "unk_token": "<unk>"
 
 
 
 
 
 
 
16
  }
 
1
  {
2
  "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "cls_token": {
12
+ "__type": "AddedToken",
13
+ "content": "<s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eos_token": {
20
+ "__type": "AddedToken",
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
  "errors": "replace",
28
+ "mask_token": {
29
+ "__type": "AddedToken",
30
+ "content": "<mask>",
31
+ "lstrip": true,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ },
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "name_or_path": "/home/sn/pretrained_models/huggingface_pretrained/brx-roberta-base-mlm-scratch",
38
+ "pad_token": {
39
+ "__type": "AddedToken",
40
+ "content": "<pad>",
41
+ "lstrip": false,
42
+ "normalized": true,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "sep_token": {
47
+ "__type": "AddedToken",
48
+ "content": "</s>",
49
+ "lstrip": false,
50
+ "normalized": true,
51
+ "rstrip": false,
52
+ "single_word": false
53
+ },
54
  "special_tokens_map_file": null,
55
  "tokenizer_class": "RobertaTokenizer",
56
  "trim_offsets": true,
57
+ "unk_token": {
58
+ "__type": "AddedToken",
59
+ "content": "<unk>",
60
+ "lstrip": false,
61
+ "normalized": true,
62
+ "rstrip": false,
63
+ "single_word": false
64
+ }
65
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
- "train_loss": 0.4256299230643896,
4
- "train_runtime": 18063.8373,
5
  "train_samples": 112168,
6
- "train_samples_per_second": 18.629,
7
- "train_steps_per_second": 9.314
8
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "train_loss": 4.168116396368988,
4
+ "train_runtime": 19036.7524,
5
  "train_samples": 112168,
6
+ "train_samples_per_second": 17.677,
7
+ "train_steps_per_second": 8.838
8
  }
trainer_state.json CHANGED
@@ -10,2032 +10,2032 @@
10
  {
11
  "epoch": 0.01,
12
  "learning_rate": 4.9851413356156244e-05,
13
- "loss": 1.1312,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.02,
18
  "learning_rate": 4.9702826712312485e-05,
19
- "loss": 0.9916,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.03,
24
  "learning_rate": 4.9554240068468726e-05,
25
- "loss": 0.8946,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.04,
30
  "learning_rate": 4.940565342462497e-05,
31
- "loss": 0.8687,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.04,
36
  "learning_rate": 4.9257066780781215e-05,
37
- "loss": 0.8543,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.05,
42
  "learning_rate": 4.910848013693745e-05,
43
- "loss": 0.85,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.06,
48
  "learning_rate": 4.89598934930937e-05,
49
- "loss": 0.8236,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.07,
54
  "learning_rate": 4.881130684924994e-05,
55
- "loss": 0.7902,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.08,
60
  "learning_rate": 4.866272020540618e-05,
61
- "loss": 0.7779,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.09,
66
  "learning_rate": 4.851413356156242e-05,
67
- "loss": 0.7755,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.1,
72
  "learning_rate": 4.836554691771866e-05,
73
- "loss": 0.7447,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.11,
78
  "learning_rate": 4.82169602738749e-05,
79
- "loss": 0.7624,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.12,
84
  "learning_rate": 4.8068373630031144e-05,
85
- "loss": 0.728,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.12,
90
  "learning_rate": 4.7919786986187386e-05,
91
- "loss": 0.7337,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.13,
96
  "learning_rate": 4.7771200342343634e-05,
97
- "loss": 0.7218,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.14,
102
  "learning_rate": 4.7622613698499875e-05,
103
- "loss": 0.7207,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.15,
108
  "learning_rate": 4.747402705465611e-05,
109
- "loss": 0.7005,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.16,
114
  "learning_rate": 4.732544041081236e-05,
115
- "loss": 0.69,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.17,
120
  "learning_rate": 4.71768537669686e-05,
121
- "loss": 0.703,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.18,
126
  "learning_rate": 4.702826712312484e-05,
127
- "loss": 0.6943,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.19,
132
  "learning_rate": 4.687968047928108e-05,
133
- "loss": 0.6649,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.2,
138
  "learning_rate": 4.673109383543732e-05,
139
- "loss": 0.655,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.21,
144
  "learning_rate": 4.658250719159356e-05,
145
- "loss": 0.6731,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.21,
150
  "learning_rate": 4.6433920547749804e-05,
151
- "loss": 0.6403,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.22,
156
  "learning_rate": 4.6285333903906045e-05,
157
- "loss": 0.6311,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.23,
162
  "learning_rate": 4.613674726006229e-05,
163
- "loss": 0.6491,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.24,
168
  "learning_rate": 4.5988160616218534e-05,
169
- "loss": 0.63,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.25,
174
  "learning_rate": 4.583957397237477e-05,
175
- "loss": 0.6267,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.26,
180
  "learning_rate": 4.569098732853102e-05,
181
- "loss": 0.6445,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.27,
186
  "learning_rate": 4.554240068468726e-05,
187
- "loss": 0.6162,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.28,
192
  "learning_rate": 4.53938140408435e-05,
193
- "loss": 0.6152,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.29,
198
  "learning_rate": 4.524522739699974e-05,
199
- "loss": 0.6322,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.29,
204
  "learning_rate": 4.509664075315598e-05,
205
- "loss": 0.5996,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.3,
210
  "learning_rate": 4.494805410931223e-05,
211
- "loss": 0.6001,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.31,
216
  "learning_rate": 4.4799467465468464e-05,
217
- "loss": 0.591,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.32,
222
  "learning_rate": 4.4650880821624705e-05,
223
- "loss": 0.5985,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.33,
228
  "learning_rate": 4.450229417778095e-05,
229
- "loss": 0.6045,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.34,
234
  "learning_rate": 4.435370753393719e-05,
235
- "loss": 0.5937,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.35,
240
  "learning_rate": 4.4205120890093435e-05,
241
- "loss": 0.5852,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.36,
246
  "learning_rate": 4.4056534246249676e-05,
247
- "loss": 0.5801,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.37,
252
  "learning_rate": 4.390794760240592e-05,
253
- "loss": 0.5777,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.37,
258
  "learning_rate": 4.375936095856216e-05,
259
- "loss": 0.5564,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.38,
264
  "learning_rate": 4.36107743147184e-05,
265
- "loss": 0.573,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.39,
270
  "learning_rate": 4.346218767087464e-05,
271
- "loss": 0.5653,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.4,
276
  "learning_rate": 4.331360102703089e-05,
277
- "loss": 0.5992,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.41,
282
  "learning_rate": 4.3165014383187123e-05,
283
- "loss": 0.5632,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.42,
288
  "learning_rate": 4.3016427739343365e-05,
289
- "loss": 0.5626,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.43,
294
  "learning_rate": 4.286784109549961e-05,
295
- "loss": 0.5807,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.44,
300
  "learning_rate": 4.271925445165585e-05,
301
- "loss": 0.5643,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.45,
306
  "learning_rate": 4.2570667807812095e-05,
307
- "loss": 0.5445,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.45,
312
  "learning_rate": 4.2422081163968336e-05,
313
- "loss": 0.5458,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.46,
318
  "learning_rate": 4.227349452012458e-05,
319
- "loss": 0.5669,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.47,
324
  "learning_rate": 4.212490787628082e-05,
325
- "loss": 0.5459,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.48,
330
  "learning_rate": 4.197632123243706e-05,
331
- "loss": 0.5452,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.49,
336
  "learning_rate": 4.18277345885933e-05,
337
- "loss": 0.5641,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.5,
342
  "learning_rate": 4.167914794474955e-05,
343
- "loss": 0.544,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.51,
348
  "learning_rate": 4.153056130090578e-05,
349
- "loss": 0.5286,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.52,
354
  "learning_rate": 4.138197465706203e-05,
355
- "loss": 0.5478,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 0.53,
360
  "learning_rate": 4.123338801321827e-05,
361
- "loss": 0.5193,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 0.53,
366
  "learning_rate": 4.1084801369374507e-05,
367
- "loss": 0.5335,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 0.54,
372
  "learning_rate": 4.0936214725530755e-05,
373
- "loss": 0.5261,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 0.55,
378
  "learning_rate": 4.0787628081686996e-05,
379
- "loss": 0.5348,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 0.56,
384
  "learning_rate": 4.063904143784324e-05,
385
- "loss": 0.5248,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 0.57,
390
  "learning_rate": 4.049045479399948e-05,
391
- "loss": 0.5123,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 0.58,
396
  "learning_rate": 4.034186815015572e-05,
397
- "loss": 0.4927,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 0.59,
402
  "learning_rate": 4.019328150631197e-05,
403
- "loss": 0.514,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 0.6,
408
  "learning_rate": 4.00446948624682e-05,
409
- "loss": 0.4989,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 0.61,
414
  "learning_rate": 3.989610821862444e-05,
415
- "loss": 0.5055,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 0.62,
420
  "learning_rate": 3.974752157478069e-05,
421
- "loss": 0.4768,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 0.62,
426
  "learning_rate": 3.959893493093693e-05,
427
- "loss": 0.5114,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 0.63,
432
  "learning_rate": 3.9450348287093166e-05,
433
- "loss": 0.5114,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 0.64,
438
  "learning_rate": 3.9301761643249414e-05,
439
- "loss": 0.497,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 0.65,
444
  "learning_rate": 3.9153174999405655e-05,
445
- "loss": 0.4981,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 0.66,
450
  "learning_rate": 3.9004588355561897e-05,
451
- "loss": 0.4943,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 0.67,
456
  "learning_rate": 3.885600171171814e-05,
457
- "loss": 0.5102,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 0.68,
462
  "learning_rate": 3.870741506787438e-05,
463
- "loss": 0.4934,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 0.69,
468
  "learning_rate": 3.855882842403063e-05,
469
- "loss": 0.4804,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 0.7,
474
  "learning_rate": 3.841024178018686e-05,
475
- "loss": 0.4888,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 0.7,
480
  "learning_rate": 3.82616551363431e-05,
481
- "loss": 0.4924,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 0.71,
486
  "learning_rate": 3.811306849249935e-05,
487
- "loss": 0.4985,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 0.72,
492
  "learning_rate": 3.796448184865559e-05,
493
- "loss": 0.4668,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 0.73,
498
  "learning_rate": 3.781589520481183e-05,
499
- "loss": 0.4803,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 0.74,
504
  "learning_rate": 3.7667308560968074e-05,
505
- "loss": 0.4691,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 0.75,
510
  "learning_rate": 3.7518721917124315e-05,
511
- "loss": 0.4748,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 0.76,
516
  "learning_rate": 3.737013527328056e-05,
517
- "loss": 0.4879,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 0.77,
522
  "learning_rate": 3.72215486294368e-05,
523
- "loss": 0.4844,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 0.78,
528
  "learning_rate": 3.707296198559304e-05,
529
- "loss": 0.4765,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 0.78,
534
  "learning_rate": 3.6924375341749286e-05,
535
- "loss": 0.4825,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 0.79,
540
  "learning_rate": 3.677578869790552e-05,
541
- "loss": 0.4642,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 0.8,
546
  "learning_rate": 3.662720205406177e-05,
547
- "loss": 0.4819,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 0.81,
552
  "learning_rate": 3.647861541021801e-05,
553
- "loss": 0.4657,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 0.82,
558
  "learning_rate": 3.633002876637425e-05,
559
- "loss": 0.4545,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 0.83,
564
  "learning_rate": 3.618144212253049e-05,
565
- "loss": 0.4756,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 0.84,
570
  "learning_rate": 3.6032855478686734e-05,
571
- "loss": 0.4641,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 0.85,
576
  "learning_rate": 3.5884268834842975e-05,
577
- "loss": 0.4712,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 0.86,
582
  "learning_rate": 3.5735682190999216e-05,
583
- "loss": 0.4842,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 0.86,
588
  "learning_rate": 3.558709554715546e-05,
589
- "loss": 0.4727,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 0.87,
594
  "learning_rate": 3.54385089033117e-05,
595
- "loss": 0.4474,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 0.88,
600
  "learning_rate": 3.5289922259467946e-05,
601
- "loss": 0.4594,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 0.89,
606
  "learning_rate": 3.514133561562418e-05,
607
- "loss": 0.4525,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 0.9,
612
  "learning_rate": 3.499274897178043e-05,
613
- "loss": 0.4578,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 0.91,
618
  "learning_rate": 3.484416232793667e-05,
619
- "loss": 0.4602,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 0.92,
624
  "learning_rate": 3.469557568409291e-05,
625
- "loss": 0.4584,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 0.93,
630
  "learning_rate": 3.454698904024915e-05,
631
- "loss": 0.4486,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 0.94,
636
  "learning_rate": 3.439840239640539e-05,
637
- "loss": 0.4684,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 0.95,
642
  "learning_rate": 3.4249815752561634e-05,
643
- "loss": 0.4508,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 0.95,
648
  "learning_rate": 3.4101229108717876e-05,
649
- "loss": 0.454,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 0.96,
654
  "learning_rate": 3.395264246487412e-05,
655
- "loss": 0.4447,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 0.97,
660
  "learning_rate": 3.3804055821030365e-05,
661
- "loss": 0.4489,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 0.98,
666
  "learning_rate": 3.3655469177186606e-05,
667
- "loss": 0.4306,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 0.99,
672
  "learning_rate": 3.350688253334284e-05,
673
- "loss": 0.4496,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.0,
678
  "learning_rate": 3.335829588949909e-05,
679
- "loss": 0.4529,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.01,
684
  "learning_rate": 3.320970924565533e-05,
685
- "loss": 0.422,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.02,
690
  "learning_rate": 3.306112260181157e-05,
691
- "loss": 0.4189,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.03,
696
  "learning_rate": 3.291253595796781e-05,
697
- "loss": 0.4271,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.03,
702
  "learning_rate": 3.276394931412405e-05,
703
- "loss": 0.454,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 1.04,
708
  "learning_rate": 3.2615362670280294e-05,
709
- "loss": 0.4247,
710
  "step": 58500
711
  },
712
  {
713
  "epoch": 1.05,
714
  "learning_rate": 3.2466776026436535e-05,
715
- "loss": 0.4548,
716
  "step": 59000
717
  },
718
  {
719
  "epoch": 1.06,
720
  "learning_rate": 3.2318189382592776e-05,
721
- "loss": 0.43,
722
  "step": 59500
723
  },
724
  {
725
  "epoch": 1.07,
726
  "learning_rate": 3.2169602738749024e-05,
727
- "loss": 0.4219,
728
  "step": 60000
729
  },
730
  {
731
  "epoch": 1.08,
732
  "learning_rate": 3.2021016094905265e-05,
733
- "loss": 0.4271,
734
  "step": 60500
735
  },
736
  {
737
  "epoch": 1.09,
738
  "learning_rate": 3.18724294510615e-05,
739
- "loss": 0.4264,
740
  "step": 61000
741
  },
742
  {
743
  "epoch": 1.1,
744
  "learning_rate": 3.172384280721775e-05,
745
- "loss": 0.4389,
746
  "step": 61500
747
  },
748
  {
749
  "epoch": 1.11,
750
  "learning_rate": 3.157525616337399e-05,
751
- "loss": 0.4113,
752
  "step": 62000
753
  },
754
  {
755
  "epoch": 1.11,
756
  "learning_rate": 3.142666951953023e-05,
757
- "loss": 0.4387,
758
  "step": 62500
759
  },
760
  {
761
  "epoch": 1.12,
762
  "learning_rate": 3.127808287568647e-05,
763
- "loss": 0.4227,
764
  "step": 63000
765
  },
766
  {
767
  "epoch": 1.13,
768
  "learning_rate": 3.112949623184271e-05,
769
- "loss": 0.4403,
770
  "step": 63500
771
  },
772
  {
773
  "epoch": 1.14,
774
  "learning_rate": 3.098090958799896e-05,
775
- "loss": 0.4321,
776
  "step": 64000
777
  },
778
  {
779
  "epoch": 1.15,
780
  "learning_rate": 3.0832322944155195e-05,
781
- "loss": 0.4235,
782
  "step": 64500
783
  },
784
  {
785
  "epoch": 1.16,
786
  "learning_rate": 3.0683736300311436e-05,
787
- "loss": 0.4205,
788
  "step": 65000
789
  },
790
  {
791
  "epoch": 1.17,
792
  "learning_rate": 3.0535149656467684e-05,
793
- "loss": 0.4344,
794
  "step": 65500
795
  },
796
  {
797
  "epoch": 1.18,
798
  "learning_rate": 3.0386563012623925e-05,
799
- "loss": 0.394,
800
  "step": 66000
801
  },
802
  {
803
  "epoch": 1.19,
804
  "learning_rate": 3.0237976368780163e-05,
805
- "loss": 0.404,
806
  "step": 66500
807
  },
808
  {
809
  "epoch": 1.19,
810
  "learning_rate": 3.0089389724936407e-05,
811
- "loss": 0.4273,
812
  "step": 67000
813
  },
814
  {
815
  "epoch": 1.2,
816
  "learning_rate": 2.994080308109265e-05,
817
- "loss": 0.3993,
818
  "step": 67500
819
  },
820
  {
821
  "epoch": 1.21,
822
  "learning_rate": 2.9792216437248886e-05,
823
- "loss": 0.4198,
824
  "step": 68000
825
  },
826
  {
827
  "epoch": 1.22,
828
  "learning_rate": 2.964362979340513e-05,
829
- "loss": 0.4268,
830
  "step": 68500
831
  },
832
  {
833
  "epoch": 1.23,
834
  "learning_rate": 2.9495043149561376e-05,
835
- "loss": 0.4057,
836
  "step": 69000
837
  },
838
  {
839
  "epoch": 1.24,
840
  "learning_rate": 2.9346456505717617e-05,
841
- "loss": 0.4021,
842
  "step": 69500
843
  },
844
  {
845
  "epoch": 1.25,
846
  "learning_rate": 2.9197869861873855e-05,
847
- "loss": 0.4051,
848
  "step": 70000
849
  },
850
  {
851
  "epoch": 1.26,
852
  "learning_rate": 2.90492832180301e-05,
853
- "loss": 0.394,
854
  "step": 70500
855
  },
856
  {
857
  "epoch": 1.27,
858
  "learning_rate": 2.8900696574186344e-05,
859
- "loss": 0.4048,
860
  "step": 71000
861
  },
862
  {
863
  "epoch": 1.27,
864
  "learning_rate": 2.875210993034258e-05,
865
- "loss": 0.4025,
866
  "step": 71500
867
  },
868
  {
869
  "epoch": 1.28,
870
  "learning_rate": 2.8603523286498823e-05,
871
- "loss": 0.4114,
872
  "step": 72000
873
  },
874
  {
875
  "epoch": 1.29,
876
  "learning_rate": 2.8454936642655067e-05,
877
- "loss": 0.4113,
878
  "step": 72500
879
  },
880
  {
881
  "epoch": 1.3,
882
  "learning_rate": 2.8306349998811312e-05,
883
- "loss": 0.4046,
884
  "step": 73000
885
  },
886
  {
887
  "epoch": 1.31,
888
  "learning_rate": 2.815776335496755e-05,
889
- "loss": 0.3996,
890
  "step": 73500
891
  },
892
  {
893
  "epoch": 1.32,
894
  "learning_rate": 2.800917671112379e-05,
895
- "loss": 0.392,
896
  "step": 74000
897
  },
898
  {
899
  "epoch": 1.33,
900
  "learning_rate": 2.7860590067280035e-05,
901
- "loss": 0.3942,
902
  "step": 74500
903
  },
904
  {
905
  "epoch": 1.34,
906
  "learning_rate": 2.7712003423436276e-05,
907
- "loss": 0.3926,
908
  "step": 75000
909
  },
910
  {
911
  "epoch": 1.35,
912
  "learning_rate": 2.7563416779592514e-05,
913
- "loss": 0.3851,
914
  "step": 75500
915
  },
916
  {
917
  "epoch": 1.36,
918
  "learning_rate": 2.741483013574876e-05,
919
- "loss": 0.3823,
920
  "step": 76000
921
  },
922
  {
923
  "epoch": 1.36,
924
  "learning_rate": 2.7266243491905003e-05,
925
- "loss": 0.4122,
926
  "step": 76500
927
  },
928
  {
929
  "epoch": 1.37,
930
  "learning_rate": 2.711765684806124e-05,
931
- "loss": 0.3894,
932
  "step": 77000
933
  },
934
  {
935
  "epoch": 1.38,
936
  "learning_rate": 2.6969070204217482e-05,
937
- "loss": 0.3771,
938
  "step": 77500
939
  },
940
  {
941
  "epoch": 1.39,
942
  "learning_rate": 2.6820483560373727e-05,
943
- "loss": 0.385,
944
  "step": 78000
945
  },
946
  {
947
  "epoch": 1.4,
948
  "learning_rate": 2.667189691652997e-05,
949
- "loss": 0.3993,
950
  "step": 78500
951
  },
952
  {
953
  "epoch": 1.41,
954
  "learning_rate": 2.652331027268621e-05,
955
- "loss": 0.4011,
956
  "step": 79000
957
  },
958
  {
959
  "epoch": 1.42,
960
  "learning_rate": 2.637472362884245e-05,
961
- "loss": 0.3742,
962
  "step": 79500
963
  },
964
  {
965
  "epoch": 1.43,
966
  "learning_rate": 2.6226136984998695e-05,
967
- "loss": 0.4022,
968
  "step": 80000
969
  },
970
  {
971
  "epoch": 1.44,
972
  "learning_rate": 2.607755034115494e-05,
973
- "loss": 0.3927,
974
  "step": 80500
975
  },
976
  {
977
  "epoch": 1.44,
978
  "learning_rate": 2.5928963697311177e-05,
979
- "loss": 0.3884,
980
  "step": 81000
981
  },
982
  {
983
  "epoch": 1.45,
984
  "learning_rate": 2.578037705346742e-05,
985
- "loss": 0.3982,
986
  "step": 81500
987
  },
988
  {
989
  "epoch": 1.46,
990
  "learning_rate": 2.5631790409623663e-05,
991
- "loss": 0.3956,
992
  "step": 82000
993
  },
994
  {
995
  "epoch": 1.47,
996
  "learning_rate": 2.54832037657799e-05,
997
- "loss": 0.3836,
998
  "step": 82500
999
  },
1000
  {
1001
  "epoch": 1.48,
1002
  "learning_rate": 2.5334617121936145e-05,
1003
- "loss": 0.3775,
1004
  "step": 83000
1005
  },
1006
  {
1007
  "epoch": 1.49,
1008
  "learning_rate": 2.5186030478092386e-05,
1009
- "loss": 0.3895,
1010
  "step": 83500
1011
  },
1012
  {
1013
  "epoch": 1.5,
1014
  "learning_rate": 2.503744383424863e-05,
1015
- "loss": 0.3819,
1016
  "step": 84000
1017
  },
1018
  {
1019
  "epoch": 1.51,
1020
  "learning_rate": 2.4888857190404872e-05,
1021
- "loss": 0.3801,
1022
  "step": 84500
1023
  },
1024
  {
1025
  "epoch": 1.52,
1026
  "learning_rate": 2.4740270546561113e-05,
1027
- "loss": 0.3872,
1028
  "step": 85000
1029
  },
1030
  {
1031
  "epoch": 1.52,
1032
  "learning_rate": 2.459168390271735e-05,
1033
- "loss": 0.3818,
1034
  "step": 85500
1035
  },
1036
  {
1037
  "epoch": 1.53,
1038
  "learning_rate": 2.4443097258873596e-05,
1039
- "loss": 0.3862,
1040
  "step": 86000
1041
  },
1042
  {
1043
  "epoch": 1.54,
1044
  "learning_rate": 2.4294510615029837e-05,
1045
- "loss": 0.3764,
1046
  "step": 86500
1047
  },
1048
  {
1049
  "epoch": 1.55,
1050
  "learning_rate": 2.4145923971186078e-05,
1051
- "loss": 0.3922,
1052
  "step": 87000
1053
  },
1054
  {
1055
  "epoch": 1.56,
1056
  "learning_rate": 2.399733732734232e-05,
1057
- "loss": 0.3738,
1058
  "step": 87500
1059
  },
1060
  {
1061
  "epoch": 1.57,
1062
  "learning_rate": 2.3848750683498564e-05,
1063
- "loss": 0.3913,
1064
  "step": 88000
1065
  },
1066
  {
1067
  "epoch": 1.58,
1068
  "learning_rate": 2.3700164039654805e-05,
1069
- "loss": 0.3773,
1070
  "step": 88500
1071
  },
1072
  {
1073
  "epoch": 1.59,
1074
  "learning_rate": 2.3551577395811046e-05,
1075
- "loss": 0.3763,
1076
  "step": 89000
1077
  },
1078
  {
1079
  "epoch": 1.6,
1080
  "learning_rate": 2.3402990751967287e-05,
1081
- "loss": 0.3708,
1082
  "step": 89500
1083
  },
1084
  {
1085
  "epoch": 1.6,
1086
  "learning_rate": 2.3254404108123532e-05,
1087
- "loss": 0.3738,
1088
  "step": 90000
1089
  },
1090
  {
1091
  "epoch": 1.61,
1092
  "learning_rate": 2.3105817464279773e-05,
1093
- "loss": 0.39,
1094
  "step": 90500
1095
  },
1096
  {
1097
  "epoch": 1.62,
1098
  "learning_rate": 2.2957230820436014e-05,
1099
- "loss": 0.3763,
1100
  "step": 91000
1101
  },
1102
  {
1103
  "epoch": 1.63,
1104
  "learning_rate": 2.2808644176592255e-05,
1105
- "loss": 0.3568,
1106
  "step": 91500
1107
  },
1108
  {
1109
  "epoch": 1.64,
1110
  "learning_rate": 2.2660057532748497e-05,
1111
- "loss": 0.3843,
1112
  "step": 92000
1113
  },
1114
  {
1115
  "epoch": 1.65,
1116
  "learning_rate": 2.251147088890474e-05,
1117
- "loss": 0.3707,
1118
  "step": 92500
1119
  },
1120
  {
1121
  "epoch": 1.66,
1122
  "learning_rate": 2.236288424506098e-05,
1123
- "loss": 0.3658,
1124
  "step": 93000
1125
  },
1126
  {
1127
  "epoch": 1.67,
1128
  "learning_rate": 2.2214297601217223e-05,
1129
- "loss": 0.3675,
1130
  "step": 93500
1131
  },
1132
  {
1133
  "epoch": 1.68,
1134
  "learning_rate": 2.2065710957373465e-05,
1135
- "loss": 0.3552,
1136
  "step": 94000
1137
  },
1138
  {
1139
  "epoch": 1.68,
1140
  "learning_rate": 2.191712431352971e-05,
1141
- "loss": 0.3709,
1142
  "step": 94500
1143
  },
1144
  {
1145
  "epoch": 1.69,
1146
  "learning_rate": 2.1768537669685947e-05,
1147
- "loss": 0.3788,
1148
  "step": 95000
1149
  },
1150
  {
1151
  "epoch": 1.7,
1152
  "learning_rate": 2.1619951025842188e-05,
1153
- "loss": 0.3618,
1154
  "step": 95500
1155
  },
1156
  {
1157
  "epoch": 1.71,
1158
  "learning_rate": 2.1471364381998433e-05,
1159
- "loss": 0.375,
1160
  "step": 96000
1161
  },
1162
  {
1163
  "epoch": 1.72,
1164
  "learning_rate": 2.1322777738154674e-05,
1165
- "loss": 0.3779,
1166
  "step": 96500
1167
  },
1168
  {
1169
  "epoch": 1.73,
1170
  "learning_rate": 2.1174191094310915e-05,
1171
- "loss": 0.367,
1172
  "step": 97000
1173
  },
1174
  {
1175
  "epoch": 1.74,
1176
  "learning_rate": 2.1025604450467156e-05,
1177
- "loss": 0.3846,
1178
  "step": 97500
1179
  },
1180
  {
1181
  "epoch": 1.75,
1182
  "learning_rate": 2.08770178066234e-05,
1183
- "loss": 0.3702,
1184
  "step": 98000
1185
  },
1186
  {
1187
  "epoch": 1.76,
1188
  "learning_rate": 2.0728431162779642e-05,
1189
- "loss": 0.344,
1190
  "step": 98500
1191
  },
1192
  {
1193
  "epoch": 1.77,
1194
  "learning_rate": 2.0579844518935883e-05,
1195
- "loss": 0.3597,
1196
  "step": 99000
1197
  },
1198
  {
1199
  "epoch": 1.77,
1200
  "learning_rate": 2.0431257875092124e-05,
1201
- "loss": 0.3617,
1202
  "step": 99500
1203
  },
1204
  {
1205
  "epoch": 1.78,
1206
  "learning_rate": 2.0282671231248365e-05,
1207
- "loss": 0.3747,
1208
  "step": 100000
1209
  },
1210
  {
1211
  "epoch": 1.79,
1212
  "learning_rate": 2.013408458740461e-05,
1213
- "loss": 0.3528,
1214
  "step": 100500
1215
  },
1216
  {
1217
  "epoch": 1.8,
1218
  "learning_rate": 1.9985497943560848e-05,
1219
- "loss": 0.3706,
1220
  "step": 101000
1221
  },
1222
  {
1223
  "epoch": 1.81,
1224
  "learning_rate": 1.9836911299717092e-05,
1225
- "loss": 0.3588,
1226
  "step": 101500
1227
  },
1228
  {
1229
  "epoch": 1.82,
1230
  "learning_rate": 1.9688324655873334e-05,
1231
- "loss": 0.3547,
1232
  "step": 102000
1233
  },
1234
  {
1235
  "epoch": 1.83,
1236
  "learning_rate": 1.9539738012029578e-05,
1237
- "loss": 0.3748,
1238
  "step": 102500
1239
  },
1240
  {
1241
  "epoch": 1.84,
1242
  "learning_rate": 1.9391151368185816e-05,
1243
- "loss": 0.3508,
1244
  "step": 103000
1245
  },
1246
  {
1247
  "epoch": 1.85,
1248
  "learning_rate": 1.924256472434206e-05,
1249
- "loss": 0.3453,
1250
  "step": 103500
1251
  },
1252
  {
1253
  "epoch": 1.85,
1254
  "learning_rate": 1.90939780804983e-05,
1255
- "loss": 0.3544,
1256
  "step": 104000
1257
  },
1258
  {
1259
  "epoch": 1.86,
1260
  "learning_rate": 1.8945391436654543e-05,
1261
- "loss": 0.354,
1262
  "step": 104500
1263
  },
1264
  {
1265
  "epoch": 1.87,
1266
  "learning_rate": 1.8796804792810784e-05,
1267
- "loss": 0.3524,
1268
  "step": 105000
1269
  },
1270
  {
1271
  "epoch": 1.88,
1272
  "learning_rate": 1.8648218148967025e-05,
1273
- "loss": 0.3463,
1274
  "step": 105500
1275
  },
1276
  {
1277
  "epoch": 1.89,
1278
  "learning_rate": 1.849963150512327e-05,
1279
- "loss": 0.3571,
1280
  "step": 106000
1281
  },
1282
  {
1283
  "epoch": 1.9,
1284
  "learning_rate": 1.835104486127951e-05,
1285
- "loss": 0.3534,
1286
  "step": 106500
1287
  },
1288
  {
1289
  "epoch": 1.91,
1290
  "learning_rate": 1.8202458217435752e-05,
1291
- "loss": 0.347,
1292
  "step": 107000
1293
  },
1294
  {
1295
  "epoch": 1.92,
1296
  "learning_rate": 1.8053871573591993e-05,
1297
- "loss": 0.3345,
1298
  "step": 107500
1299
  },
1300
  {
1301
  "epoch": 1.93,
1302
  "learning_rate": 1.7905284929748238e-05,
1303
- "loss": 0.3407,
1304
  "step": 108000
1305
  },
1306
  {
1307
  "epoch": 1.93,
1308
  "learning_rate": 1.775669828590448e-05,
1309
- "loss": 0.3517,
1310
  "step": 108500
1311
  },
1312
  {
1313
  "epoch": 1.94,
1314
  "learning_rate": 1.760811164206072e-05,
1315
- "loss": 0.3497,
1316
  "step": 109000
1317
  },
1318
  {
1319
  "epoch": 1.95,
1320
  "learning_rate": 1.745952499821696e-05,
1321
- "loss": 0.3672,
1322
  "step": 109500
1323
  },
1324
  {
1325
  "epoch": 1.96,
1326
  "learning_rate": 1.7310938354373202e-05,
1327
- "loss": 0.349,
1328
  "step": 110000
1329
  },
1330
  {
1331
  "epoch": 1.97,
1332
  "learning_rate": 1.7162351710529444e-05,
1333
- "loss": 0.3431,
1334
  "step": 110500
1335
  },
1336
  {
1337
  "epoch": 1.98,
1338
  "learning_rate": 1.7013765066685685e-05,
1339
- "loss": 0.3349,
1340
  "step": 111000
1341
  },
1342
  {
1343
  "epoch": 1.99,
1344
  "learning_rate": 1.686517842284193e-05,
1345
- "loss": 0.3499,
1346
  "step": 111500
1347
  },
1348
  {
1349
  "epoch": 2.0,
1350
  "learning_rate": 1.671659177899817e-05,
1351
- "loss": 0.3493,
1352
  "step": 112000
1353
  },
1354
  {
1355
  "epoch": 2.01,
1356
  "learning_rate": 1.656800513515441e-05,
1357
- "loss": 0.3375,
1358
  "step": 112500
1359
  },
1360
  {
1361
  "epoch": 2.01,
1362
  "learning_rate": 1.6419418491310653e-05,
1363
- "loss": 0.3255,
1364
  "step": 113000
1365
  },
1366
  {
1367
  "epoch": 2.02,
1368
  "learning_rate": 1.6270831847466897e-05,
1369
- "loss": 0.3455,
1370
  "step": 113500
1371
  },
1372
  {
1373
  "epoch": 2.03,
1374
  "learning_rate": 1.612224520362314e-05,
1375
- "loss": 0.3315,
1376
  "step": 114000
1377
  },
1378
  {
1379
  "epoch": 2.04,
1380
  "learning_rate": 1.597365855977938e-05,
1381
- "loss": 0.34,
1382
  "step": 114500
1383
  },
1384
  {
1385
  "epoch": 2.05,
1386
  "learning_rate": 1.582507191593562e-05,
1387
- "loss": 0.3372,
1388
  "step": 115000
1389
  },
1390
  {
1391
  "epoch": 2.06,
1392
  "learning_rate": 1.5676485272091862e-05,
1393
- "loss": 0.3172,
1394
  "step": 115500
1395
  },
1396
  {
1397
  "epoch": 2.07,
1398
  "learning_rate": 1.5527898628248107e-05,
1399
- "loss": 0.3419,
1400
  "step": 116000
1401
  },
1402
  {
1403
  "epoch": 2.08,
1404
  "learning_rate": 1.5379311984404344e-05,
1405
- "loss": 0.3334,
1406
  "step": 116500
1407
  },
1408
  {
1409
  "epoch": 2.09,
1410
  "learning_rate": 1.5230725340560589e-05,
1411
- "loss": 0.3381,
1412
  "step": 117000
1413
  },
1414
  {
1415
  "epoch": 2.1,
1416
  "learning_rate": 1.508213869671683e-05,
1417
- "loss": 0.3258,
1418
  "step": 117500
1419
  },
1420
  {
1421
  "epoch": 2.1,
1422
  "learning_rate": 1.4933552052873073e-05,
1423
- "loss": 0.344,
1424
  "step": 118000
1425
  },
1426
  {
1427
  "epoch": 2.11,
1428
  "learning_rate": 1.4784965409029314e-05,
1429
- "loss": 0.3423,
1430
  "step": 118500
1431
  },
1432
  {
1433
  "epoch": 2.12,
1434
  "learning_rate": 1.4636378765185554e-05,
1435
- "loss": 0.331,
1436
  "step": 119000
1437
  },
1438
  {
1439
  "epoch": 2.13,
1440
  "learning_rate": 1.4487792121341798e-05,
1441
- "loss": 0.3195,
1442
  "step": 119500
1443
  },
1444
  {
1445
  "epoch": 2.14,
1446
  "learning_rate": 1.4339205477498038e-05,
1447
- "loss": 0.339,
1448
  "step": 120000
1449
  },
1450
  {
1451
  "epoch": 2.15,
1452
  "learning_rate": 1.4190618833654282e-05,
1453
- "loss": 0.3384,
1454
  "step": 120500
1455
  },
1456
  {
1457
  "epoch": 2.16,
1458
  "learning_rate": 1.4042032189810522e-05,
1459
- "loss": 0.3316,
1460
  "step": 121000
1461
  },
1462
  {
1463
  "epoch": 2.17,
1464
  "learning_rate": 1.3893445545966766e-05,
1465
- "loss": 0.3453,
1466
  "step": 121500
1467
  },
1468
  {
1469
  "epoch": 2.18,
1470
  "learning_rate": 1.3744858902123006e-05,
1471
- "loss": 0.3332,
1472
  "step": 122000
1473
  },
1474
  {
1475
  "epoch": 2.18,
1476
  "learning_rate": 1.359627225827925e-05,
1477
- "loss": 0.3273,
1478
  "step": 122500
1479
  },
1480
  {
1481
  "epoch": 2.19,
1482
  "learning_rate": 1.344768561443549e-05,
1483
- "loss": 0.3439,
1484
  "step": 123000
1485
  },
1486
  {
1487
  "epoch": 2.2,
1488
  "learning_rate": 1.3299098970591731e-05,
1489
- "loss": 0.3381,
1490
  "step": 123500
1491
  },
1492
  {
1493
  "epoch": 2.21,
1494
  "learning_rate": 1.3150512326747974e-05,
1495
- "loss": 0.3208,
1496
  "step": 124000
1497
  },
1498
  {
1499
  "epoch": 2.22,
1500
  "learning_rate": 1.3001925682904215e-05,
1501
- "loss": 0.3274,
1502
  "step": 124500
1503
  },
1504
  {
1505
  "epoch": 2.23,
1506
  "learning_rate": 1.2853339039060458e-05,
1507
- "loss": 0.3319,
1508
  "step": 125000
1509
  },
1510
  {
1511
  "epoch": 2.24,
1512
  "learning_rate": 1.2704752395216699e-05,
1513
- "loss": 0.3192,
1514
  "step": 125500
1515
  },
1516
  {
1517
  "epoch": 2.25,
1518
  "learning_rate": 1.2556165751372942e-05,
1519
- "loss": 0.3308,
1520
  "step": 126000
1521
  },
1522
  {
1523
  "epoch": 2.26,
1524
  "learning_rate": 1.2407579107529183e-05,
1525
- "loss": 0.3385,
1526
  "step": 126500
1527
  },
1528
  {
1529
  "epoch": 2.26,
1530
  "learning_rate": 1.2258992463685424e-05,
1531
- "loss": 0.3191,
1532
  "step": 127000
1533
  },
1534
  {
1535
  "epoch": 2.27,
1536
  "learning_rate": 1.2110405819841667e-05,
1537
- "loss": 0.3225,
1538
  "step": 127500
1539
  },
1540
  {
1541
  "epoch": 2.28,
1542
  "learning_rate": 1.1961819175997908e-05,
1543
- "loss": 0.3103,
1544
  "step": 128000
1545
  },
1546
  {
1547
  "epoch": 2.29,
1548
  "learning_rate": 1.1813232532154151e-05,
1549
- "loss": 0.3211,
1550
  "step": 128500
1551
  },
1552
  {
1553
  "epoch": 2.3,
1554
  "learning_rate": 1.1664645888310392e-05,
1555
- "loss": 0.3291,
1556
  "step": 129000
1557
  },
1558
  {
1559
  "epoch": 2.31,
1560
  "learning_rate": 1.1516059244466634e-05,
1561
- "loss": 0.3247,
1562
  "step": 129500
1563
  },
1564
  {
1565
  "epoch": 2.32,
1566
  "learning_rate": 1.1367472600622876e-05,
1567
- "loss": 0.3214,
1568
  "step": 130000
1569
  },
1570
  {
1571
  "epoch": 2.33,
1572
  "learning_rate": 1.1218885956779118e-05,
1573
- "loss": 0.3268,
1574
  "step": 130500
1575
  },
1576
  {
1577
  "epoch": 2.34,
1578
  "learning_rate": 1.1070299312935359e-05,
1579
- "loss": 0.3191,
1580
  "step": 131000
1581
  },
1582
  {
1583
  "epoch": 2.34,
1584
  "learning_rate": 1.0921712669091602e-05,
1585
- "loss": 0.3128,
1586
  "step": 131500
1587
  },
1588
  {
1589
  "epoch": 2.35,
1590
  "learning_rate": 1.0773126025247843e-05,
1591
- "loss": 0.3187,
1592
  "step": 132000
1593
  },
1594
  {
1595
  "epoch": 2.36,
1596
  "learning_rate": 1.0624539381404084e-05,
1597
- "loss": 0.3142,
1598
  "step": 132500
1599
  },
1600
  {
1601
  "epoch": 2.37,
1602
  "learning_rate": 1.0475952737560327e-05,
1603
- "loss": 0.3179,
1604
  "step": 133000
1605
  },
1606
  {
1607
  "epoch": 2.38,
1608
  "learning_rate": 1.0327366093716568e-05,
1609
- "loss": 0.2963,
1610
  "step": 133500
1611
  },
1612
  {
1613
  "epoch": 2.39,
1614
  "learning_rate": 1.0178779449872811e-05,
1615
- "loss": 0.3248,
1616
  "step": 134000
1617
  },
1618
  {
1619
  "epoch": 2.4,
1620
  "learning_rate": 1.0030192806029052e-05,
1621
- "loss": 0.3121,
1622
  "step": 134500
1623
  },
1624
  {
1625
  "epoch": 2.41,
1626
  "learning_rate": 9.881606162185295e-06,
1627
- "loss": 0.3259,
1628
  "step": 135000
1629
  },
1630
  {
1631
  "epoch": 2.42,
1632
  "learning_rate": 9.733019518341534e-06,
1633
- "loss": 0.3219,
1634
  "step": 135500
1635
  },
1636
  {
1637
  "epoch": 2.42,
1638
  "learning_rate": 9.584432874497777e-06,
1639
- "loss": 0.3195,
1640
  "step": 136000
1641
  },
1642
  {
1643
  "epoch": 2.43,
1644
  "learning_rate": 9.435846230654018e-06,
1645
- "loss": 0.338,
1646
  "step": 136500
1647
  },
1648
  {
1649
  "epoch": 2.44,
1650
  "learning_rate": 9.287259586810261e-06,
1651
- "loss": 0.3191,
1652
  "step": 137000
1653
  },
1654
  {
1655
  "epoch": 2.45,
1656
  "learning_rate": 9.138672942966502e-06,
1657
- "loss": 0.3106,
1658
  "step": 137500
1659
  },
1660
  {
1661
  "epoch": 2.46,
1662
  "learning_rate": 8.990086299122745e-06,
1663
- "loss": 0.3288,
1664
  "step": 138000
1665
  },
1666
  {
1667
  "epoch": 2.47,
1668
  "learning_rate": 8.841499655278986e-06,
1669
- "loss": 0.3137,
1670
  "step": 138500
1671
  },
1672
  {
1673
  "epoch": 2.48,
1674
  "learning_rate": 8.69291301143523e-06,
1675
- "loss": 0.3023,
1676
  "step": 139000
1677
  },
1678
  {
1679
  "epoch": 2.49,
1680
  "learning_rate": 8.54432636759147e-06,
1681
- "loss": 0.3445,
1682
  "step": 139500
1683
  },
1684
  {
1685
  "epoch": 2.5,
1686
  "learning_rate": 8.395739723747713e-06,
1687
- "loss": 0.3244,
1688
  "step": 140000
1689
  },
1690
  {
1691
  "epoch": 2.51,
1692
  "learning_rate": 8.247153079903953e-06,
1693
- "loss": 0.3232,
1694
  "step": 140500
1695
  },
1696
  {
1697
  "epoch": 2.51,
1698
  "learning_rate": 8.098566436060196e-06,
1699
- "loss": 0.3013,
1700
  "step": 141000
1701
  },
1702
  {
1703
  "epoch": 2.52,
1704
  "learning_rate": 7.949979792216437e-06,
1705
- "loss": 0.3229,
1706
  "step": 141500
1707
  },
1708
  {
1709
  "epoch": 2.53,
1710
  "learning_rate": 7.80139314837268e-06,
1711
- "loss": 0.3081,
1712
  "step": 142000
1713
  },
1714
  {
1715
  "epoch": 2.54,
1716
  "learning_rate": 7.652806504528921e-06,
1717
- "loss": 0.3127,
1718
  "step": 142500
1719
  },
1720
  {
1721
  "epoch": 2.55,
1722
  "learning_rate": 7.504219860685163e-06,
1723
- "loss": 0.3214,
1724
  "step": 143000
1725
  },
1726
  {
1727
  "epoch": 2.56,
1728
  "learning_rate": 7.355633216841405e-06,
1729
- "loss": 0.3262,
1730
  "step": 143500
1731
  },
1732
  {
1733
  "epoch": 2.57,
1734
  "learning_rate": 7.207046572997647e-06,
1735
- "loss": 0.3147,
1736
  "step": 144000
1737
  },
1738
  {
1739
  "epoch": 2.58,
1740
  "learning_rate": 7.058459929153889e-06,
1741
- "loss": 0.3014,
1742
  "step": 144500
1743
  },
1744
  {
1745
  "epoch": 2.59,
1746
  "learning_rate": 6.909873285310129e-06,
1747
- "loss": 0.329,
1748
  "step": 145000
1749
  },
1750
  {
1751
  "epoch": 2.59,
1752
  "learning_rate": 6.761286641466371e-06,
1753
- "loss": 0.3055,
1754
  "step": 145500
1755
  },
1756
  {
1757
  "epoch": 2.6,
1758
  "learning_rate": 6.612699997622613e-06,
1759
- "loss": 0.3187,
1760
  "step": 146000
1761
  },
1762
  {
1763
  "epoch": 2.61,
1764
  "learning_rate": 6.464113353778855e-06,
1765
- "loss": 0.3061,
1766
  "step": 146500
1767
  },
1768
  {
1769
  "epoch": 2.62,
1770
  "learning_rate": 6.315526709935097e-06,
1771
- "loss": 0.3069,
1772
  "step": 147000
1773
  },
1774
  {
1775
  "epoch": 2.63,
1776
  "learning_rate": 6.1669400660913394e-06,
1777
- "loss": 0.292,
1778
  "step": 147500
1779
  },
1780
  {
1781
  "epoch": 2.64,
1782
  "learning_rate": 6.0183534222475815e-06,
1783
- "loss": 0.3216,
1784
  "step": 148000
1785
  },
1786
  {
1787
  "epoch": 2.65,
1788
  "learning_rate": 5.869766778403823e-06,
1789
- "loss": 0.3015,
1790
  "step": 148500
1791
  },
1792
  {
1793
  "epoch": 2.66,
1794
  "learning_rate": 5.721180134560065e-06,
1795
- "loss": 0.3182,
1796
  "step": 149000
1797
  },
1798
  {
1799
  "epoch": 2.67,
1800
  "learning_rate": 5.572593490716307e-06,
1801
- "loss": 0.3211,
1802
  "step": 149500
1803
  },
1804
  {
1805
  "epoch": 2.67,
1806
  "learning_rate": 5.424006846872549e-06,
1807
- "loss": 0.3057,
1808
  "step": 150000
1809
  },
1810
  {
1811
  "epoch": 2.68,
1812
  "learning_rate": 5.275420203028791e-06,
1813
- "loss": 0.3081,
1814
  "step": 150500
1815
  },
1816
  {
1817
  "epoch": 2.69,
1818
  "learning_rate": 5.126833559185032e-06,
1819
- "loss": 0.3293,
1820
  "step": 151000
1821
  },
1822
  {
1823
  "epoch": 2.7,
1824
  "learning_rate": 4.978246915341274e-06,
1825
- "loss": 0.2971,
1826
  "step": 151500
1827
  },
1828
  {
1829
  "epoch": 2.71,
1830
  "learning_rate": 4.829660271497516e-06,
1831
- "loss": 0.3114,
1832
  "step": 152000
1833
  },
1834
  {
1835
  "epoch": 2.72,
1836
  "learning_rate": 4.681073627653758e-06,
1837
- "loss": 0.303,
1838
  "step": 152500
1839
  },
1840
  {
1841
  "epoch": 2.73,
1842
  "learning_rate": 4.53248698381e-06,
1843
- "loss": 0.2996,
1844
  "step": 153000
1845
  },
1846
  {
1847
  "epoch": 2.74,
1848
  "learning_rate": 4.383900339966241e-06,
1849
- "loss": 0.3162,
1850
  "step": 153500
1851
  },
1852
  {
1853
  "epoch": 2.75,
1854
  "learning_rate": 4.235313696122483e-06,
1855
- "loss": 0.3036,
1856
  "step": 154000
1857
  },
1858
  {
1859
  "epoch": 2.75,
1860
  "learning_rate": 4.086727052278725e-06,
1861
- "loss": 0.2897,
1862
  "step": 154500
1863
  },
1864
  {
1865
  "epoch": 2.76,
1866
  "learning_rate": 3.938140408434967e-06,
1867
- "loss": 0.301,
1868
  "step": 155000
1869
  },
1870
  {
1871
  "epoch": 2.77,
1872
  "learning_rate": 3.7895537645912088e-06,
1873
- "loss": 0.296,
1874
  "step": 155500
1875
  },
1876
  {
1877
  "epoch": 2.78,
1878
  "learning_rate": 3.6409671207474504e-06,
1879
- "loss": 0.3033,
1880
  "step": 156000
1881
  },
1882
  {
1883
  "epoch": 2.79,
1884
  "learning_rate": 3.4923804769036924e-06,
1885
- "loss": 0.3047,
1886
  "step": 156500
1887
  },
1888
  {
1889
  "epoch": 2.8,
1890
  "learning_rate": 3.343793833059934e-06,
1891
- "loss": 0.2916,
1892
  "step": 157000
1893
  },
1894
  {
1895
  "epoch": 2.81,
1896
  "learning_rate": 3.195207189216176e-06,
1897
- "loss": 0.2832,
1898
  "step": 157500
1899
  },
1900
  {
1901
  "epoch": 2.82,
1902
  "learning_rate": 3.0466205453724176e-06,
1903
- "loss": 0.3045,
1904
  "step": 158000
1905
  },
1906
  {
1907
  "epoch": 2.83,
1908
  "learning_rate": 2.898033901528659e-06,
1909
- "loss": 0.2803,
1910
  "step": 158500
1911
  },
1912
  {
1913
  "epoch": 2.84,
1914
  "learning_rate": 2.749447257684901e-06,
1915
- "loss": 0.3031,
1916
  "step": 159000
1917
  },
1918
  {
1919
  "epoch": 2.84,
1920
  "learning_rate": 2.6008606138411432e-06,
1921
- "loss": 0.3074,
1922
  "step": 159500
1923
  },
1924
  {
1925
  "epoch": 2.85,
1926
  "learning_rate": 2.452273969997385e-06,
1927
- "loss": 0.2931,
1928
  "step": 160000
1929
  },
1930
  {
1931
  "epoch": 2.86,
1932
  "learning_rate": 2.303687326153627e-06,
1933
- "loss": 0.3012,
1934
  "step": 160500
1935
  },
1936
  {
1937
  "epoch": 2.87,
1938
  "learning_rate": 2.1551006823098684e-06,
1939
- "loss": 0.3068,
1940
  "step": 161000
1941
  },
1942
  {
1943
  "epoch": 2.88,
1944
  "learning_rate": 2.0065140384661104e-06,
1945
- "loss": 0.2975,
1946
  "step": 161500
1947
  },
1948
  {
1949
  "epoch": 2.89,
1950
  "learning_rate": 1.8579273946223525e-06,
1951
- "loss": 0.3072,
1952
  "step": 162000
1953
  },
1954
  {
1955
  "epoch": 2.9,
1956
  "learning_rate": 1.709340750778594e-06,
1957
- "loss": 0.2942,
1958
  "step": 162500
1959
  },
1960
  {
1961
  "epoch": 2.91,
1962
  "learning_rate": 1.5607541069348359e-06,
1963
- "loss": 0.3017,
1964
  "step": 163000
1965
  },
1966
  {
1967
  "epoch": 2.92,
1968
  "learning_rate": 1.4121674630910779e-06,
1969
- "loss": 0.2992,
1970
  "step": 163500
1971
  },
1972
  {
1973
  "epoch": 2.92,
1974
  "learning_rate": 1.2635808192473197e-06,
1975
- "loss": 0.3151,
1976
  "step": 164000
1977
  },
1978
  {
1979
  "epoch": 2.93,
1980
  "learning_rate": 1.1149941754035615e-06,
1981
- "loss": 0.3089,
1982
  "step": 164500
1983
  },
1984
  {
1985
  "epoch": 2.94,
1986
  "learning_rate": 9.66407531559803e-07,
1987
- "loss": 0.2798,
1988
  "step": 165000
1989
  },
1990
  {
1991
  "epoch": 2.95,
1992
  "learning_rate": 8.17820887716045e-07,
1993
- "loss": 0.3011,
1994
  "step": 165500
1995
  },
1996
  {
1997
  "epoch": 2.96,
1998
  "learning_rate": 6.692342438722869e-07,
1999
- "loss": 0.3067,
2000
  "step": 166000
2001
  },
2002
  {
2003
  "epoch": 2.97,
2004
  "learning_rate": 5.206476000285286e-07,
2005
- "loss": 0.3012,
2006
  "step": 166500
2007
  },
2008
  {
2009
  "epoch": 2.98,
2010
  "learning_rate": 3.720609561847705e-07,
2011
- "loss": 0.2898,
2012
  "step": 167000
2013
  },
2014
  {
2015
  "epoch": 2.99,
2016
  "learning_rate": 2.234743123410123e-07,
2017
- "loss": 0.2955,
2018
  "step": 167500
2019
  },
2020
  {
2021
  "epoch": 3.0,
2022
  "learning_rate": 7.488766849725412e-08,
2023
- "loss": 0.2849,
2024
  "step": 168000
2025
  },
2026
  {
2027
  "epoch": 3.0,
2028
  "step": 168252,
2029
- "total_flos": 3.8879648307414296e+16,
2030
- "train_loss": 0.4256299230643896,
2031
- "train_runtime": 18063.8373,
2032
- "train_samples_per_second": 18.629,
2033
- "train_steps_per_second": 9.314
2034
  }
2035
  ],
2036
  "max_steps": 168252,
2037
  "num_train_epochs": 3,
2038
- "total_flos": 3.8879648307414296e+16,
2039
  "trial_name": null,
2040
  "trial_params": null
2041
  }
 
10
  {
11
  "epoch": 0.01,
12
  "learning_rate": 4.9851413356156244e-05,
13
+ "loss": 4.6864,
14
  "step": 500
15
  },
16
  {
17
  "epoch": 0.02,
18
  "learning_rate": 4.9702826712312485e-05,
19
+ "loss": 4.4314,
20
  "step": 1000
21
  },
22
  {
23
  "epoch": 0.03,
24
  "learning_rate": 4.9554240068468726e-05,
25
+ "loss": 4.1833,
26
  "step": 1500
27
  },
28
  {
29
  "epoch": 0.04,
30
  "learning_rate": 4.940565342462497e-05,
31
+ "loss": 4.1583,
32
  "step": 2000
33
  },
34
  {
35
  "epoch": 0.04,
36
  "learning_rate": 4.9257066780781215e-05,
37
+ "loss": 4.2004,
38
  "step": 2500
39
  },
40
  {
41
  "epoch": 0.05,
42
  "learning_rate": 4.910848013693745e-05,
43
+ "loss": 4.2117,
44
  "step": 3000
45
  },
46
  {
47
  "epoch": 0.06,
48
  "learning_rate": 4.89598934930937e-05,
49
+ "loss": 4.2172,
50
  "step": 3500
51
  },
52
  {
53
  "epoch": 0.07,
54
  "learning_rate": 4.881130684924994e-05,
55
+ "loss": 4.2154,
56
  "step": 4000
57
  },
58
  {
59
  "epoch": 0.08,
60
  "learning_rate": 4.866272020540618e-05,
61
+ "loss": 4.1957,
62
  "step": 4500
63
  },
64
  {
65
  "epoch": 0.09,
66
  "learning_rate": 4.851413356156242e-05,
67
+ "loss": 4.199,
68
  "step": 5000
69
  },
70
  {
71
  "epoch": 0.1,
72
  "learning_rate": 4.836554691771866e-05,
73
+ "loss": 4.2366,
74
  "step": 5500
75
  },
76
  {
77
  "epoch": 0.11,
78
  "learning_rate": 4.82169602738749e-05,
79
+ "loss": 4.1593,
80
  "step": 6000
81
  },
82
  {
83
  "epoch": 0.12,
84
  "learning_rate": 4.8068373630031144e-05,
85
+ "loss": 4.1763,
86
  "step": 6500
87
  },
88
  {
89
  "epoch": 0.12,
90
  "learning_rate": 4.7919786986187386e-05,
91
+ "loss": 4.1563,
92
  "step": 7000
93
  },
94
  {
95
  "epoch": 0.13,
96
  "learning_rate": 4.7771200342343634e-05,
97
+ "loss": 4.18,
98
  "step": 7500
99
  },
100
  {
101
  "epoch": 0.14,
102
  "learning_rate": 4.7622613698499875e-05,
103
+ "loss": 4.1764,
104
  "step": 8000
105
  },
106
  {
107
  "epoch": 0.15,
108
  "learning_rate": 4.747402705465611e-05,
109
+ "loss": 4.1724,
110
  "step": 8500
111
  },
112
  {
113
  "epoch": 0.16,
114
  "learning_rate": 4.732544041081236e-05,
115
+ "loss": 4.1814,
116
  "step": 9000
117
  },
118
  {
119
  "epoch": 0.17,
120
  "learning_rate": 4.71768537669686e-05,
121
+ "loss": 4.1784,
122
  "step": 9500
123
  },
124
  {
125
  "epoch": 0.18,
126
  "learning_rate": 4.702826712312484e-05,
127
+ "loss": 4.19,
128
  "step": 10000
129
  },
130
  {
131
  "epoch": 0.19,
132
  "learning_rate": 4.687968047928108e-05,
133
+ "loss": 4.165,
134
  "step": 10500
135
  },
136
  {
137
  "epoch": 0.2,
138
  "learning_rate": 4.673109383543732e-05,
139
+ "loss": 4.1738,
140
  "step": 11000
141
  },
142
  {
143
  "epoch": 0.21,
144
  "learning_rate": 4.658250719159356e-05,
145
+ "loss": 4.1795,
146
  "step": 11500
147
  },
148
  {
149
  "epoch": 0.21,
150
  "learning_rate": 4.6433920547749804e-05,
151
+ "loss": 4.1975,
152
  "step": 12000
153
  },
154
  {
155
  "epoch": 0.22,
156
  "learning_rate": 4.6285333903906045e-05,
157
+ "loss": 4.1582,
158
  "step": 12500
159
  },
160
  {
161
  "epoch": 0.23,
162
  "learning_rate": 4.613674726006229e-05,
163
+ "loss": 4.1763,
164
  "step": 13000
165
  },
166
  {
167
  "epoch": 0.24,
168
  "learning_rate": 4.5988160616218534e-05,
169
+ "loss": 4.1703,
170
  "step": 13500
171
  },
172
  {
173
  "epoch": 0.25,
174
  "learning_rate": 4.583957397237477e-05,
175
+ "loss": 4.1839,
176
  "step": 14000
177
  },
178
  {
179
  "epoch": 0.26,
180
  "learning_rate": 4.569098732853102e-05,
181
+ "loss": 4.2002,
182
  "step": 14500
183
  },
184
  {
185
  "epoch": 0.27,
186
  "learning_rate": 4.554240068468726e-05,
187
+ "loss": 4.1428,
188
  "step": 15000
189
  },
190
  {
191
  "epoch": 0.28,
192
  "learning_rate": 4.53938140408435e-05,
193
+ "loss": 4.187,
194
  "step": 15500
195
  },
196
  {
197
  "epoch": 0.29,
198
  "learning_rate": 4.524522739699974e-05,
199
+ "loss": 4.1869,
200
  "step": 16000
201
  },
202
  {
203
  "epoch": 0.29,
204
  "learning_rate": 4.509664075315598e-05,
205
+ "loss": 4.1829,
206
  "step": 16500
207
  },
208
  {
209
  "epoch": 0.3,
210
  "learning_rate": 4.494805410931223e-05,
211
+ "loss": 4.1572,
212
  "step": 17000
213
  },
214
  {
215
  "epoch": 0.31,
216
  "learning_rate": 4.4799467465468464e-05,
217
+ "loss": 4.1367,
218
  "step": 17500
219
  },
220
  {
221
  "epoch": 0.32,
222
  "learning_rate": 4.4650880821624705e-05,
223
+ "loss": 4.1692,
224
  "step": 18000
225
  },
226
  {
227
  "epoch": 0.33,
228
  "learning_rate": 4.450229417778095e-05,
229
+ "loss": 4.182,
230
  "step": 18500
231
  },
232
  {
233
  "epoch": 0.34,
234
  "learning_rate": 4.435370753393719e-05,
235
+ "loss": 4.2536,
236
  "step": 19000
237
  },
238
  {
239
  "epoch": 0.35,
240
  "learning_rate": 4.4205120890093435e-05,
241
+ "loss": 4.2086,
242
  "step": 19500
243
  },
244
  {
245
  "epoch": 0.36,
246
  "learning_rate": 4.4056534246249676e-05,
247
+ "loss": 4.1256,
248
  "step": 20000
249
  },
250
  {
251
  "epoch": 0.37,
252
  "learning_rate": 4.390794760240592e-05,
253
+ "loss": 4.1798,
254
  "step": 20500
255
  },
256
  {
257
  "epoch": 0.37,
258
  "learning_rate": 4.375936095856216e-05,
259
+ "loss": 4.1822,
260
  "step": 21000
261
  },
262
  {
263
  "epoch": 0.38,
264
  "learning_rate": 4.36107743147184e-05,
265
+ "loss": 4.1925,
266
  "step": 21500
267
  },
268
  {
269
  "epoch": 0.39,
270
  "learning_rate": 4.346218767087464e-05,
271
+ "loss": 4.1702,
272
  "step": 22000
273
  },
274
  {
275
  "epoch": 0.4,
276
  "learning_rate": 4.331360102703089e-05,
277
+ "loss": 4.1983,
278
  "step": 22500
279
  },
280
  {
281
  "epoch": 0.41,
282
  "learning_rate": 4.3165014383187123e-05,
283
+ "loss": 4.2146,
284
  "step": 23000
285
  },
286
  {
287
  "epoch": 0.42,
288
  "learning_rate": 4.3016427739343365e-05,
289
+ "loss": 4.2053,
290
  "step": 23500
291
  },
292
  {
293
  "epoch": 0.43,
294
  "learning_rate": 4.286784109549961e-05,
295
+ "loss": 4.2043,
296
  "step": 24000
297
  },
298
  {
299
  "epoch": 0.44,
300
  "learning_rate": 4.271925445165585e-05,
301
+ "loss": 4.1945,
302
  "step": 24500
303
  },
304
  {
305
  "epoch": 0.45,
306
  "learning_rate": 4.2570667807812095e-05,
307
+ "loss": 4.1583,
308
  "step": 25000
309
  },
310
  {
311
  "epoch": 0.45,
312
  "learning_rate": 4.2422081163968336e-05,
313
+ "loss": 4.2209,
314
  "step": 25500
315
  },
316
  {
317
  "epoch": 0.46,
318
  "learning_rate": 4.227349452012458e-05,
319
+ "loss": 4.1892,
320
  "step": 26000
321
  },
322
  {
323
  "epoch": 0.47,
324
  "learning_rate": 4.212490787628082e-05,
325
+ "loss": 4.1542,
326
  "step": 26500
327
  },
328
  {
329
  "epoch": 0.48,
330
  "learning_rate": 4.197632123243706e-05,
331
+ "loss": 4.1594,
332
  "step": 27000
333
  },
334
  {
335
  "epoch": 0.49,
336
  "learning_rate": 4.18277345885933e-05,
337
+ "loss": 4.1926,
338
  "step": 27500
339
  },
340
  {
341
  "epoch": 0.5,
342
  "learning_rate": 4.167914794474955e-05,
343
+ "loss": 4.1596,
344
  "step": 28000
345
  },
346
  {
347
  "epoch": 0.51,
348
  "learning_rate": 4.153056130090578e-05,
349
+ "loss": 4.1525,
350
  "step": 28500
351
  },
352
  {
353
  "epoch": 0.52,
354
  "learning_rate": 4.138197465706203e-05,
355
+ "loss": 4.1282,
356
  "step": 29000
357
  },
358
  {
359
  "epoch": 0.53,
360
  "learning_rate": 4.123338801321827e-05,
361
+ "loss": 4.1569,
362
  "step": 29500
363
  },
364
  {
365
  "epoch": 0.53,
366
  "learning_rate": 4.1084801369374507e-05,
367
+ "loss": 4.1544,
368
  "step": 30000
369
  },
370
  {
371
  "epoch": 0.54,
372
  "learning_rate": 4.0936214725530755e-05,
373
+ "loss": 4.1984,
374
  "step": 30500
375
  },
376
  {
377
  "epoch": 0.55,
378
  "learning_rate": 4.0787628081686996e-05,
379
+ "loss": 4.1907,
380
  "step": 31000
381
  },
382
  {
383
  "epoch": 0.56,
384
  "learning_rate": 4.063904143784324e-05,
385
+ "loss": 4.2402,
386
  "step": 31500
387
  },
388
  {
389
  "epoch": 0.57,
390
  "learning_rate": 4.049045479399948e-05,
391
+ "loss": 4.1914,
392
  "step": 32000
393
  },
394
  {
395
  "epoch": 0.58,
396
  "learning_rate": 4.034186815015572e-05,
397
+ "loss": 4.1622,
398
  "step": 32500
399
  },
400
  {
401
  "epoch": 0.59,
402
  "learning_rate": 4.019328150631197e-05,
403
+ "loss": 4.1444,
404
  "step": 33000
405
  },
406
  {
407
  "epoch": 0.6,
408
  "learning_rate": 4.00446948624682e-05,
409
+ "loss": 4.1581,
410
  "step": 33500
411
  },
412
  {
413
  "epoch": 0.61,
414
  "learning_rate": 3.989610821862444e-05,
415
+ "loss": 4.2014,
416
  "step": 34000
417
  },
418
  {
419
  "epoch": 0.62,
420
  "learning_rate": 3.974752157478069e-05,
421
+ "loss": 4.1797,
422
  "step": 34500
423
  },
424
  {
425
  "epoch": 0.62,
426
  "learning_rate": 3.959893493093693e-05,
427
+ "loss": 4.1669,
428
  "step": 35000
429
  },
430
  {
431
  "epoch": 0.63,
432
  "learning_rate": 3.9450348287093166e-05,
433
+ "loss": 4.1538,
434
  "step": 35500
435
  },
436
  {
437
  "epoch": 0.64,
438
  "learning_rate": 3.9301761643249414e-05,
439
+ "loss": 4.1359,
440
  "step": 36000
441
  },
442
  {
443
  "epoch": 0.65,
444
  "learning_rate": 3.9153174999405655e-05,
445
+ "loss": 4.1551,
446
  "step": 36500
447
  },
448
  {
449
  "epoch": 0.66,
450
  "learning_rate": 3.9004588355561897e-05,
451
+ "loss": 4.2037,
452
  "step": 37000
453
  },
454
  {
455
  "epoch": 0.67,
456
  "learning_rate": 3.885600171171814e-05,
457
+ "loss": 4.1428,
458
  "step": 37500
459
  },
460
  {
461
  "epoch": 0.68,
462
  "learning_rate": 3.870741506787438e-05,
463
+ "loss": 4.1649,
464
  "step": 38000
465
  },
466
  {
467
  "epoch": 0.69,
468
  "learning_rate": 3.855882842403063e-05,
469
+ "loss": 4.1858,
470
  "step": 38500
471
  },
472
  {
473
  "epoch": 0.7,
474
  "learning_rate": 3.841024178018686e-05,
475
+ "loss": 4.1586,
476
  "step": 39000
477
  },
478
  {
479
  "epoch": 0.7,
480
  "learning_rate": 3.82616551363431e-05,
481
+ "loss": 4.1849,
482
  "step": 39500
483
  },
484
  {
485
  "epoch": 0.71,
486
  "learning_rate": 3.811306849249935e-05,
487
+ "loss": 4.1875,
488
  "step": 40000
489
  },
490
  {
491
  "epoch": 0.72,
492
  "learning_rate": 3.796448184865559e-05,
493
+ "loss": 4.1367,
494
  "step": 40500
495
  },
496
  {
497
  "epoch": 0.73,
498
  "learning_rate": 3.781589520481183e-05,
499
+ "loss": 4.1681,
500
  "step": 41000
501
  },
502
  {
503
  "epoch": 0.74,
504
  "learning_rate": 3.7667308560968074e-05,
505
+ "loss": 4.1701,
506
  "step": 41500
507
  },
508
  {
509
  "epoch": 0.75,
510
  "learning_rate": 3.7518721917124315e-05,
511
+ "loss": 4.1509,
512
  "step": 42000
513
  },
514
  {
515
  "epoch": 0.76,
516
  "learning_rate": 3.737013527328056e-05,
517
+ "loss": 4.2155,
518
  "step": 42500
519
  },
520
  {
521
  "epoch": 0.77,
522
  "learning_rate": 3.72215486294368e-05,
523
+ "loss": 4.2056,
524
  "step": 43000
525
  },
526
  {
527
  "epoch": 0.78,
528
  "learning_rate": 3.707296198559304e-05,
529
+ "loss": 4.1598,
530
  "step": 43500
531
  },
532
  {
533
  "epoch": 0.78,
534
  "learning_rate": 3.6924375341749286e-05,
535
+ "loss": 4.1701,
536
  "step": 44000
537
  },
538
  {
539
  "epoch": 0.79,
540
  "learning_rate": 3.677578869790552e-05,
541
+ "loss": 4.1735,
542
  "step": 44500
543
  },
544
  {
545
  "epoch": 0.8,
546
  "learning_rate": 3.662720205406177e-05,
547
+ "loss": 4.1457,
548
  "step": 45000
549
  },
550
  {
551
  "epoch": 0.81,
552
  "learning_rate": 3.647861541021801e-05,
553
+ "loss": 4.1469,
554
  "step": 45500
555
  },
556
  {
557
  "epoch": 0.82,
558
  "learning_rate": 3.633002876637425e-05,
559
+ "loss": 4.1014,
560
  "step": 46000
561
  },
562
  {
563
  "epoch": 0.83,
564
  "learning_rate": 3.618144212253049e-05,
565
+ "loss": 4.1308,
566
  "step": 46500
567
  },
568
  {
569
  "epoch": 0.84,
570
  "learning_rate": 3.6032855478686734e-05,
571
+ "loss": 4.1282,
572
  "step": 47000
573
  },
574
  {
575
  "epoch": 0.85,
576
  "learning_rate": 3.5884268834842975e-05,
577
+ "loss": 4.1384,
578
  "step": 47500
579
  },
580
  {
581
  "epoch": 0.86,
582
  "learning_rate": 3.5735682190999216e-05,
583
+ "loss": 4.1709,
584
  "step": 48000
585
  },
586
  {
587
  "epoch": 0.86,
588
  "learning_rate": 3.558709554715546e-05,
589
+ "loss": 4.1861,
590
  "step": 48500
591
  },
592
  {
593
  "epoch": 0.87,
594
  "learning_rate": 3.54385089033117e-05,
595
+ "loss": 4.1939,
596
  "step": 49000
597
  },
598
  {
599
  "epoch": 0.88,
600
  "learning_rate": 3.5289922259467946e-05,
601
+ "loss": 4.186,
602
  "step": 49500
603
  },
604
  {
605
  "epoch": 0.89,
606
  "learning_rate": 3.514133561562418e-05,
607
+ "loss": 4.1661,
608
  "step": 50000
609
  },
610
  {
611
  "epoch": 0.9,
612
  "learning_rate": 3.499274897178043e-05,
613
+ "loss": 4.1291,
614
  "step": 50500
615
  },
616
  {
617
  "epoch": 0.91,
618
  "learning_rate": 3.484416232793667e-05,
619
+ "loss": 4.1377,
620
  "step": 51000
621
  },
622
  {
623
  "epoch": 0.92,
624
  "learning_rate": 3.469557568409291e-05,
625
+ "loss": 4.2083,
626
  "step": 51500
627
  },
628
  {
629
  "epoch": 0.93,
630
  "learning_rate": 3.454698904024915e-05,
631
+ "loss": 4.1944,
632
  "step": 52000
633
  },
634
  {
635
  "epoch": 0.94,
636
  "learning_rate": 3.439840239640539e-05,
637
+ "loss": 4.1597,
638
  "step": 52500
639
  },
640
  {
641
  "epoch": 0.95,
642
  "learning_rate": 3.4249815752561634e-05,
643
+ "loss": 4.1432,
644
  "step": 53000
645
  },
646
  {
647
  "epoch": 0.95,
648
  "learning_rate": 3.4101229108717876e-05,
649
+ "loss": 4.1659,
650
  "step": 53500
651
  },
652
  {
653
  "epoch": 0.96,
654
  "learning_rate": 3.395264246487412e-05,
655
+ "loss": 4.1568,
656
  "step": 54000
657
  },
658
  {
659
  "epoch": 0.97,
660
  "learning_rate": 3.3804055821030365e-05,
661
+ "loss": 4.1673,
662
  "step": 54500
663
  },
664
  {
665
  "epoch": 0.98,
666
  "learning_rate": 3.3655469177186606e-05,
667
+ "loss": 4.1522,
668
  "step": 55000
669
  },
670
  {
671
  "epoch": 0.99,
672
  "learning_rate": 3.350688253334284e-05,
673
+ "loss": 4.1354,
674
  "step": 55500
675
  },
676
  {
677
  "epoch": 1.0,
678
  "learning_rate": 3.335829588949909e-05,
679
+ "loss": 4.172,
680
  "step": 56000
681
  },
682
  {
683
  "epoch": 1.01,
684
  "learning_rate": 3.320970924565533e-05,
685
+ "loss": 4.1534,
686
  "step": 56500
687
  },
688
  {
689
  "epoch": 1.02,
690
  "learning_rate": 3.306112260181157e-05,
691
+ "loss": 4.1092,
692
  "step": 57000
693
  },
694
  {
695
  "epoch": 1.03,
696
  "learning_rate": 3.291253595796781e-05,
697
+ "loss": 4.1695,
698
  "step": 57500
699
  },
700
  {
701
  "epoch": 1.03,
702
  "learning_rate": 3.276394931412405e-05,
703
+ "loss": 4.1512,
704
  "step": 58000
705
  },
706
  {
707
  "epoch": 1.04,
708
  "learning_rate": 3.2615362670280294e-05,
709
+ "loss": 4.1747,
710
  "step": 58500
711
  },
712
  {
713
  "epoch": 1.05,
714
  "learning_rate": 3.2466776026436535e-05,
715
+ "loss": 4.1872,
716
  "step": 59000
717
  },
718
  {
719
  "epoch": 1.06,
720
  "learning_rate": 3.2318189382592776e-05,
721
+ "loss": 4.1634,
722
  "step": 59500
723
  },
724
  {
725
  "epoch": 1.07,
726
  "learning_rate": 3.2169602738749024e-05,
727
+ "loss": 4.1033,
728
  "step": 60000
729
  },
730
  {
731
  "epoch": 1.08,
732
  "learning_rate": 3.2021016094905265e-05,
733
+ "loss": 4.1247,
734
  "step": 60500
735
  },
736
  {
737
  "epoch": 1.09,
738
  "learning_rate": 3.18724294510615e-05,
739
+ "loss": 4.1823,
740
  "step": 61000
741
  },
742
  {
743
  "epoch": 1.1,
744
  "learning_rate": 3.172384280721775e-05,
745
+ "loss": 4.1718,
746
  "step": 61500
747
  },
748
  {
749
  "epoch": 1.11,
750
  "learning_rate": 3.157525616337399e-05,
751
+ "loss": 4.1982,
752
  "step": 62000
753
  },
754
  {
755
  "epoch": 1.11,
756
  "learning_rate": 3.142666951953023e-05,
757
+ "loss": 4.1956,
758
  "step": 62500
759
  },
760
  {
761
  "epoch": 1.12,
762
  "learning_rate": 3.127808287568647e-05,
763
+ "loss": 4.162,
764
  "step": 63000
765
  },
766
  {
767
  "epoch": 1.13,
768
  "learning_rate": 3.112949623184271e-05,
769
+ "loss": 4.1438,
770
  "step": 63500
771
  },
772
  {
773
  "epoch": 1.14,
774
  "learning_rate": 3.098090958799896e-05,
775
+ "loss": 4.2248,
776
  "step": 64000
777
  },
778
  {
779
  "epoch": 1.15,
780
  "learning_rate": 3.0832322944155195e-05,
781
+ "loss": 4.1655,
782
  "step": 64500
783
  },
784
  {
785
  "epoch": 1.16,
786
  "learning_rate": 3.0683736300311436e-05,
787
+ "loss": 4.152,
788
  "step": 65000
789
  },
790
  {
791
  "epoch": 1.17,
792
  "learning_rate": 3.0535149656467684e-05,
793
+ "loss": 4.1784,
794
  "step": 65500
795
  },
796
  {
797
  "epoch": 1.18,
798
  "learning_rate": 3.0386563012623925e-05,
799
+ "loss": 4.1958,
800
  "step": 66000
801
  },
802
  {
803
  "epoch": 1.19,
804
  "learning_rate": 3.0237976368780163e-05,
805
+ "loss": 4.156,
806
  "step": 66500
807
  },
808
  {
809
  "epoch": 1.19,
810
  "learning_rate": 3.0089389724936407e-05,
811
+ "loss": 4.178,
812
  "step": 67000
813
  },
814
  {
815
  "epoch": 1.2,
816
  "learning_rate": 2.994080308109265e-05,
817
+ "loss": 4.1562,
818
  "step": 67500
819
  },
820
  {
821
  "epoch": 1.21,
822
  "learning_rate": 2.9792216437248886e-05,
823
+ "loss": 4.1535,
824
  "step": 68000
825
  },
826
  {
827
  "epoch": 1.22,
828
  "learning_rate": 2.964362979340513e-05,
829
+ "loss": 4.1224,
830
  "step": 68500
831
  },
832
  {
833
  "epoch": 1.23,
834
  "learning_rate": 2.9495043149561376e-05,
835
+ "loss": 4.1547,
836
  "step": 69000
837
  },
838
  {
839
  "epoch": 1.24,
840
  "learning_rate": 2.9346456505717617e-05,
841
+ "loss": 4.1378,
842
  "step": 69500
843
  },
844
  {
845
  "epoch": 1.25,
846
  "learning_rate": 2.9197869861873855e-05,
847
+ "loss": 4.1573,
848
  "step": 70000
849
  },
850
  {
851
  "epoch": 1.26,
852
  "learning_rate": 2.90492832180301e-05,
853
+ "loss": 4.1575,
854
  "step": 70500
855
  },
856
  {
857
  "epoch": 1.27,
858
  "learning_rate": 2.8900696574186344e-05,
859
+ "loss": 4.1726,
860
  "step": 71000
861
  },
862
  {
863
  "epoch": 1.27,
864
  "learning_rate": 2.875210993034258e-05,
865
+ "loss": 4.1971,
866
  "step": 71500
867
  },
868
  {
869
  "epoch": 1.28,
870
  "learning_rate": 2.8603523286498823e-05,
871
+ "loss": 4.1289,
872
  "step": 72000
873
  },
874
  {
875
  "epoch": 1.29,
876
  "learning_rate": 2.8454936642655067e-05,
877
+ "loss": 4.1571,
878
  "step": 72500
879
  },
880
  {
881
  "epoch": 1.3,
882
  "learning_rate": 2.8306349998811312e-05,
883
+ "loss": 4.124,
884
  "step": 73000
885
  },
886
  {
887
  "epoch": 1.31,
888
  "learning_rate": 2.815776335496755e-05,
889
+ "loss": 4.1769,
890
  "step": 73500
891
  },
892
  {
893
  "epoch": 1.32,
894
  "learning_rate": 2.800917671112379e-05,
895
+ "loss": 4.1581,
896
  "step": 74000
897
  },
898
  {
899
  "epoch": 1.33,
900
  "learning_rate": 2.7860590067280035e-05,
901
+ "loss": 4.1828,
902
  "step": 74500
903
  },
904
  {
905
  "epoch": 1.34,
906
  "learning_rate": 2.7712003423436276e-05,
907
+ "loss": 4.1448,
908
  "step": 75000
909
  },
910
  {
911
  "epoch": 1.35,
912
  "learning_rate": 2.7563416779592514e-05,
913
+ "loss": 4.2025,
914
  "step": 75500
915
  },
916
  {
917
  "epoch": 1.36,
918
  "learning_rate": 2.741483013574876e-05,
919
+ "loss": 4.1474,
920
  "step": 76000
921
  },
922
  {
923
  "epoch": 1.36,
924
  "learning_rate": 2.7266243491905003e-05,
925
+ "loss": 4.2199,
926
  "step": 76500
927
  },
928
  {
929
  "epoch": 1.37,
930
  "learning_rate": 2.711765684806124e-05,
931
+ "loss": 4.1517,
932
  "step": 77000
933
  },
934
  {
935
  "epoch": 1.38,
936
  "learning_rate": 2.6969070204217482e-05,
937
+ "loss": 4.1469,
938
  "step": 77500
939
  },
940
  {
941
  "epoch": 1.39,
942
  "learning_rate": 2.6820483560373727e-05,
943
+ "loss": 4.1104,
944
  "step": 78000
945
  },
946
  {
947
  "epoch": 1.4,
948
  "learning_rate": 2.667189691652997e-05,
949
+ "loss": 4.1482,
950
  "step": 78500
951
  },
952
  {
953
  "epoch": 1.41,
954
  "learning_rate": 2.652331027268621e-05,
955
+ "loss": 4.1674,
956
  "step": 79000
957
  },
958
  {
959
  "epoch": 1.42,
960
  "learning_rate": 2.637472362884245e-05,
961
+ "loss": 4.1601,
962
  "step": 79500
963
  },
964
  {
965
  "epoch": 1.43,
966
  "learning_rate": 2.6226136984998695e-05,
967
+ "loss": 4.2056,
968
  "step": 80000
969
  },
970
  {
971
  "epoch": 1.44,
972
  "learning_rate": 2.607755034115494e-05,
973
+ "loss": 4.1223,
974
  "step": 80500
975
  },
976
  {
977
  "epoch": 1.44,
978
  "learning_rate": 2.5928963697311177e-05,
979
+ "loss": 4.1843,
980
  "step": 81000
981
  },
982
  {
983
  "epoch": 1.45,
984
  "learning_rate": 2.578037705346742e-05,
985
+ "loss": 4.1564,
986
  "step": 81500
987
  },
988
  {
989
  "epoch": 1.46,
990
  "learning_rate": 2.5631790409623663e-05,
991
+ "loss": 4.1476,
992
  "step": 82000
993
  },
994
  {
995
  "epoch": 1.47,
996
  "learning_rate": 2.54832037657799e-05,
997
+ "loss": 4.139,
998
  "step": 82500
999
  },
1000
  {
1001
  "epoch": 1.48,
1002
  "learning_rate": 2.5334617121936145e-05,
1003
+ "loss": 4.1541,
1004
  "step": 83000
1005
  },
1006
  {
1007
  "epoch": 1.49,
1008
  "learning_rate": 2.5186030478092386e-05,
1009
+ "loss": 4.1593,
1010
  "step": 83500
1011
  },
1012
  {
1013
  "epoch": 1.5,
1014
  "learning_rate": 2.503744383424863e-05,
1015
+ "loss": 4.1922,
1016
  "step": 84000
1017
  },
1018
  {
1019
  "epoch": 1.51,
1020
  "learning_rate": 2.4888857190404872e-05,
1021
+ "loss": 4.1539,
1022
  "step": 84500
1023
  },
1024
  {
1025
  "epoch": 1.52,
1026
  "learning_rate": 2.4740270546561113e-05,
1027
+ "loss": 4.1665,
1028
  "step": 85000
1029
  },
1030
  {
1031
  "epoch": 1.52,
1032
  "learning_rate": 2.459168390271735e-05,
1033
+ "loss": 4.1476,
1034
  "step": 85500
1035
  },
1036
  {
1037
  "epoch": 1.53,
1038
  "learning_rate": 2.4443097258873596e-05,
1039
+ "loss": 4.1933,
1040
  "step": 86000
1041
  },
1042
  {
1043
  "epoch": 1.54,
1044
  "learning_rate": 2.4294510615029837e-05,
1045
+ "loss": 4.2239,
1046
  "step": 86500
1047
  },
1048
  {
1049
  "epoch": 1.55,
1050
  "learning_rate": 2.4145923971186078e-05,
1051
+ "loss": 4.1581,
1052
  "step": 87000
1053
  },
1054
  {
1055
  "epoch": 1.56,
1056
  "learning_rate": 2.399733732734232e-05,
1057
+ "loss": 4.217,
1058
  "step": 87500
1059
  },
1060
  {
1061
  "epoch": 1.57,
1062
  "learning_rate": 2.3848750683498564e-05,
1063
+ "loss": 4.1504,
1064
  "step": 88000
1065
  },
1066
  {
1067
  "epoch": 1.58,
1068
  "learning_rate": 2.3700164039654805e-05,
1069
+ "loss": 4.1499,
1070
  "step": 88500
1071
  },
1072
  {
1073
  "epoch": 1.59,
1074
  "learning_rate": 2.3551577395811046e-05,
1075
+ "loss": 4.1879,
1076
  "step": 89000
1077
  },
1078
  {
1079
  "epoch": 1.6,
1080
  "learning_rate": 2.3402990751967287e-05,
1081
+ "loss": 4.1592,
1082
  "step": 89500
1083
  },
1084
  {
1085
  "epoch": 1.6,
1086
  "learning_rate": 2.3254404108123532e-05,
1087
+ "loss": 4.1356,
1088
  "step": 90000
1089
  },
1090
  {
1091
  "epoch": 1.61,
1092
  "learning_rate": 2.3105817464279773e-05,
1093
+ "loss": 4.1924,
1094
  "step": 90500
1095
  },
1096
  {
1097
  "epoch": 1.62,
1098
  "learning_rate": 2.2957230820436014e-05,
1099
+ "loss": 4.1122,
1100
  "step": 91000
1101
  },
1102
  {
1103
  "epoch": 1.63,
1104
  "learning_rate": 2.2808644176592255e-05,
1105
+ "loss": 4.1681,
1106
  "step": 91500
1107
  },
1108
  {
1109
  "epoch": 1.64,
1110
  "learning_rate": 2.2660057532748497e-05,
1111
+ "loss": 4.1771,
1112
  "step": 92000
1113
  },
1114
  {
1115
  "epoch": 1.65,
1116
  "learning_rate": 2.251147088890474e-05,
1117
+ "loss": 4.1596,
1118
  "step": 92500
1119
  },
1120
  {
1121
  "epoch": 1.66,
1122
  "learning_rate": 2.236288424506098e-05,
1123
+ "loss": 4.1591,
1124
  "step": 93000
1125
  },
1126
  {
1127
  "epoch": 1.67,
1128
  "learning_rate": 2.2214297601217223e-05,
1129
+ "loss": 4.1259,
1130
  "step": 93500
1131
  },
1132
  {
1133
  "epoch": 1.68,
1134
  "learning_rate": 2.2065710957373465e-05,
1135
+ "loss": 4.1673,
1136
  "step": 94000
1137
  },
1138
  {
1139
  "epoch": 1.68,
1140
  "learning_rate": 2.191712431352971e-05,
1141
+ "loss": 4.1816,
1142
  "step": 94500
1143
  },
1144
  {
1145
  "epoch": 1.69,
1146
  "learning_rate": 2.1768537669685947e-05,
1147
+ "loss": 4.1738,
1148
  "step": 95000
1149
  },
1150
  {
1151
  "epoch": 1.7,
1152
  "learning_rate": 2.1619951025842188e-05,
1153
+ "loss": 4.1417,
1154
  "step": 95500
1155
  },
1156
  {
1157
  "epoch": 1.71,
1158
  "learning_rate": 2.1471364381998433e-05,
1159
+ "loss": 4.141,
1160
  "step": 96000
1161
  },
1162
  {
1163
  "epoch": 1.72,
1164
  "learning_rate": 2.1322777738154674e-05,
1165
+ "loss": 4.1754,
1166
  "step": 96500
1167
  },
1168
  {
1169
  "epoch": 1.73,
1170
  "learning_rate": 2.1174191094310915e-05,
1171
+ "loss": 4.1311,
1172
  "step": 97000
1173
  },
1174
  {
1175
  "epoch": 1.74,
1176
  "learning_rate": 2.1025604450467156e-05,
1177
+ "loss": 4.2245,
1178
  "step": 97500
1179
  },
1180
  {
1181
  "epoch": 1.75,
1182
  "learning_rate": 2.08770178066234e-05,
1183
+ "loss": 4.2167,
1184
  "step": 98000
1185
  },
1186
  {
1187
  "epoch": 1.76,
1188
  "learning_rate": 2.0728431162779642e-05,
1189
+ "loss": 4.1694,
1190
  "step": 98500
1191
  },
1192
  {
1193
  "epoch": 1.77,
1194
  "learning_rate": 2.0579844518935883e-05,
1195
+ "loss": 4.1671,
1196
  "step": 99000
1197
  },
1198
  {
1199
  "epoch": 1.77,
1200
  "learning_rate": 2.0431257875092124e-05,
1201
+ "loss": 4.1839,
1202
  "step": 99500
1203
  },
1204
  {
1205
  "epoch": 1.78,
1206
  "learning_rate": 2.0282671231248365e-05,
1207
+ "loss": 4.1632,
1208
  "step": 100000
1209
  },
1210
  {
1211
  "epoch": 1.79,
1212
  "learning_rate": 2.013408458740461e-05,
1213
+ "loss": 4.2308,
1214
  "step": 100500
1215
  },
1216
  {
1217
  "epoch": 1.8,
1218
  "learning_rate": 1.9985497943560848e-05,
1219
+ "loss": 4.1291,
1220
  "step": 101000
1221
  },
1222
  {
1223
  "epoch": 1.81,
1224
  "learning_rate": 1.9836911299717092e-05,
1225
+ "loss": 4.1498,
1226
  "step": 101500
1227
  },
1228
  {
1229
  "epoch": 1.82,
1230
  "learning_rate": 1.9688324655873334e-05,
1231
+ "loss": 4.1277,
1232
  "step": 102000
1233
  },
1234
  {
1235
  "epoch": 1.83,
1236
  "learning_rate": 1.9539738012029578e-05,
1237
+ "loss": 4.1425,
1238
  "step": 102500
1239
  },
1240
  {
1241
  "epoch": 1.84,
1242
  "learning_rate": 1.9391151368185816e-05,
1243
+ "loss": 4.1518,
1244
  "step": 103000
1245
  },
1246
  {
1247
  "epoch": 1.85,
1248
  "learning_rate": 1.924256472434206e-05,
1249
+ "loss": 4.1907,
1250
  "step": 103500
1251
  },
1252
  {
1253
  "epoch": 1.85,
1254
  "learning_rate": 1.90939780804983e-05,
1255
+ "loss": 4.1234,
1256
  "step": 104000
1257
  },
1258
  {
1259
  "epoch": 1.86,
1260
  "learning_rate": 1.8945391436654543e-05,
1261
+ "loss": 4.1702,
1262
  "step": 104500
1263
  },
1264
  {
1265
  "epoch": 1.87,
1266
  "learning_rate": 1.8796804792810784e-05,
1267
+ "loss": 4.1586,
1268
  "step": 105000
1269
  },
1270
  {
1271
  "epoch": 1.88,
1272
  "learning_rate": 1.8648218148967025e-05,
1273
+ "loss": 4.1564,
1274
  "step": 105500
1275
  },
1276
  {
1277
  "epoch": 1.89,
1278
  "learning_rate": 1.849963150512327e-05,
1279
+ "loss": 4.1591,
1280
  "step": 106000
1281
  },
1282
  {
1283
  "epoch": 1.9,
1284
  "learning_rate": 1.835104486127951e-05,
1285
+ "loss": 4.1811,
1286
  "step": 106500
1287
  },
1288
  {
1289
  "epoch": 1.91,
1290
  "learning_rate": 1.8202458217435752e-05,
1291
+ "loss": 4.1795,
1292
  "step": 107000
1293
  },
1294
  {
1295
  "epoch": 1.92,
1296
  "learning_rate": 1.8053871573591993e-05,
1297
+ "loss": 4.1605,
1298
  "step": 107500
1299
  },
1300
  {
1301
  "epoch": 1.93,
1302
  "learning_rate": 1.7905284929748238e-05,
1303
+ "loss": 4.1761,
1304
  "step": 108000
1305
  },
1306
  {
1307
  "epoch": 1.93,
1308
  "learning_rate": 1.775669828590448e-05,
1309
+ "loss": 4.186,
1310
  "step": 108500
1311
  },
1312
  {
1313
  "epoch": 1.94,
1314
  "learning_rate": 1.760811164206072e-05,
1315
+ "loss": 4.2381,
1316
  "step": 109000
1317
  },
1318
  {
1319
  "epoch": 1.95,
1320
  "learning_rate": 1.745952499821696e-05,
1321
+ "loss": 4.1839,
1322
  "step": 109500
1323
  },
1324
  {
1325
  "epoch": 1.96,
1326
  "learning_rate": 1.7310938354373202e-05,
1327
+ "loss": 4.2121,
1328
  "step": 110000
1329
  },
1330
  {
1331
  "epoch": 1.97,
1332
  "learning_rate": 1.7162351710529444e-05,
1333
+ "loss": 4.1832,
1334
  "step": 110500
1335
  },
1336
  {
1337
  "epoch": 1.98,
1338
  "learning_rate": 1.7013765066685685e-05,
1339
+ "loss": 4.1467,
1340
  "step": 111000
1341
  },
1342
  {
1343
  "epoch": 1.99,
1344
  "learning_rate": 1.686517842284193e-05,
1345
+ "loss": 4.1932,
1346
  "step": 111500
1347
  },
1348
  {
1349
  "epoch": 2.0,
1350
  "learning_rate": 1.671659177899817e-05,
1351
+ "loss": 4.1277,
1352
  "step": 112000
1353
  },
1354
  {
1355
  "epoch": 2.01,
1356
  "learning_rate": 1.656800513515441e-05,
1357
+ "loss": 4.0758,
1358
  "step": 112500
1359
  },
1360
  {
1361
  "epoch": 2.01,
1362
  "learning_rate": 1.6419418491310653e-05,
1363
+ "loss": 4.1676,
1364
  "step": 113000
1365
  },
1366
  {
1367
  "epoch": 2.02,
1368
  "learning_rate": 1.6270831847466897e-05,
1369
+ "loss": 4.2034,
1370
  "step": 113500
1371
  },
1372
  {
1373
  "epoch": 2.03,
1374
  "learning_rate": 1.612224520362314e-05,
1375
+ "loss": 4.1162,
1376
  "step": 114000
1377
  },
1378
  {
1379
  "epoch": 2.04,
1380
  "learning_rate": 1.597365855977938e-05,
1381
+ "loss": 4.1963,
1382
  "step": 114500
1383
  },
1384
  {
1385
  "epoch": 2.05,
1386
  "learning_rate": 1.582507191593562e-05,
1387
+ "loss": 4.1657,
1388
  "step": 115000
1389
  },
1390
  {
1391
  "epoch": 2.06,
1392
  "learning_rate": 1.5676485272091862e-05,
1393
+ "loss": 4.1668,
1394
  "step": 115500
1395
  },
1396
  {
1397
  "epoch": 2.07,
1398
  "learning_rate": 1.5527898628248107e-05,
1399
+ "loss": 4.153,
1400
  "step": 116000
1401
  },
1402
  {
1403
  "epoch": 2.08,
1404
  "learning_rate": 1.5379311984404344e-05,
1405
+ "loss": 4.1883,
1406
  "step": 116500
1407
  },
1408
  {
1409
  "epoch": 2.09,
1410
  "learning_rate": 1.5230725340560589e-05,
1411
+ "loss": 4.1552,
1412
  "step": 117000
1413
  },
1414
  {
1415
  "epoch": 2.1,
1416
  "learning_rate": 1.508213869671683e-05,
1417
+ "loss": 4.1509,
1418
  "step": 117500
1419
  },
1420
  {
1421
  "epoch": 2.1,
1422
  "learning_rate": 1.4933552052873073e-05,
1423
+ "loss": 4.1463,
1424
  "step": 118000
1425
  },
1426
  {
1427
  "epoch": 2.11,
1428
  "learning_rate": 1.4784965409029314e-05,
1429
+ "loss": 4.1101,
1430
  "step": 118500
1431
  },
1432
  {
1433
  "epoch": 2.12,
1434
  "learning_rate": 1.4636378765185554e-05,
1435
+ "loss": 4.085,
1436
  "step": 119000
1437
  },
1438
  {
1439
  "epoch": 2.13,
1440
  "learning_rate": 1.4487792121341798e-05,
1441
+ "loss": 4.1117,
1442
  "step": 119500
1443
  },
1444
  {
1445
  "epoch": 2.14,
1446
  "learning_rate": 1.4339205477498038e-05,
1447
+ "loss": 4.1806,
1448
  "step": 120000
1449
  },
1450
  {
1451
  "epoch": 2.15,
1452
  "learning_rate": 1.4190618833654282e-05,
1453
+ "loss": 4.1498,
1454
  "step": 120500
1455
  },
1456
  {
1457
  "epoch": 2.16,
1458
  "learning_rate": 1.4042032189810522e-05,
1459
+ "loss": 4.1366,
1460
  "step": 121000
1461
  },
1462
  {
1463
  "epoch": 2.17,
1464
  "learning_rate": 1.3893445545966766e-05,
1465
+ "loss": 4.1916,
1466
  "step": 121500
1467
  },
1468
  {
1469
  "epoch": 2.18,
1470
  "learning_rate": 1.3744858902123006e-05,
1471
+ "loss": 4.1232,
1472
  "step": 122000
1473
  },
1474
  {
1475
  "epoch": 2.18,
1476
  "learning_rate": 1.359627225827925e-05,
1477
+ "loss": 4.1785,
1478
  "step": 122500
1479
  },
1480
  {
1481
  "epoch": 2.19,
1482
  "learning_rate": 1.344768561443549e-05,
1483
+ "loss": 4.1444,
1484
  "step": 123000
1485
  },
1486
  {
1487
  "epoch": 2.2,
1488
  "learning_rate": 1.3299098970591731e-05,
1489
+ "loss": 4.1848,
1490
  "step": 123500
1491
  },
1492
  {
1493
  "epoch": 2.21,
1494
  "learning_rate": 1.3150512326747974e-05,
1495
+ "loss": 4.1975,
1496
  "step": 124000
1497
  },
1498
  {
1499
  "epoch": 2.22,
1500
  "learning_rate": 1.3001925682904215e-05,
1501
+ "loss": 4.1828,
1502
  "step": 124500
1503
  },
1504
  {
1505
  "epoch": 2.23,
1506
  "learning_rate": 1.2853339039060458e-05,
1507
+ "loss": 4.1857,
1508
  "step": 125000
1509
  },
1510
  {
1511
  "epoch": 2.24,
1512
  "learning_rate": 1.2704752395216699e-05,
1513
+ "loss": 4.1146,
1514
  "step": 125500
1515
  },
1516
  {
1517
  "epoch": 2.25,
1518
  "learning_rate": 1.2556165751372942e-05,
1519
+ "loss": 4.1367,
1520
  "step": 126000
1521
  },
1522
  {
1523
  "epoch": 2.26,
1524
  "learning_rate": 1.2407579107529183e-05,
1525
+ "loss": 4.1608,
1526
  "step": 126500
1527
  },
1528
  {
1529
  "epoch": 2.26,
1530
  "learning_rate": 1.2258992463685424e-05,
1531
+ "loss": 4.1655,
1532
  "step": 127000
1533
  },
1534
  {
1535
  "epoch": 2.27,
1536
  "learning_rate": 1.2110405819841667e-05,
1537
+ "loss": 4.158,
1538
  "step": 127500
1539
  },
1540
  {
1541
  "epoch": 2.28,
1542
  "learning_rate": 1.1961819175997908e-05,
1543
+ "loss": 4.1302,
1544
  "step": 128000
1545
  },
1546
  {
1547
  "epoch": 2.29,
1548
  "learning_rate": 1.1813232532154151e-05,
1549
+ "loss": 4.1538,
1550
  "step": 128500
1551
  },
1552
  {
1553
  "epoch": 2.3,
1554
  "learning_rate": 1.1664645888310392e-05,
1555
+ "loss": 4.1838,
1556
  "step": 129000
1557
  },
1558
  {
1559
  "epoch": 2.31,
1560
  "learning_rate": 1.1516059244466634e-05,
1561
+ "loss": 4.178,
1562
  "step": 129500
1563
  },
1564
  {
1565
  "epoch": 2.32,
1566
  "learning_rate": 1.1367472600622876e-05,
1567
+ "loss": 4.1766,
1568
  "step": 130000
1569
  },
1570
  {
1571
  "epoch": 2.33,
1572
  "learning_rate": 1.1218885956779118e-05,
1573
+ "loss": 4.1533,
1574
  "step": 130500
1575
  },
1576
  {
1577
  "epoch": 2.34,
1578
  "learning_rate": 1.1070299312935359e-05,
1579
+ "loss": 4.1715,
1580
  "step": 131000
1581
  },
1582
  {
1583
  "epoch": 2.34,
1584
  "learning_rate": 1.0921712669091602e-05,
1585
+ "loss": 4.1212,
1586
  "step": 131500
1587
  },
1588
  {
1589
  "epoch": 2.35,
1590
  "learning_rate": 1.0773126025247843e-05,
1591
+ "loss": 4.1308,
1592
  "step": 132000
1593
  },
1594
  {
1595
  "epoch": 2.36,
1596
  "learning_rate": 1.0624539381404084e-05,
1597
+ "loss": 4.1403,
1598
  "step": 132500
1599
  },
1600
  {
1601
  "epoch": 2.37,
1602
  "learning_rate": 1.0475952737560327e-05,
1603
+ "loss": 4.1551,
1604
  "step": 133000
1605
  },
1606
  {
1607
  "epoch": 2.38,
1608
  "learning_rate": 1.0327366093716568e-05,
1609
+ "loss": 4.1915,
1610
  "step": 133500
1611
  },
1612
  {
1613
  "epoch": 2.39,
1614
  "learning_rate": 1.0178779449872811e-05,
1615
+ "loss": 4.1381,
1616
  "step": 134000
1617
  },
1618
  {
1619
  "epoch": 2.4,
1620
  "learning_rate": 1.0030192806029052e-05,
1621
+ "loss": 4.1527,
1622
  "step": 134500
1623
  },
1624
  {
1625
  "epoch": 2.41,
1626
  "learning_rate": 9.881606162185295e-06,
1627
+ "loss": 4.1663,
1628
  "step": 135000
1629
  },
1630
  {
1631
  "epoch": 2.42,
1632
  "learning_rate": 9.733019518341534e-06,
1633
+ "loss": 4.2214,
1634
  "step": 135500
1635
  },
1636
  {
1637
  "epoch": 2.42,
1638
  "learning_rate": 9.584432874497777e-06,
1639
+ "loss": 4.1587,
1640
  "step": 136000
1641
  },
1642
  {
1643
  "epoch": 2.43,
1644
  "learning_rate": 9.435846230654018e-06,
1645
+ "loss": 4.1345,
1646
  "step": 136500
1647
  },
1648
  {
1649
  "epoch": 2.44,
1650
  "learning_rate": 9.287259586810261e-06,
1651
+ "loss": 4.189,
1652
  "step": 137000
1653
  },
1654
  {
1655
  "epoch": 2.45,
1656
  "learning_rate": 9.138672942966502e-06,
1657
+ "loss": 4.1978,
1658
  "step": 137500
1659
  },
1660
  {
1661
  "epoch": 2.46,
1662
  "learning_rate": 8.990086299122745e-06,
1663
+ "loss": 4.1403,
1664
  "step": 138000
1665
  },
1666
  {
1667
  "epoch": 2.47,
1668
  "learning_rate": 8.841499655278986e-06,
1669
+ "loss": 4.1012,
1670
  "step": 138500
1671
  },
1672
  {
1673
  "epoch": 2.48,
1674
  "learning_rate": 8.69291301143523e-06,
1675
+ "loss": 4.1605,
1676
  "step": 139000
1677
  },
1678
  {
1679
  "epoch": 2.49,
1680
  "learning_rate": 8.54432636759147e-06,
1681
+ "loss": 4.1597,
1682
  "step": 139500
1683
  },
1684
  {
1685
  "epoch": 2.5,
1686
  "learning_rate": 8.395739723747713e-06,
1687
+ "loss": 4.1998,
1688
  "step": 140000
1689
  },
1690
  {
1691
  "epoch": 2.51,
1692
  "learning_rate": 8.247153079903953e-06,
1693
+ "loss": 4.1804,
1694
  "step": 140500
1695
  },
1696
  {
1697
  "epoch": 2.51,
1698
  "learning_rate": 8.098566436060196e-06,
1699
+ "loss": 4.1377,
1700
  "step": 141000
1701
  },
1702
  {
1703
  "epoch": 2.52,
1704
  "learning_rate": 7.949979792216437e-06,
1705
+ "loss": 4.1452,
1706
  "step": 141500
1707
  },
1708
  {
1709
  "epoch": 2.53,
1710
  "learning_rate": 7.80139314837268e-06,
1711
+ "loss": 4.1304,
1712
  "step": 142000
1713
  },
1714
  {
1715
  "epoch": 2.54,
1716
  "learning_rate": 7.652806504528921e-06,
1717
+ "loss": 4.15,
1718
  "step": 142500
1719
  },
1720
  {
1721
  "epoch": 2.55,
1722
  "learning_rate": 7.504219860685163e-06,
1723
+ "loss": 4.1462,
1724
  "step": 143000
1725
  },
1726
  {
1727
  "epoch": 2.56,
1728
  "learning_rate": 7.355633216841405e-06,
1729
+ "loss": 4.1741,
1730
  "step": 143500
1731
  },
1732
  {
1733
  "epoch": 2.57,
1734
  "learning_rate": 7.207046572997647e-06,
1735
+ "loss": 4.187,
1736
  "step": 144000
1737
  },
1738
  {
1739
  "epoch": 2.58,
1740
  "learning_rate": 7.058459929153889e-06,
1741
+ "loss": 4.183,
1742
  "step": 144500
1743
  },
1744
  {
1745
  "epoch": 2.59,
1746
  "learning_rate": 6.909873285310129e-06,
1747
+ "loss": 4.0993,
1748
  "step": 145000
1749
  },
1750
  {
1751
  "epoch": 2.59,
1752
  "learning_rate": 6.761286641466371e-06,
1753
+ "loss": 4.2255,
1754
  "step": 145500
1755
  },
1756
  {
1757
  "epoch": 2.6,
1758
  "learning_rate": 6.612699997622613e-06,
1759
+ "loss": 4.1642,
1760
  "step": 146000
1761
  },
1762
  {
1763
  "epoch": 2.61,
1764
  "learning_rate": 6.464113353778855e-06,
1765
+ "loss": 4.1559,
1766
  "step": 146500
1767
  },
1768
  {
1769
  "epoch": 2.62,
1770
  "learning_rate": 6.315526709935097e-06,
1771
+ "loss": 4.1859,
1772
  "step": 147000
1773
  },
1774
  {
1775
  "epoch": 2.63,
1776
  "learning_rate": 6.1669400660913394e-06,
1777
+ "loss": 4.1573,
1778
  "step": 147500
1779
  },
1780
  {
1781
  "epoch": 2.64,
1782
  "learning_rate": 6.0183534222475815e-06,
1783
+ "loss": 4.1746,
1784
  "step": 148000
1785
  },
1786
  {
1787
  "epoch": 2.65,
1788
  "learning_rate": 5.869766778403823e-06,
1789
+ "loss": 4.2273,
1790
  "step": 148500
1791
  },
1792
  {
1793
  "epoch": 2.66,
1794
  "learning_rate": 5.721180134560065e-06,
1795
+ "loss": 4.1844,
1796
  "step": 149000
1797
  },
1798
  {
1799
  "epoch": 2.67,
1800
  "learning_rate": 5.572593490716307e-06,
1801
+ "loss": 4.1636,
1802
  "step": 149500
1803
  },
1804
  {
1805
  "epoch": 2.67,
1806
  "learning_rate": 5.424006846872549e-06,
1807
+ "loss": 4.1785,
1808
  "step": 150000
1809
  },
1810
  {
1811
  "epoch": 2.68,
1812
  "learning_rate": 5.275420203028791e-06,
1813
+ "loss": 4.2064,
1814
  "step": 150500
1815
  },
1816
  {
1817
  "epoch": 2.69,
1818
  "learning_rate": 5.126833559185032e-06,
1819
+ "loss": 4.1103,
1820
  "step": 151000
1821
  },
1822
  {
1823
  "epoch": 2.7,
1824
  "learning_rate": 4.978246915341274e-06,
1825
+ "loss": 4.1814,
1826
  "step": 151500
1827
  },
1828
  {
1829
  "epoch": 2.71,
1830
  "learning_rate": 4.829660271497516e-06,
1831
+ "loss": 4.1903,
1832
  "step": 152000
1833
  },
1834
  {
1835
  "epoch": 2.72,
1836
  "learning_rate": 4.681073627653758e-06,
1837
+ "loss": 4.1319,
1838
  "step": 152500
1839
  },
1840
  {
1841
  "epoch": 2.73,
1842
  "learning_rate": 4.53248698381e-06,
1843
+ "loss": 4.127,
1844
  "step": 153000
1845
  },
1846
  {
1847
  "epoch": 2.74,
1848
  "learning_rate": 4.383900339966241e-06,
1849
+ "loss": 4.1278,
1850
  "step": 153500
1851
  },
1852
  {
1853
  "epoch": 2.75,
1854
  "learning_rate": 4.235313696122483e-06,
1855
+ "loss": 4.1209,
1856
  "step": 154000
1857
  },
1858
  {
1859
  "epoch": 2.75,
1860
  "learning_rate": 4.086727052278725e-06,
1861
+ "loss": 4.145,
1862
  "step": 154500
1863
  },
1864
  {
1865
  "epoch": 2.76,
1866
  "learning_rate": 3.938140408434967e-06,
1867
+ "loss": 4.1624,
1868
  "step": 155000
1869
  },
1870
  {
1871
  "epoch": 2.77,
1872
  "learning_rate": 3.7895537645912088e-06,
1873
+ "loss": 4.1436,
1874
  "step": 155500
1875
  },
1876
  {
1877
  "epoch": 2.78,
1878
  "learning_rate": 3.6409671207474504e-06,
1879
+ "loss": 4.1485,
1880
  "step": 156000
1881
  },
1882
  {
1883
  "epoch": 2.79,
1884
  "learning_rate": 3.4923804769036924e-06,
1885
+ "loss": 4.1695,
1886
  "step": 156500
1887
  },
1888
  {
1889
  "epoch": 2.8,
1890
  "learning_rate": 3.343793833059934e-06,
1891
+ "loss": 4.0884,
1892
  "step": 157000
1893
  },
1894
  {
1895
  "epoch": 2.81,
1896
  "learning_rate": 3.195207189216176e-06,
1897
+ "loss": 4.1719,
1898
  "step": 157500
1899
  },
1900
  {
1901
  "epoch": 2.82,
1902
  "learning_rate": 3.0466205453724176e-06,
1903
+ "loss": 4.1232,
1904
  "step": 158000
1905
  },
1906
  {
1907
  "epoch": 2.83,
1908
  "learning_rate": 2.898033901528659e-06,
1909
+ "loss": 4.1327,
1910
  "step": 158500
1911
  },
1912
  {
1913
  "epoch": 2.84,
1914
  "learning_rate": 2.749447257684901e-06,
1915
+ "loss": 4.1568,
1916
  "step": 159000
1917
  },
1918
  {
1919
  "epoch": 2.84,
1920
  "learning_rate": 2.6008606138411432e-06,
1921
+ "loss": 4.0888,
1922
  "step": 159500
1923
  },
1924
  {
1925
  "epoch": 2.85,
1926
  "learning_rate": 2.452273969997385e-06,
1927
+ "loss": 4.1711,
1928
  "step": 160000
1929
  },
1930
  {
1931
  "epoch": 2.86,
1932
  "learning_rate": 2.303687326153627e-06,
1933
+ "loss": 4.2205,
1934
  "step": 160500
1935
  },
1936
  {
1937
  "epoch": 2.87,
1938
  "learning_rate": 2.1551006823098684e-06,
1939
+ "loss": 4.1221,
1940
  "step": 161000
1941
  },
1942
  {
1943
  "epoch": 2.88,
1944
  "learning_rate": 2.0065140384661104e-06,
1945
+ "loss": 4.1887,
1946
  "step": 161500
1947
  },
1948
  {
1949
  "epoch": 2.89,
1950
  "learning_rate": 1.8579273946223525e-06,
1951
+ "loss": 4.138,
1952
  "step": 162000
1953
  },
1954
  {
1955
  "epoch": 2.9,
1956
  "learning_rate": 1.709340750778594e-06,
1957
+ "loss": 4.166,
1958
  "step": 162500
1959
  },
1960
  {
1961
  "epoch": 2.91,
1962
  "learning_rate": 1.5607541069348359e-06,
1963
+ "loss": 4.1599,
1964
  "step": 163000
1965
  },
1966
  {
1967
  "epoch": 2.92,
1968
  "learning_rate": 1.4121674630910779e-06,
1969
+ "loss": 4.1308,
1970
  "step": 163500
1971
  },
1972
  {
1973
  "epoch": 2.92,
1974
  "learning_rate": 1.2635808192473197e-06,
1975
+ "loss": 4.2161,
1976
  "step": 164000
1977
  },
1978
  {
1979
  "epoch": 2.93,
1980
  "learning_rate": 1.1149941754035615e-06,
1981
+ "loss": 4.1883,
1982
  "step": 164500
1983
  },
1984
  {
1985
  "epoch": 2.94,
1986
  "learning_rate": 9.66407531559803e-07,
1987
+ "loss": 4.17,
1988
  "step": 165000
1989
  },
1990
  {
1991
  "epoch": 2.95,
1992
  "learning_rate": 8.17820887716045e-07,
1993
+ "loss": 4.1741,
1994
  "step": 165500
1995
  },
1996
  {
1997
  "epoch": 2.96,
1998
  "learning_rate": 6.692342438722869e-07,
1999
+ "loss": 4.1797,
2000
  "step": 166000
2001
  },
2002
  {
2003
  "epoch": 2.97,
2004
  "learning_rate": 5.206476000285286e-07,
2005
+ "loss": 4.1768,
2006
  "step": 166500
2007
  },
2008
  {
2009
  "epoch": 2.98,
2010
  "learning_rate": 3.720609561847705e-07,
2011
+ "loss": 4.1909,
2012
  "step": 167000
2013
  },
2014
  {
2015
  "epoch": 2.99,
2016
  "learning_rate": 2.234743123410123e-07,
2017
+ "loss": 4.1689,
2018
  "step": 167500
2019
  },
2020
  {
2021
  "epoch": 3.0,
2022
  "learning_rate": 7.488766849725412e-08,
2023
+ "loss": 4.1115,
2024
  "step": 168000
2025
  },
2026
  {
2027
  "epoch": 3.0,
2028
  "step": 168252,
2029
+ "total_flos": 2.098762564848768e+16,
2030
+ "train_loss": 4.168116396368988,
2031
+ "train_runtime": 19036.7524,
2032
+ "train_samples_per_second": 17.677,
2033
+ "train_steps_per_second": 8.838
2034
  }
2035
  ],
2036
  "max_steps": 168252,
2037
  "num_train_epochs": 3,
2038
+ "total_flos": 2.098762564848768e+16,
2039
  "trial_name": null,
2040
  "trial_params": null
2041
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06d92f3244137cb861270518d3b54b24fbe585551b539c504962e5046efe4dda
3
  size 3567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20a50fe822855aeb5b17d149f2c34cd69ab6301adcc00442b5e2675797edd1d6
3
  size 3567
vocab.json CHANGED
The diff for this file is too large to render. See raw diff