hllj commited on
Commit
449e067
·
1 Parent(s): d0fbdd4

Model save

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  This model is a fine-tuned version of [hllj/zephyr-7b-beta-vi-math](https://huggingface.co/hllj/zephyr-7b-beta-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 0.4905
18
 
19
  ## Model description
20
 
@@ -33,7 +33,7 @@ More information needed
33
  ### Training hyperparameters
34
 
35
  The following hyperparameters were used during training:
36
- - learning_rate: 0.0003
37
  - train_batch_size: 4
38
  - eval_batch_size: 4
39
  - seed: 42
@@ -48,26 +48,26 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
- | 0.42 | 0.19 | 50 | 0.4167 |
52
- | 0.3769 | 0.37 | 100 | 0.3928 |
53
- | 0.3502 | 0.56 | 150 | 0.3917 |
54
- | 0.3151 | 0.74 | 200 | 0.3844 |
55
- | 0.2859 | 0.93 | 250 | 0.3882 |
56
- | 0.2749 | 1.12 | 300 | 0.3927 |
57
- | 0.2447 | 1.3 | 350 | 0.4060 |
58
- | 0.2176 | 1.49 | 400 | 0.4102 |
59
- | 0.2095 | 1.67 | 450 | 0.4099 |
60
- | 0.1732 | 1.86 | 500 | 0.4182 |
61
- | 0.1545 | 2.04 | 550 | 0.4349 |
62
- | 0.1546 | 2.23 | 600 | 0.4248 |
63
- | 0.122 | 2.42 | 650 | 0.4543 |
64
- | 0.1157 | 2.6 | 700 | 0.4587 |
65
- | 0.1055 | 2.79 | 750 | 0.4623 |
66
- | 0.0958 | 2.97 | 800 | 0.4744 |
67
- | 0.09 | 3.16 | 850 | 0.4796 |
68
- | 0.0914 | 3.35 | 900 | 0.4880 |
69
- | 0.0893 | 3.53 | 950 | 0.4895 |
70
- | 0.0794 | 3.72 | 1000 | 0.4905 |
71
 
72
 
73
  ### Framework versions
 
14
 
15
  This model is a fine-tuned version of [hllj/zephyr-7b-beta-vi-math](https://huggingface.co/hllj/zephyr-7b-beta-vi-math) on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 0.3935
18
 
19
  ## Model description
20
 
 
33
  ### Training hyperparameters
34
 
35
  The following hyperparameters were used during training:
36
+ - learning_rate: 3e-05
37
  - train_batch_size: 4
38
  - eval_batch_size: 4
39
  - seed: 42
 
48
 
49
  | Training Loss | Epoch | Step | Validation Loss |
50
  |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.6583 | 0.19 | 50 | 0.5998 |
52
+ | 0.4808 | 0.37 | 100 | 0.4464 |
53
+ | 0.4476 | 0.56 | 150 | 0.4201 |
54
+ | 0.4158 | 0.74 | 200 | 0.4091 |
55
+ | 0.4028 | 0.93 | 250 | 0.4018 |
56
+ | 0.4074 | 1.12 | 300 | 0.3965 |
57
+ | 0.388 | 1.3 | 350 | 0.3942 |
58
+ | 0.3699 | 1.49 | 400 | 0.3921 |
59
+ | 0.3699 | 1.67 | 450 | 0.3932 |
60
+ | 0.336 | 1.86 | 500 | 0.3955 |
61
+ | 0.3512 | 2.04 | 550 | 0.3911 |
62
+ | 0.3413 | 2.23 | 600 | 0.3900 |
63
+ | 0.3402 | 2.42 | 650 | 0.3932 |
64
+ | 0.3255 | 2.6 | 700 | 0.3948 |
65
+ | 0.3252 | 2.79 | 750 | 0.3930 |
66
+ | 0.316 | 2.97 | 800 | 0.3946 |
67
+ | 0.305 | 3.16 | 850 | 0.3931 |
68
+ | 0.3248 | 3.35 | 900 | 0.3935 |
69
+ | 0.3363 | 3.53 | 950 | 0.3934 |
70
+ | 0.3032 | 3.72 | 1000 | 0.3935 |
71
 
72
 
73
  ### Framework versions
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 3.72,
3
- "eval_loss": 0.4905177056789398,
4
- "eval_runtime": 10.4024,
5
  "eval_samples": 120,
6
- "eval_samples_per_second": 11.536,
7
- "eval_steps_per_second": 2.884,
8
- "train_loss": 0.21093143409490586,
9
- "train_runtime": 3638.7602,
10
  "train_samples": 1076,
11
- "train_samples_per_second": 1.099,
12
- "train_steps_per_second": 0.275
13
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "eval_loss": 0.3934732675552368,
4
+ "eval_runtime": 10.335,
5
  "eval_samples": 120,
6
+ "eval_samples_per_second": 11.611,
7
+ "eval_steps_per_second": 2.903,
8
+ "train_loss": 0.3842643254995346,
9
+ "train_runtime": 3643.6441,
10
  "train_samples": 1076,
11
+ "train_samples_per_second": 1.098,
12
+ "train_steps_per_second": 0.274
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.72,
3
- "eval_loss": 0.4905177056789398,
4
- "eval_runtime": 10.4024,
5
  "eval_samples": 120,
6
- "eval_samples_per_second": 11.536,
7
- "eval_steps_per_second": 2.884
8
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "eval_loss": 0.3934732675552368,
4
+ "eval_runtime": 10.335,
5
  "eval_samples": 120,
6
+ "eval_samples_per_second": 11.611,
7
+ "eval_steps_per_second": 2.903
8
  }
runs/Nov18_04-50-43_7a59b30c842e/events.out.tfevents.1700286710.7a59b30c842e.54271.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39ba94ef47f9c1b8d610e80ca3e3d08def27c38b656d0e24800e1d3be0e29e05
3
+ size 359
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.72,
3
- "train_loss": 0.21093143409490586,
4
- "train_runtime": 3638.7602,
5
  "train_samples": 1076,
6
- "train_samples_per_second": 1.099,
7
- "train_steps_per_second": 0.275
8
  }
 
1
  {
2
  "epoch": 3.72,
3
+ "train_loss": 0.3842643254995346,
4
+ "train_runtime": 3643.6441,
5
  "train_samples": 1076,
6
+ "train_samples_per_second": 1.098,
7
+ "train_steps_per_second": 0.274
8
  }
trainer_state.json CHANGED
@@ -10,785 +10,785 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "learning_rate": 5.999999999999999e-06,
14
  "loss": 0.9756,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.04,
19
- "learning_rate": 5.9999999999999995e-05,
20
- "loss": 0.8426,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
- "learning_rate": 0.00011999999999999999,
26
- "loss": 0.6295,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.11,
31
- "learning_rate": 0.00017999999999999998,
32
- "loss": 0.4895,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.15,
37
- "learning_rate": 0.00023999999999999998,
38
- "loss": 0.4679,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.19,
43
- "learning_rate": 0.0003,
44
- "loss": 0.42,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.19,
49
- "eval_loss": 0.41668352484703064,
50
- "eval_runtime": 10.3592,
51
- "eval_samples_per_second": 11.584,
52
- "eval_steps_per_second": 2.896,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.22,
57
- "learning_rate": 0.0002999179886011389,
58
- "loss": 0.4233,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.26,
63
- "learning_rate": 0.00029967204408281613,
64
- "loss": 0.3914,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.3,
69
- "learning_rate": 0.0002992624353817517,
70
- "loss": 0.3716,
71
  "step": 80
72
  },
73
  {
74
  "epoch": 0.33,
75
- "learning_rate": 0.00029868961039904624,
76
- "loss": 0.4056,
77
  "step": 90
78
  },
79
  {
80
  "epoch": 0.37,
81
- "learning_rate": 0.00029795419551040833,
82
- "loss": 0.3769,
83
  "step": 100
84
  },
85
  {
86
  "epoch": 0.37,
87
- "eval_loss": 0.39277157187461853,
88
- "eval_runtime": 10.3723,
89
- "eval_samples_per_second": 11.569,
90
- "eval_steps_per_second": 2.892,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.41,
95
- "learning_rate": 0.0002970569948812214,
96
- "loss": 0.3592,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.45,
101
- "learning_rate": 0.0002959989895872009,
102
- "loss": 0.3469,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 0.48,
107
- "learning_rate": 0.0002947813365416023,
108
- "loss": 0.3546,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
- "learning_rate": 0.0002934053672301536,
114
- "loss": 0.3347,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.56,
119
- "learning_rate": 0.00029187258625509513,
120
- "loss": 0.3502,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 0.56,
125
- "eval_loss": 0.39167603850364685,
126
- "eval_runtime": 10.3789,
127
- "eval_samples_per_second": 11.562,
128
- "eval_steps_per_second": 2.89,
129
  "step": 150
130
  },
131
  {
132
  "epoch": 0.59,
133
- "learning_rate": 0.00029036039116586096,
134
- "loss": 0.3241,
135
  "step": 160
136
  },
137
  {
138
  "epoch": 0.63,
139
- "learning_rate": 0.00028853442585949227,
140
- "loss": 0.3123,
141
  "step": 170
142
  },
143
  {
144
  "epoch": 0.67,
145
- "learning_rate": 0.0002865569751923882,
146
- "loss": 0.3092,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.71,
151
- "learning_rate": 0.0002844302014778205,
152
- "loss": 0.3223,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.74,
157
- "learning_rate": 0.0002821564303116212,
158
- "loss": 0.3151,
159
  "step": 200
160
  },
161
  {
162
  "epoch": 0.74,
163
- "eval_loss": 0.3844055235385895,
164
- "eval_runtime": 10.3763,
165
- "eval_samples_per_second": 11.565,
166
- "eval_steps_per_second": 2.891,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
- "learning_rate": 0.00027973814802917727,
172
- "loss": 0.3196,
173
  "step": 210
174
  },
175
  {
176
  "epoch": 0.82,
177
- "learning_rate": 0.00027717799898665976,
178
- "loss": 0.297,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.86,
183
- "learning_rate": 0.0002744787826694589,
184
- "loss": 0.3007,
185
  "step": 230
186
  },
187
  {
188
  "epoch": 0.89,
189
- "learning_rate": 0.000271643450630988,
190
- "loss": 0.3022,
191
  "step": 240
192
  },
193
  {
194
  "epoch": 0.93,
195
- "learning_rate": 0.00026867510326520326,
196
- "loss": 0.2859,
197
  "step": 250
198
  },
199
  {
200
  "epoch": 0.93,
201
- "eval_loss": 0.38817495107650757,
202
- "eval_runtime": 10.3807,
203
- "eval_samples_per_second": 11.56,
204
- "eval_steps_per_second": 2.89,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.97,
209
- "learning_rate": 0.00026557698641636835,
210
- "loss": 0.2823,
211
  "step": 260
212
  },
213
  {
214
  "epoch": 1.0,
215
- "learning_rate": 0.0002623524878297714,
216
- "loss": 0.2749,
217
  "step": 270
218
  },
219
  {
220
  "epoch": 1.04,
221
- "learning_rate": 0.00025900513344727507,
222
- "loss": 0.2709,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
- "learning_rate": 0.0002555385835517515,
228
- "loss": 0.2841,
229
  "step": 290
230
  },
231
  {
232
  "epoch": 1.12,
233
- "learning_rate": 0.00025195662876461596,
234
- "loss": 0.2749,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 1.12,
239
- "eval_loss": 0.392701655626297,
240
- "eval_runtime": 10.3797,
241
- "eval_samples_per_second": 11.561,
242
- "eval_steps_per_second": 2.89,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 1.15,
247
- "learning_rate": 0.0002482631859008384,
248
- "loss": 0.2694,
249
  "step": 310
250
  },
251
  {
252
  "epoch": 1.19,
253
- "learning_rate": 0.00024446229368596387,
254
- "loss": 0.2189,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 1.23,
259
- "learning_rate": 0.0002405581083398251,
260
- "loss": 0.2456,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 1.26,
265
- "learning_rate": 0.0002365548990317775,
266
- "loss": 0.2564,
267
  "step": 340
268
  },
269
  {
270
  "epoch": 1.3,
271
- "learning_rate": 0.00023245704321242492,
272
- "loss": 0.2447,
273
  "step": 350
274
  },
275
  {
276
  "epoch": 1.3,
277
- "eval_loss": 0.40599215030670166,
278
- "eval_runtime": 10.3832,
279
- "eval_samples_per_second": 11.557,
280
  "eval_steps_per_second": 2.889,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
- "learning_rate": 0.00022826902182694156,
286
- "loss": 0.2386,
287
  "step": 360
288
  },
289
  {
290
  "epoch": 1.38,
291
- "learning_rate": 0.00022399541441522474,
292
- "loss": 0.2459,
293
  "step": 370
294
  },
295
  {
296
  "epoch": 1.41,
297
- "learning_rate": 0.00021964089410423456,
298
- "loss": 0.2342,
299
  "step": 380
300
  },
301
  {
302
  "epoch": 1.45,
303
- "learning_rate": 0.0002152102224979987,
304
- "loss": 0.2318,
305
  "step": 390
306
  },
307
  {
308
  "epoch": 1.49,
309
- "learning_rate": 0.00021070824447086807,
310
- "loss": 0.2176,
311
  "step": 400
312
  },
313
  {
314
  "epoch": 1.49,
315
- "eval_loss": 0.41023018956184387,
316
- "eval_runtime": 10.3769,
317
- "eval_samples_per_second": 11.564,
318
- "eval_steps_per_second": 2.891,
319
  "step": 400
320
  },
321
  {
322
  "epoch": 1.52,
323
- "learning_rate": 0.00020613988286971802,
324
- "loss": 0.1921,
325
  "step": 410
326
  },
327
  {
328
  "epoch": 1.56,
329
- "learning_rate": 0.00020151013313088746,
330
- "loss": 0.1997,
331
  "step": 420
332
  },
333
  {
334
  "epoch": 1.6,
335
- "learning_rate": 0.00019682405781774239,
336
- "loss": 0.196,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 1.64,
341
- "learning_rate": 0.00019208678108483746,
342
- "loss": 0.2059,
343
  "step": 440
344
  },
345
  {
346
  "epoch": 1.67,
347
- "learning_rate": 0.00018730348307472824,
348
- "loss": 0.2095,
349
  "step": 450
350
  },
351
  {
352
  "epoch": 1.67,
353
- "eval_loss": 0.4098932445049286,
354
- "eval_runtime": 10.3869,
355
- "eval_samples_per_second": 11.553,
356
- "eval_steps_per_second": 2.888,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 1.71,
361
- "learning_rate": 0.00018247939425356096,
362
- "loss": 0.1969,
363
  "step": 460
364
  },
365
  {
366
  "epoch": 1.75,
367
- "learning_rate": 0.00017761978969163506,
368
- "loss": 0.1975,
369
  "step": 470
370
  },
371
  {
372
  "epoch": 1.78,
373
- "learning_rate": 0.00017272998329519103,
374
- "loss": 0.1971,
375
  "step": 480
376
  },
377
  {
378
  "epoch": 1.82,
379
- "learning_rate": 0.000167815321995732,
380
- "loss": 0.1898,
381
  "step": 490
382
  },
383
  {
384
  "epoch": 1.86,
385
- "learning_rate": 0.00016288117990323256,
386
- "loss": 0.1732,
387
  "step": 500
388
  },
389
  {
390
  "epoch": 1.86,
391
- "eval_loss": 0.4181649684906006,
392
- "eval_runtime": 10.3784,
393
- "eval_samples_per_second": 11.562,
394
- "eval_steps_per_second": 2.891,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.9,
399
- "learning_rate": 0.0001579329524296285,
400
- "loss": 0.1726,
401
  "step": 510
402
  },
403
  {
404
  "epoch": 1.93,
405
- "learning_rate": 0.00015297605038901304,
406
- "loss": 0.1774,
407
  "step": 520
408
  },
409
  {
410
  "epoch": 1.97,
411
- "learning_rate": 0.00014801589408099117,
412
- "loss": 0.1854,
413
  "step": 530
414
  },
415
  {
416
  "epoch": 2.01,
417
- "learning_rate": 0.00014305790736366135,
418
- "loss": 0.168,
419
  "step": 540
420
  },
421
  {
422
  "epoch": 2.04,
423
- "learning_rate": 0.00013810751172270658,
424
- "loss": 0.1545,
425
  "step": 550
426
  },
427
  {
428
  "epoch": 2.04,
429
- "eval_loss": 0.4348754286766052,
430
- "eval_runtime": 10.3825,
431
- "eval_samples_per_second": 11.558,
432
- "eval_steps_per_second": 2.889,
433
  "step": 550
434
  },
435
  {
436
  "epoch": 2.08,
437
- "learning_rate": 0.00013317012034307936,
438
- "loss": 0.1635,
439
  "step": 560
440
  },
441
  {
442
  "epoch": 2.12,
443
- "learning_rate": 0.0001282511321897631,
444
- "loss": 0.1668,
445
  "step": 570
446
  },
447
  {
448
  "epoch": 2.16,
449
- "learning_rate": 0.0001233559261040837,
450
- "loss": 0.156,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 2.19,
455
- "learning_rate": 0.00011848985492202512,
456
- "loss": 0.1471,
457
  "step": 590
458
  },
459
  {
460
  "epoch": 2.23,
461
- "learning_rate": 0.00011365823962098206,
462
- "loss": 0.1546,
463
  "step": 600
464
  },
465
  {
466
  "epoch": 2.23,
467
- "eval_loss": 0.42479971051216125,
468
- "eval_runtime": 10.3847,
469
- "eval_samples_per_second": 11.556,
470
- "eval_steps_per_second": 2.889,
471
  "step": 600
472
  },
473
  {
474
  "epoch": 2.27,
475
- "learning_rate": 0.00010886636350134905,
476
- "loss": 0.151,
477
  "step": 610
478
  },
479
  {
480
  "epoch": 2.3,
481
- "learning_rate": 0.00010411946640930938,
482
- "loss": 0.1459,
483
  "step": 620
484
  },
485
  {
486
  "epoch": 2.34,
487
- "learning_rate": 9.942273900713996e-05,
488
- "loss": 0.1316,
489
  "step": 630
490
  },
491
  {
492
  "epoch": 2.38,
493
- "learning_rate": 9.47813170972983e-05,
494
- "loss": 0.1118,
495
  "step": 640
496
  },
497
  {
498
  "epoch": 2.42,
499
- "learning_rate": 9.020027600649824e-05,
500
- "loss": 0.122,
501
  "step": 650
502
  },
503
  {
504
  "epoch": 2.42,
505
- "eval_loss": 0.4543386399745941,
506
- "eval_runtime": 10.3856,
507
- "eval_samples_per_second": 11.555,
508
  "eval_steps_per_second": 2.889,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 2.45,
513
- "learning_rate": 8.568462503591441e-05,
514
- "loss": 0.1073,
515
  "step": 660
516
  },
517
  {
518
  "epoch": 2.49,
519
- "learning_rate": 8.123930198358497e-05,
520
- "loss": 0.1176,
521
  "step": 670
522
  },
523
  {
524
  "epoch": 2.53,
525
- "learning_rate": 7.686916774500205e-05,
526
- "loss": 0.1219,
527
  "step": 680
528
  },
529
  {
530
  "epoch": 2.57,
531
- "learning_rate": 7.257900099779394e-05,
532
- "loss": 0.1063,
533
  "step": 690
534
  },
535
  {
536
  "epoch": 2.6,
537
- "learning_rate": 6.837349297631113e-05,
538
- "loss": 0.1157,
539
  "step": 700
540
  },
541
  {
542
  "epoch": 2.6,
543
- "eval_loss": 0.45872315764427185,
544
- "eval_runtime": 10.3905,
545
- "eval_samples_per_second": 11.549,
546
  "eval_steps_per_second": 2.887,
547
  "step": 700
548
  },
549
  {
550
  "epoch": 2.64,
551
- "learning_rate": 6.425724234183036e-05,
552
- "loss": 0.1195,
553
  "step": 710
554
  },
555
  {
556
  "epoch": 2.68,
557
- "learning_rate": 6.0234750153986346e-05,
558
- "loss": 0.1018,
559
  "step": 720
560
  },
561
  {
562
  "epoch": 2.71,
563
- "learning_rate": 5.631041494892882e-05,
564
- "loss": 0.1051,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 2.75,
569
- "learning_rate": 5.248852792958801e-05,
570
- "loss": 0.1057,
571
  "step": 740
572
  },
573
  {
574
  "epoch": 2.79,
575
- "learning_rate": 4.877326827330719e-05,
576
- "loss": 0.1055,
577
  "step": 750
578
  },
579
  {
580
  "epoch": 2.79,
581
- "eval_loss": 0.4623105823993683,
582
- "eval_runtime": 10.3899,
583
- "eval_samples_per_second": 11.55,
584
- "eval_steps_per_second": 2.887,
585
  "step": 750
586
  },
587
  {
588
  "epoch": 2.83,
589
- "learning_rate": 4.516869856197362e-05,
590
- "loss": 0.1059,
591
  "step": 760
592
  },
593
  {
594
  "epoch": 2.86,
595
- "learning_rate": 4.1678760339644933e-05,
596
- "loss": 0.0999,
597
  "step": 770
598
  },
599
  {
600
  "epoch": 2.9,
601
- "learning_rate": 3.830726980252837e-05,
602
- "loss": 0.0962,
603
  "step": 780
604
  },
605
  {
606
  "epoch": 2.94,
607
- "learning_rate": 3.505791362602661e-05,
608
- "loss": 0.0932,
609
  "step": 790
610
  },
611
  {
612
  "epoch": 2.97,
613
- "learning_rate": 3.1934244933412124e-05,
614
- "loss": 0.0958,
615
  "step": 800
616
  },
617
  {
618
  "epoch": 2.97,
619
- "eval_loss": 0.474372923374176,
620
- "eval_runtime": 10.3939,
621
- "eval_samples_per_second": 11.545,
622
- "eval_steps_per_second": 2.886,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 3.01,
627
- "learning_rate": 2.893967941053898e-05,
628
- "loss": 0.102,
629
  "step": 810
630
  },
631
  {
632
  "epoch": 3.05,
633
- "learning_rate": 2.607749157084067e-05,
634
- "loss": 0.0836,
635
  "step": 820
636
  },
637
  {
638
  "epoch": 3.09,
639
- "learning_rate": 2.335081117469777e-05,
640
- "loss": 0.0974,
641
  "step": 830
642
  },
643
  {
644
  "epoch": 3.12,
645
- "learning_rate": 2.0762619807090657e-05,
646
- "loss": 0.1015,
647
  "step": 840
648
  },
649
  {
650
  "epoch": 3.16,
651
- "learning_rate": 1.831574761728038e-05,
652
- "loss": 0.09,
653
  "step": 850
654
  },
655
  {
656
  "epoch": 3.16,
657
- "eval_loss": 0.479593425989151,
658
- "eval_runtime": 10.3885,
659
- "eval_samples_per_second": 11.551,
660
- "eval_steps_per_second": 2.888,
661
  "step": 850
662
  },
663
  {
664
  "epoch": 3.2,
665
- "learning_rate": 1.6012870224081877e-05,
666
- "loss": 0.0761,
667
  "step": 860
668
  },
669
  {
670
  "epoch": 3.23,
671
- "learning_rate": 1.3856505790114187e-05,
672
- "loss": 0.0876,
673
  "step": 870
674
  },
675
  {
676
  "epoch": 3.27,
677
- "learning_rate": 1.1849012268226338e-05,
678
- "loss": 0.0902,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 3.31,
683
- "learning_rate": 9.992584823110834e-06,
684
- "loss": 0.0871,
685
  "step": 890
686
  },
687
  {
688
  "epoch": 3.35,
689
- "learning_rate": 8.289253430923126e-06,
690
- "loss": 0.0914,
691
  "step": 900
692
  },
693
  {
694
  "epoch": 3.35,
695
- "eval_loss": 0.4879765808582306,
696
- "eval_runtime": 10.3912,
697
- "eval_samples_per_second": 11.548,
698
- "eval_steps_per_second": 2.887,
699
  "step": 900
700
  },
701
  {
702
  "epoch": 3.38,
703
- "learning_rate": 6.7408806595324715e-06,
704
- "loss": 0.0872,
705
  "step": 910
706
  },
707
  {
708
  "epoch": 3.42,
709
- "learning_rate": 5.349159631831423e-06,
710
- "loss": 0.0937,
711
  "step": 920
712
  },
713
  {
714
  "epoch": 3.46,
715
- "learning_rate": 4.1156121743307405e-06,
716
- "loss": 0.0819,
717
  "step": 930
718
  },
719
  {
720
  "epoch": 3.49,
721
- "learning_rate": 3.0415871530644233e-06,
722
- "loss": 0.0877,
723
  "step": 940
724
  },
725
  {
726
  "epoch": 3.53,
727
- "learning_rate": 2.128258998624549e-06,
728
- "loss": 0.0893,
729
  "step": 950
730
  },
731
  {
732
  "epoch": 3.53,
733
- "eval_loss": 0.4895183742046356,
734
- "eval_runtime": 10.3915,
735
- "eval_samples_per_second": 11.548,
736
- "eval_steps_per_second": 2.887,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 3.57,
741
- "learning_rate": 1.3766264219386759e-06,
742
- "loss": 0.0867,
743
  "step": 960
744
  },
745
  {
746
  "epoch": 3.61,
747
- "learning_rate": 7.875113221940287e-07,
748
- "loss": 0.0913,
749
  "step": 970
750
  },
751
  {
752
  "epoch": 3.64,
753
- "learning_rate": 3.6155788810286467e-07,
754
- "loss": 0.0902,
755
  "step": 980
756
  },
757
  {
758
  "epoch": 3.68,
759
- "learning_rate": 9.923189349162475e-08,
760
- "loss": 0.0801,
761
  "step": 990
762
  },
763
  {
764
  "epoch": 3.72,
765
- "learning_rate": 8.201879839297986e-10,
766
- "loss": 0.0794,
767
  "step": 1000
768
  },
769
  {
770
  "epoch": 3.72,
771
- "eval_loss": 0.49052247405052185,
772
- "eval_runtime": 10.3853,
773
- "eval_samples_per_second": 11.555,
774
- "eval_steps_per_second": 2.889,
775
  "step": 1000
776
  },
777
  {
778
  "epoch": 3.72,
779
  "step": 1000,
780
- "total_flos": 1.7508659664571597e+17,
781
- "train_loss": 0.21093143409490586,
782
- "train_runtime": 3638.7602,
783
- "train_samples_per_second": 1.099,
784
- "train_steps_per_second": 0.275
785
  }
786
  ],
787
  "logging_steps": 10,
788
  "max_steps": 1000,
789
  "num_train_epochs": 4,
790
  "save_steps": 500,
791
- "total_flos": 1.7508659664571597e+17,
792
  "trial_name": null,
793
  "trial_params": null
794
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "learning_rate": 6.000000000000001e-07,
14
  "loss": 0.9756,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.04,
19
+ "learning_rate": 6e-06,
20
+ "loss": 0.892,
21
  "step": 10
22
  },
23
  {
24
  "epoch": 0.07,
25
+ "learning_rate": 1.2e-05,
26
+ "loss": 0.852,
27
  "step": 20
28
  },
29
  {
30
  "epoch": 0.11,
31
+ "learning_rate": 1.8e-05,
32
+ "loss": 0.8052,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.15,
37
+ "learning_rate": 2.4e-05,
38
+ "loss": 0.762,
39
  "step": 40
40
  },
41
  {
42
  "epoch": 0.19,
43
+ "learning_rate": 3e-05,
44
+ "loss": 0.6583,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.19,
49
+ "eval_loss": 0.5997987389564514,
50
+ "eval_runtime": 10.374,
51
+ "eval_samples_per_second": 11.567,
52
+ "eval_steps_per_second": 2.892,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.22,
57
+ "learning_rate": 2.999179886011389e-05,
58
+ "loss": 0.5995,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.26,
63
+ "learning_rate": 2.9967204408281618e-05,
64
+ "loss": 0.5319,
65
  "step": 70
66
  },
67
  {
68
  "epoch": 0.3,
69
+ "learning_rate": 2.9926243538175172e-05,
70
+ "loss": 0.4955,
71
  "step": 80
72
  },
73
  {
74
  "epoch": 0.33,
75
+ "learning_rate": 2.9868961039904628e-05,
76
+ "loss": 0.5063,
77
  "step": 90
78
  },
79
  {
80
  "epoch": 0.37,
81
+ "learning_rate": 2.9795419551040836e-05,
82
+ "loss": 0.4808,
83
  "step": 100
84
  },
85
  {
86
  "epoch": 0.37,
87
+ "eval_loss": 0.44642969965934753,
88
+ "eval_runtime": 10.3848,
89
+ "eval_samples_per_second": 11.555,
90
+ "eval_steps_per_second": 2.889,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 0.41,
95
+ "learning_rate": 2.970569948812214e-05,
96
+ "loss": 0.4638,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.45,
101
+ "learning_rate": 2.9599898958720088e-05,
102
+ "loss": 0.4424,
103
  "step": 120
104
  },
105
  {
106
  "epoch": 0.48,
107
+ "learning_rate": 2.947813365416023e-05,
108
+ "loss": 0.4506,
109
  "step": 130
110
  },
111
  {
112
  "epoch": 0.52,
113
+ "learning_rate": 2.9340536723015367e-05,
114
+ "loss": 0.4449,
115
  "step": 140
116
  },
117
  {
118
  "epoch": 0.56,
119
+ "learning_rate": 2.9187258625509518e-05,
120
+ "loss": 0.4476,
121
  "step": 150
122
  },
123
  {
124
  "epoch": 0.56,
125
+ "eval_loss": 0.4200552701950073,
126
+ "eval_runtime": 10.3829,
127
+ "eval_samples_per_second": 11.557,
128
+ "eval_steps_per_second": 2.889,
129
  "step": 150
130
  },
131
  {
132
  "epoch": 0.59,
133
+ "learning_rate": 2.9036039116586097e-05,
134
+ "loss": 0.4266,
135
  "step": 160
136
  },
137
  {
138
  "epoch": 0.63,
139
+ "learning_rate": 2.885344258594923e-05,
140
+ "loss": 0.4162,
141
  "step": 170
142
  },
143
  {
144
  "epoch": 0.67,
145
+ "learning_rate": 2.865569751923882e-05,
146
+ "loss": 0.4106,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 0.71,
151
+ "learning_rate": 2.8443020147782055e-05,
152
+ "loss": 0.4255,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.74,
157
+ "learning_rate": 2.821564303116212e-05,
158
+ "loss": 0.4158,
159
  "step": 200
160
  },
161
  {
162
  "epoch": 0.74,
163
+ "eval_loss": 0.4091338515281677,
164
+ "eval_runtime": 10.3877,
165
+ "eval_samples_per_second": 11.552,
166
+ "eval_steps_per_second": 2.888,
167
  "step": 200
168
  },
169
  {
170
  "epoch": 0.78,
171
+ "learning_rate": 2.797381480291773e-05,
172
+ "loss": 0.4362,
173
  "step": 210
174
  },
175
  {
176
  "epoch": 0.82,
177
+ "learning_rate": 2.7717799898665977e-05,
178
+ "loss": 0.4048,
179
  "step": 220
180
  },
181
  {
182
  "epoch": 0.86,
183
+ "learning_rate": 2.744787826694589e-05,
184
+ "loss": 0.4074,
185
  "step": 230
186
  },
187
  {
188
  "epoch": 0.89,
189
+ "learning_rate": 2.71643450630988e-05,
190
+ "loss": 0.4273,
191
  "step": 240
192
  },
193
  {
194
  "epoch": 0.93,
195
+ "learning_rate": 2.686751032652033e-05,
196
+ "loss": 0.4028,
197
  "step": 250
198
  },
199
  {
200
  "epoch": 0.93,
201
+ "eval_loss": 0.4017806947231293,
202
+ "eval_runtime": 10.3868,
203
+ "eval_samples_per_second": 11.553,
204
+ "eval_steps_per_second": 2.888,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.97,
209
+ "learning_rate": 2.655769864163684e-05,
210
+ "loss": 0.409,
211
  "step": 260
212
  },
213
  {
214
  "epoch": 1.0,
215
+ "learning_rate": 2.623524878297714e-05,
216
+ "loss": 0.4021,
217
  "step": 270
218
  },
219
  {
220
  "epoch": 1.04,
221
+ "learning_rate": 2.590051334472751e-05,
222
+ "loss": 0.3942,
223
  "step": 280
224
  },
225
  {
226
  "epoch": 1.08,
227
+ "learning_rate": 2.5553858355175156e-05,
228
+ "loss": 0.3821,
229
  "step": 290
230
  },
231
  {
232
  "epoch": 1.12,
233
+ "learning_rate": 2.51956628764616e-05,
234
+ "loss": 0.4074,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 1.12,
239
+ "eval_loss": 0.3964887857437134,
240
+ "eval_runtime": 10.3909,
241
+ "eval_samples_per_second": 11.549,
242
+ "eval_steps_per_second": 2.887,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 1.15,
247
+ "learning_rate": 2.482631859008384e-05,
248
+ "loss": 0.3937,
249
  "step": 310
250
  },
251
  {
252
  "epoch": 1.19,
253
+ "learning_rate": 2.4446229368596388e-05,
254
+ "loss": 0.3503,
255
  "step": 320
256
  },
257
  {
258
  "epoch": 1.23,
259
+ "learning_rate": 2.4055810833982512e-05,
260
+ "loss": 0.3724,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 1.26,
265
+ "learning_rate": 2.365548990317775e-05,
266
+ "loss": 0.3733,
267
  "step": 340
268
  },
269
  {
270
  "epoch": 1.3,
271
+ "learning_rate": 2.3245704321242494e-05,
272
+ "loss": 0.388,
273
  "step": 350
274
  },
275
  {
276
  "epoch": 1.3,
277
+ "eval_loss": 0.3942064344882965,
278
+ "eval_runtime": 10.3843,
279
+ "eval_samples_per_second": 11.556,
280
  "eval_steps_per_second": 2.889,
281
  "step": 350
282
  },
283
  {
284
  "epoch": 1.34,
285
+ "learning_rate": 2.282690218269416e-05,
286
+ "loss": 0.3713,
287
  "step": 360
288
  },
289
  {
290
  "epoch": 1.38,
291
+ "learning_rate": 2.2442649405387632e-05,
292
+ "loss": 0.3792,
293
  "step": 370
294
  },
295
  {
296
  "epoch": 1.41,
297
+ "learning_rate": 2.2007985218000543e-05,
298
+ "loss": 0.3665,
299
  "step": 380
300
  },
301
  {
302
  "epoch": 1.45,
303
+ "learning_rate": 2.1565657901667777e-05,
304
+ "loss": 0.3529,
305
  "step": 390
306
  },
307
  {
308
  "epoch": 1.49,
309
+ "learning_rate": 2.1116151134815555e-05,
310
+ "loss": 0.3699,
311
  "step": 400
312
  },
313
  {
314
  "epoch": 1.49,
315
+ "eval_loss": 0.39205998182296753,
316
+ "eval_runtime": 10.3869,
317
+ "eval_samples_per_second": 11.553,
318
+ "eval_steps_per_second": 2.888,
319
  "step": 400
320
  },
321
  {
322
  "epoch": 1.52,
323
+ "learning_rate": 2.065995644649384e-05,
324
+ "loss": 0.3441,
325
  "step": 410
326
  },
327
  {
328
  "epoch": 1.56,
329
+ "learning_rate": 2.0197572678896522e-05,
330
+ "loss": 0.3399,
331
  "step": 420
332
  },
333
  {
334
  "epoch": 1.6,
335
+ "learning_rate": 1.9729505441884825e-05,
336
+ "loss": 0.3617,
337
  "step": 430
338
  },
339
  {
340
  "epoch": 1.64,
341
+ "learning_rate": 1.9256266560110322e-05,
342
+ "loss": 0.3596,
343
  "step": 440
344
  },
345
  {
346
  "epoch": 1.67,
347
+ "learning_rate": 1.8778373513342223e-05,
348
+ "loss": 0.3699,
349
  "step": 450
350
  },
351
  {
352
  "epoch": 1.67,
353
+ "eval_loss": 0.3931977450847626,
354
+ "eval_runtime": 10.3775,
355
+ "eval_samples_per_second": 11.564,
356
+ "eval_steps_per_second": 2.891,
357
  "step": 450
358
  },
359
  {
360
  "epoch": 1.71,
361
+ "learning_rate": 1.8296348870610798e-05,
362
+ "loss": 0.3654,
363
  "step": 460
364
  },
365
  {
366
  "epoch": 1.75,
367
+ "learning_rate": 1.781071971878587e-05,
368
+ "loss": 0.3588,
369
  "step": 470
370
  },
371
  {
372
  "epoch": 1.78,
373
+ "learning_rate": 1.7322017086215023e-05,
374
+ "loss": 0.352,
375
  "step": 480
376
  },
377
  {
378
  "epoch": 1.82,
379
+ "learning_rate": 1.6830775362051904e-05,
380
+ "loss": 0.3639,
381
  "step": 490
382
  },
383
  {
384
  "epoch": 1.86,
385
+ "learning_rate": 1.633753171190956e-05,
386
+ "loss": 0.336,
387
  "step": 500
388
  },
389
  {
390
  "epoch": 1.86,
391
+ "eval_loss": 0.3954925537109375,
392
+ "eval_runtime": 10.3812,
393
+ "eval_samples_per_second": 11.559,
394
+ "eval_steps_per_second": 2.89,
395
  "step": 500
396
  },
397
  {
398
  "epoch": 1.9,
399
+ "learning_rate": 1.5842825490477683e-05,
400
+ "loss": 0.3421,
401
  "step": 510
402
  },
403
  {
404
  "epoch": 1.93,
405
+ "learning_rate": 1.5347197651746207e-05,
406
+ "loss": 0.3421,
407
  "step": 520
408
  },
409
  {
410
  "epoch": 1.97,
411
+ "learning_rate": 1.4851190157480054e-05,
412
+ "loss": 0.3547,
413
  "step": 530
414
  },
415
  {
416
  "epoch": 2.01,
417
+ "learning_rate": 1.4355345384591894e-05,
418
+ "loss": 0.3355,
419
  "step": 540
420
  },
421
  {
422
  "epoch": 2.04,
423
+ "learning_rate": 1.3860205532060953e-05,
424
+ "loss": 0.3512,
425
  "step": 550
426
  },
427
  {
428
  "epoch": 2.04,
429
+ "eval_loss": 0.3910907804965973,
430
+ "eval_runtime": 10.3795,
431
+ "eval_samples_per_second": 11.561,
432
+ "eval_steps_per_second": 2.89,
433
  "step": 550
434
  },
435
  {
436
  "epoch": 2.08,
437
+ "learning_rate": 1.3366312028046412e-05,
438
+ "loss": 0.3426,
439
  "step": 560
440
  },
441
  {
442
  "epoch": 2.12,
443
+ "learning_rate": 1.2874204937843636e-05,
444
+ "loss": 0.3577,
445
  "step": 570
446
  },
447
  {
448
  "epoch": 2.16,
449
+ "learning_rate": 1.2384422373330728e-05,
450
+ "loss": 0.3308,
451
  "step": 580
452
  },
453
  {
454
  "epoch": 2.19,
455
+ "learning_rate": 1.189749990455105e-05,
456
+ "loss": 0.3464,
457
  "step": 590
458
  },
459
  {
460
  "epoch": 2.23,
461
+ "learning_rate": 1.1413969974075299e-05,
462
+ "loss": 0.3413,
463
  "step": 600
464
  },
465
  {
466
  "epoch": 2.23,
467
+ "eval_loss": 0.39001432061195374,
468
+ "eval_runtime": 10.3775,
469
+ "eval_samples_per_second": 11.563,
470
+ "eval_steps_per_second": 2.891,
471
  "step": 600
472
  },
473
  {
474
  "epoch": 2.27,
475
+ "learning_rate": 1.0934361314783339e-05,
476
+ "loss": 0.3535,
477
  "step": 610
478
  },
479
  {
480
  "epoch": 2.3,
481
+ "learning_rate": 1.0459198371702553e-05,
482
+ "loss": 0.3322,
483
  "step": 620
484
  },
485
  {
486
  "epoch": 2.34,
487
+ "learning_rate": 9.989000728534936e-06,
488
+ "loss": 0.347,
489
  "step": 630
490
  },
491
  {
492
  "epoch": 2.38,
493
+ "learning_rate": 9.524282539499916e-06,
494
+ "loss": 0.3088,
495
  "step": 640
496
  },
497
  {
498
  "epoch": 2.42,
499
+ "learning_rate": 9.06555196711428e-06,
500
+ "loss": 0.3402,
501
  "step": 650
502
  },
503
  {
504
  "epoch": 2.42,
505
+ "eval_loss": 0.39315077662467957,
506
+ "eval_runtime": 10.3839,
507
+ "eval_samples_per_second": 11.556,
508
  "eval_steps_per_second": 2.889,
509
  "step": 650
510
  },
511
  {
512
  "epoch": 2.45,
513
+ "learning_rate": 8.61331062652391e-06,
514
+ "loss": 0.3123,
515
  "step": 660
516
  },
517
  {
518
  "epoch": 2.49,
519
+ "learning_rate": 8.168053036995011e-06,
520
+ "loss": 0.322,
521
  "step": 670
522
  },
523
  {
524
  "epoch": 2.53,
525
+ "learning_rate": 7.73026608116453e-06,
526
+ "loss": 0.3335,
527
  "step": 680
528
  },
529
  {
530
  "epoch": 2.57,
531
+ "learning_rate": 7.3004284726411315e-06,
532
+ "loss": 0.318,
533
  "step": 690
534
  },
535
  {
536
  "epoch": 2.6,
537
+ "learning_rate": 6.87901023253893e-06,
538
+ "loss": 0.3255,
539
  "step": 700
540
  },
541
  {
542
  "epoch": 2.6,
543
+ "eval_loss": 0.3948245942592621,
544
+ "eval_runtime": 10.39,
545
+ "eval_samples_per_second": 11.55,
546
  "eval_steps_per_second": 2.887,
547
  "step": 700
548
  },
549
  {
550
  "epoch": 2.64,
551
+ "learning_rate": 6.466472175516284e-06,
552
+ "loss": 0.3275,
553
  "step": 710
554
  },
555
  {
556
  "epoch": 2.68,
557
+ "learning_rate": 6.06326540588171e-06,
558
+ "loss": 0.3226,
559
  "step": 720
560
  },
561
  {
562
  "epoch": 2.71,
563
+ "learning_rate": 5.669830824317992e-06,
564
+ "loss": 0.3154,
565
  "step": 730
566
  },
567
  {
568
  "epoch": 2.75,
569
+ "learning_rate": 5.286598645763718e-06,
570
+ "loss": 0.3194,
571
  "step": 740
572
  },
573
  {
574
  "epoch": 2.79,
575
+ "learning_rate": 4.91398792897958e-06,
576
+ "loss": 0.3252,
577
  "step": 750
578
  },
579
  {
580
  "epoch": 2.79,
581
+ "eval_loss": 0.39301279187202454,
582
+ "eval_runtime": 10.3855,
583
+ "eval_samples_per_second": 11.555,
584
+ "eval_steps_per_second": 2.889,
585
  "step": 750
586
  },
587
  {
588
  "epoch": 2.83,
589
+ "learning_rate": 4.552406118313767e-06,
590
+ "loss": 0.3198,
591
  "step": 760
592
  },
593
  {
594
  "epoch": 2.86,
595
+ "learning_rate": 4.202248598167549e-06,
596
+ "loss": 0.3136,
597
  "step": 770
598
  },
599
  {
600
  "epoch": 2.9,
601
+ "learning_rate": 3.8638982606482525e-06,
602
+ "loss": 0.3179,
603
  "step": 780
604
  },
605
  {
606
  "epoch": 2.94,
607
+ "learning_rate": 3.537725086882333e-06,
608
+ "loss": 0.3196,
609
  "step": 790
610
  },
611
  {
612
  "epoch": 2.97,
613
+ "learning_rate": 3.224085742446484e-06,
614
+ "loss": 0.316,
615
  "step": 800
616
  },
617
  {
618
  "epoch": 2.97,
619
+ "eval_loss": 0.3946268558502197,
620
+ "eval_runtime": 10.3837,
621
+ "eval_samples_per_second": 11.557,
622
+ "eval_steps_per_second": 2.889,
623
  "step": 800
624
  },
625
  {
626
  "epoch": 3.01,
627
+ "learning_rate": 2.9233231873590445e-06,
628
+ "loss": 0.3046,
629
  "step": 810
630
  },
631
  {
632
  "epoch": 3.05,
633
+ "learning_rate": 2.635766301058241e-06,
634
+ "loss": 0.3013,
635
  "step": 820
636
  },
637
  {
638
  "epoch": 3.09,
639
+ "learning_rate": 2.3617295227773805e-06,
640
+ "loss": 0.3181,
641
  "step": 830
642
  },
643
  {
644
  "epoch": 3.12,
645
+ "learning_rate": 2.101512507710146e-06,
646
+ "loss": 0.326,
647
  "step": 840
648
  },
649
  {
650
  "epoch": 3.16,
651
+ "learning_rate": 1.8553997993420495e-06,
652
+ "loss": 0.305,
653
  "step": 850
654
  },
655
  {
656
  "epoch": 3.16,
657
+ "eval_loss": 0.3930993974208832,
658
+ "eval_runtime": 10.3853,
659
+ "eval_samples_per_second": 11.555,
660
+ "eval_steps_per_second": 2.889,
661
  "step": 850
662
  },
663
  {
664
  "epoch": 3.2,
665
+ "learning_rate": 1.623660518306293e-06,
666
+ "loss": 0.2808,
667
  "step": 860
668
  },
669
  {
670
  "epoch": 3.23,
671
+ "learning_rate": 1.4065480681043319e-06,
672
+ "loss": 0.3079,
673
  "step": 870
674
  },
675
  {
676
  "epoch": 3.27,
677
+ "learning_rate": 1.2042998580128488e-06,
678
+ "loss": 0.3259,
679
  "step": 880
680
  },
681
  {
682
  "epoch": 3.31,
683
+ "learning_rate": 1.0171370434802018e-06,
684
+ "loss": 0.3016,
685
  "step": 890
686
  },
687
  {
688
  "epoch": 3.35,
689
+ "learning_rate": 8.452642842961845e-07,
690
+ "loss": 0.3248,
691
  "step": 900
692
  },
693
  {
694
  "epoch": 3.35,
695
+ "eval_loss": 0.393511027097702,
696
+ "eval_runtime": 10.3784,
697
+ "eval_samples_per_second": 11.562,
698
+ "eval_steps_per_second": 2.891,
699
  "step": 900
700
  },
701
  {
702
  "epoch": 3.38,
703
+ "learning_rate": 6.888695207995532e-07,
704
+ "loss": 0.3202,
705
  "step": 910
706
  },
707
  {
708
  "epoch": 3.42,
709
+ "learning_rate": 5.481237683680291e-07,
710
+ "loss": 0.3202,
711
  "step": 920
712
  },
713
  {
714
  "epoch": 3.46,
715
+ "learning_rate": 4.231809304154849e-07,
716
+ "loss": 0.2963,
717
  "step": 930
718
  },
719
  {
720
  "epoch": 3.49,
721
+ "learning_rate": 3.1417763010083033e-07,
722
+ "loss": 0.298,
723
  "step": 940
724
  },
725
  {
726
  "epoch": 3.53,
727
+ "learning_rate": 2.2123306093259022e-07,
728
+ "loss": 0.3363,
729
  "step": 950
730
  },
731
  {
732
  "epoch": 3.53,
733
+ "eval_loss": 0.3934156000614166,
734
+ "eval_runtime": 10.3847,
735
+ "eval_samples_per_second": 11.556,
736
+ "eval_steps_per_second": 2.889,
737
  "step": 950
738
  },
739
  {
740
  "epoch": 3.57,
741
+ "learning_rate": 1.4444885643255136e-07,
742
+ "loss": 0.3097,
743
  "step": 960
744
  },
745
  {
746
  "epoch": 3.61,
747
+ "learning_rate": 8.390897900099781e-08,
748
+ "loss": 0.2979,
749
  "step": 970
750
  },
751
  {
752
  "epoch": 3.64,
753
+ "learning_rate": 3.9679628105067643e-08,
754
+ "loss": 0.305,
755
  "step": 980
756
  },
757
  {
758
  "epoch": 3.68,
759
+ "learning_rate": 1.1809167890592388e-08,
760
+ "loss": 0.3314,
761
  "step": 990
762
  },
763
  {
764
  "epoch": 3.72,
765
+ "learning_rate": 3.280742966310646e-10,
766
+ "loss": 0.3032,
767
  "step": 1000
768
  },
769
  {
770
  "epoch": 3.72,
771
+ "eval_loss": 0.39346638321876526,
772
+ "eval_runtime": 10.3788,
773
+ "eval_samples_per_second": 11.562,
774
+ "eval_steps_per_second": 2.891,
775
  "step": 1000
776
  },
777
  {
778
  "epoch": 3.72,
779
  "step": 1000,
780
+ "total_flos": 1.7609161899297997e+17,
781
+ "train_loss": 0.3842643254995346,
782
+ "train_runtime": 3643.6441,
783
+ "train_samples_per_second": 1.098,
784
+ "train_steps_per_second": 0.274
785
  }
786
  ],
787
  "logging_steps": 10,
788
  "max_steps": 1000,
789
  "num_train_epochs": 4,
790
  "save_steps": 500,
791
+ "total_flos": 1.7609161899297997e+17,
792
  "trial_name": null,
793
  "trial_params": null
794
  }