vincentmin commited on
Commit
54e0d5b
·
1 Parent(s): c8e4489

End of training

Browse files
README.md CHANGED
@@ -5,6 +5,7 @@ tags:
5
  model-index:
6
  - name: llama-2-7b-reward-oasst1
7
  results: []
 
8
  ---
9
 
10
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -36,6 +37,17 @@ More information needed
36
 
37
  ## Training procedure
38
 
 
 
 
 
 
 
 
 
 
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
@@ -51,6 +63,7 @@ The following hyperparameters were used during training:
51
 
52
  ### Framework versions
53
 
 
54
  - Transformers 4.32.0.dev0
55
  - Pytorch 2.0.1+cu118
56
  - Datasets 2.14.0
 
5
  model-index:
6
  - name: llama-2-7b-reward-oasst1
7
  results: []
8
+ library_name: peft
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
37
 
38
  ## Training procedure
39
 
40
+
41
+ The following `bitsandbytes` quantization config was used during training:
42
+ - load_in_8bit: False
43
+ - load_in_4bit: True
44
+ - llm_int8_threshold: 6.0
45
+ - llm_int8_skip_modules: None
46
+ - llm_int8_enable_fp32_cpu_offload: False
47
+ - llm_int8_has_fp16_weight: False
48
+ - bnb_4bit_quant_type: nf4
49
+ - bnb_4bit_use_double_quant: False
50
+ - bnb_4bit_compute_dtype: float16
51
  ### Training hyperparameters
52
 
53
  The following hyperparameters were used during training:
 
63
 
64
  ### Framework versions
65
 
66
+ - PEFT 0.5.0.dev0
67
  - Transformers 4.32.0.dev0
68
  - Pytorch 2.0.1+cu118
69
  - Datasets 2.14.0
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca86272c09ee12ad51e3dd3fa34800ac25d445b24404b2334f5f602b82d01fb5
3
  size 33617169
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c28e6b880cb10257b54341041ba350a3702b791043b6f9c38cc5fdd4ea4beb96
3
  size 33617169
runs/Jul26_13-26-01_37f0fbb90905/events.out.tfevents.1690378681.37f0fbb90905.5486.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f5eb2378b8807874cde36b8d6b221bb892d2904e1a80158dd870bbdb3f98c60
3
- size 11119
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:281b0194e10344d5e0cad0182c29d82ec860dc9529dd505bce4bbc73c980cc11
3
+ size 12541
trainer_state.json CHANGED
@@ -1,500 +1,299 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7561820571018995,
5
- "global_step": 3165,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.01,
12
- "learning_rate": 1.978494623655914e-05,
13
- "loss": 1.0308,
14
  "step": 50
15
  },
16
  {
17
- "epoch": 0.02,
18
- "learning_rate": 1.954599761051374e-05,
19
- "loss": 0.7656,
20
  "step": 100
21
  },
22
  {
23
- "epoch": 0.04,
24
- "learning_rate": 1.930704898446834e-05,
25
- "loss": 0.8032,
26
  "step": 150
27
  },
28
  {
29
- "epoch": 0.05,
30
- "learning_rate": 1.906810035842294e-05,
31
- "loss": 0.7398,
32
  "step": 200
33
  },
34
  {
35
- "epoch": 0.06,
36
- "learning_rate": 1.882915173237754e-05,
37
- "loss": 0.7004,
38
  "step": 250
39
  },
40
  {
41
- "epoch": 0.06,
42
- "eval_accuracy": 0.6535433070866141,
43
- "eval_loss": 0.7174085378646851,
44
- "eval_runtime": 431.9798,
45
- "eval_samples_per_second": 2.058,
46
- "eval_steps_per_second": 2.058,
47
  "step": 250
48
  },
49
  {
50
- "epoch": 0.07,
51
- "learning_rate": 1.859020310633214e-05,
52
- "loss": 0.8009,
53
  "step": 300
54
  },
55
  {
56
- "epoch": 0.08,
57
- "learning_rate": 1.835125448028674e-05,
58
- "loss": 0.6551,
59
  "step": 350
60
  },
61
  {
62
- "epoch": 0.1,
63
- "learning_rate": 1.811230585424134e-05,
64
- "loss": 0.6348,
65
  "step": 400
66
  },
67
  {
68
- "epoch": 0.11,
69
- "learning_rate": 1.787335722819594e-05,
70
- "loss": 0.5236,
71
  "step": 450
72
  },
73
  {
74
- "epoch": 0.12,
75
- "learning_rate": 1.763440860215054e-05,
76
- "loss": 0.6735,
77
  "step": 500
78
  },
79
  {
80
- "epoch": 0.12,
81
- "eval_accuracy": 0.7007874015748031,
82
- "eval_loss": 0.6178467869758606,
83
- "eval_runtime": 430.5762,
84
- "eval_samples_per_second": 2.065,
85
- "eval_steps_per_second": 2.065,
86
  "step": 500
87
  },
88
  {
89
- "epoch": 0.13,
90
- "learning_rate": 1.7395459976105136e-05,
91
- "loss": 0.7371,
92
  "step": 550
93
  },
94
  {
95
- "epoch": 0.14,
96
- "learning_rate": 1.7156511350059738e-05,
97
- "loss": 0.6957,
98
  "step": 600
99
  },
100
  {
101
- "epoch": 0.16,
102
- "learning_rate": 1.691756272401434e-05,
103
- "loss": 0.592,
104
  "step": 650
105
  },
106
  {
107
- "epoch": 0.17,
108
- "learning_rate": 1.6678614097968937e-05,
109
- "loss": 0.5799,
110
  "step": 700
111
  },
112
  {
113
- "epoch": 0.18,
114
- "learning_rate": 1.6439665471923538e-05,
115
- "loss": 0.6165,
116
  "step": 750
117
  },
118
  {
119
- "epoch": 0.18,
120
- "eval_accuracy": 0.7424071991001124,
121
- "eval_loss": 0.5588846206665039,
122
- "eval_runtime": 431.475,
123
- "eval_samples_per_second": 2.06,
124
- "eval_steps_per_second": 2.06,
125
  "step": 750
126
  },
127
  {
128
- "epoch": 0.19,
129
- "learning_rate": 1.6200716845878136e-05,
130
- "loss": 0.5045,
131
  "step": 800
132
  },
133
  {
134
- "epoch": 0.2,
135
- "learning_rate": 1.5961768219832737e-05,
136
- "loss": 0.6213,
137
  "step": 850
138
  },
139
  {
140
- "epoch": 0.22,
141
- "learning_rate": 1.5722819593787338e-05,
142
- "loss": 0.6248,
143
  "step": 900
144
  },
145
  {
146
- "epoch": 0.23,
147
- "learning_rate": 1.5483870967741936e-05,
148
- "loss": 0.4998,
149
  "step": 950
150
  },
151
  {
152
- "epoch": 0.24,
153
- "learning_rate": 1.5244922341696537e-05,
154
- "loss": 0.6603,
155
  "step": 1000
156
  },
157
  {
158
- "epoch": 0.24,
159
- "eval_accuracy": 0.7559055118110236,
160
- "eval_loss": 0.5710476636886597,
161
- "eval_runtime": 432.0116,
162
- "eval_samples_per_second": 2.058,
163
- "eval_steps_per_second": 2.058,
164
  "step": 1000
165
  },
166
  {
167
- "epoch": 0.25,
168
- "learning_rate": 1.5005973715651137e-05,
169
- "loss": 0.5468,
170
  "step": 1050
171
  },
172
  {
173
- "epoch": 0.26,
174
- "learning_rate": 1.4767025089605736e-05,
175
- "loss": 0.511,
176
  "step": 1100
177
  },
178
  {
179
- "epoch": 0.27,
180
- "learning_rate": 1.4528076463560337e-05,
181
- "loss": 0.5575,
182
  "step": 1150
183
  },
184
  {
185
- "epoch": 0.29,
186
- "learning_rate": 1.4289127837514935e-05,
187
- "loss": 0.7658,
188
  "step": 1200
189
  },
190
  {
191
- "epoch": 0.3,
192
- "learning_rate": 1.4050179211469535e-05,
193
- "loss": 0.5575,
194
  "step": 1250
195
  },
196
  {
197
- "epoch": 0.3,
198
- "eval_accuracy": 0.7570303712035995,
199
- "eval_loss": 0.5421488881111145,
200
- "eval_runtime": 430.448,
201
- "eval_samples_per_second": 2.065,
202
- "eval_steps_per_second": 2.065,
203
  "step": 1250
204
  },
205
  {
206
- "epoch": 0.31,
207
- "learning_rate": 1.3811230585424136e-05,
208
- "loss": 0.6252,
209
  "step": 1300
210
  },
211
  {
212
- "epoch": 0.32,
213
- "learning_rate": 1.3572281959378735e-05,
214
- "loss": 0.4673,
215
  "step": 1350
216
  },
217
  {
218
- "epoch": 0.33,
219
- "learning_rate": 1.3333333333333333e-05,
220
- "loss": 0.5266,
221
  "step": 1400
222
  },
223
  {
224
- "epoch": 0.35,
225
- "learning_rate": 1.3094384707287935e-05,
226
- "loss": 0.6353,
227
  "step": 1450
228
  },
229
  {
230
- "epoch": 0.36,
231
- "learning_rate": 1.2855436081242534e-05,
232
- "loss": 0.589,
233
  "step": 1500
234
  },
235
  {
236
- "epoch": 0.36,
237
- "eval_accuracy": 0.750281214848144,
238
- "eval_loss": 0.5329739451408386,
239
- "eval_runtime": 429.8372,
240
- "eval_samples_per_second": 2.068,
241
- "eval_steps_per_second": 2.068,
242
  "step": 1500
243
  },
244
  {
245
- "epoch": 0.37,
246
- "learning_rate": 1.2616487455197134e-05,
247
- "loss": 0.6944,
248
  "step": 1550
249
  },
250
  {
251
- "epoch": 0.38,
252
- "learning_rate": 1.2377538829151735e-05,
253
- "loss": 0.4992,
254
  "step": 1600
255
  },
256
  {
257
- "epoch": 0.39,
258
- "learning_rate": 1.2138590203106333e-05,
259
- "loss": 0.6429,
260
  "step": 1650
261
  },
262
  {
263
- "epoch": 0.41,
264
- "learning_rate": 1.1899641577060932e-05,
265
- "loss": 0.5243,
266
  "step": 1700
267
  },
268
  {
269
- "epoch": 0.42,
270
- "learning_rate": 1.1660692951015533e-05,
271
- "loss": 0.5644,
272
- "step": 1750
273
- },
274
- {
275
- "epoch": 0.42,
276
- "eval_accuracy": 0.7559055118110236,
277
- "eval_loss": 0.5390347838401794,
278
- "eval_runtime": 429.8773,
279
- "eval_samples_per_second": 2.068,
280
- "eval_steps_per_second": 2.068,
281
  "step": 1750
282
  },
283
- {
284
- "epoch": 0.43,
285
- "learning_rate": 1.1421744324970133e-05,
286
- "loss": 0.6321,
287
- "step": 1800
288
- },
289
- {
290
- "epoch": 0.44,
291
- "learning_rate": 1.118279569892473e-05,
292
- "loss": 0.5939,
293
- "step": 1850
294
- },
295
- {
296
- "epoch": 0.45,
297
- "learning_rate": 1.0943847072879332e-05,
298
- "loss": 0.5477,
299
- "step": 1900
300
- },
301
- {
302
- "epoch": 0.47,
303
- "learning_rate": 1.0704898446833931e-05,
304
- "loss": 0.5171,
305
- "step": 1950
306
- },
307
- {
308
- "epoch": 0.48,
309
- "learning_rate": 1.0465949820788533e-05,
310
- "loss": 0.503,
311
- "step": 2000
312
- },
313
- {
314
- "epoch": 0.48,
315
- "eval_accuracy": 0.7592800899887514,
316
- "eval_loss": 0.5502843856811523,
317
- "eval_runtime": 429.7289,
318
- "eval_samples_per_second": 2.069,
319
- "eval_steps_per_second": 2.069,
320
- "step": 2000
321
- },
322
- {
323
- "epoch": 0.49,
324
- "learning_rate": 1.0227001194743132e-05,
325
- "loss": 0.6059,
326
- "step": 2050
327
- },
328
- {
329
- "epoch": 0.5,
330
- "learning_rate": 9.98805256869773e-06,
331
- "loss": 0.5716,
332
- "step": 2100
333
- },
334
- {
335
- "epoch": 0.51,
336
- "learning_rate": 9.749103942652331e-06,
337
- "loss": 0.5367,
338
- "step": 2150
339
- },
340
- {
341
- "epoch": 0.53,
342
- "learning_rate": 9.51015531660693e-06,
343
- "loss": 0.578,
344
- "step": 2200
345
- },
346
- {
347
- "epoch": 0.54,
348
- "learning_rate": 9.27120669056153e-06,
349
- "loss": 0.6361,
350
- "step": 2250
351
- },
352
- {
353
- "epoch": 0.54,
354
- "eval_accuracy": 0.7637795275590551,
355
- "eval_loss": 0.5347180366516113,
356
- "eval_runtime": 432.0598,
357
- "eval_samples_per_second": 2.058,
358
- "eval_steps_per_second": 2.058,
359
- "step": 2250
360
- },
361
- {
362
- "epoch": 0.55,
363
- "learning_rate": 9.03225806451613e-06,
364
- "loss": 0.5387,
365
- "step": 2300
366
- },
367
- {
368
- "epoch": 0.56,
369
- "learning_rate": 8.793309438470729e-06,
370
- "loss": 0.5409,
371
- "step": 2350
372
- },
373
- {
374
- "epoch": 0.57,
375
- "learning_rate": 8.55436081242533e-06,
376
- "loss": 0.4895,
377
- "step": 2400
378
- },
379
- {
380
- "epoch": 0.59,
381
- "learning_rate": 8.315412186379928e-06,
382
- "loss": 0.5966,
383
- "step": 2450
384
- },
385
- {
386
- "epoch": 0.6,
387
- "learning_rate": 8.07646356033453e-06,
388
- "loss": 0.517,
389
- "step": 2500
390
- },
391
- {
392
- "epoch": 0.6,
393
- "eval_accuracy": 0.7649043869516311,
394
- "eval_loss": 0.5409161448478699,
395
- "eval_runtime": 432.0753,
396
- "eval_samples_per_second": 2.058,
397
- "eval_steps_per_second": 2.058,
398
- "step": 2500
399
- },
400
- {
401
- "epoch": 0.61,
402
- "learning_rate": 7.837514934289129e-06,
403
- "loss": 0.5852,
404
- "step": 2550
405
- },
406
- {
407
- "epoch": 0.62,
408
- "learning_rate": 7.5985663082437275e-06,
409
- "loss": 0.5853,
410
- "step": 2600
411
- },
412
- {
413
- "epoch": 0.63,
414
- "learning_rate": 7.359617682198328e-06,
415
- "loss": 0.6096,
416
- "step": 2650
417
- },
418
- {
419
- "epoch": 0.65,
420
- "learning_rate": 7.120669056152928e-06,
421
- "loss": 0.6285,
422
- "step": 2700
423
- },
424
- {
425
- "epoch": 0.66,
426
- "learning_rate": 6.881720430107528e-06,
427
- "loss": 0.5481,
428
- "step": 2750
429
- },
430
- {
431
- "epoch": 0.66,
432
- "eval_accuracy": 0.7525309336332958,
433
- "eval_loss": 0.5184136033058167,
434
- "eval_runtime": 431.9074,
435
- "eval_samples_per_second": 2.058,
436
- "eval_steps_per_second": 2.058,
437
- "step": 2750
438
- },
439
- {
440
- "epoch": 0.67,
441
- "learning_rate": 6.642771804062127e-06,
442
- "loss": 0.6294,
443
- "step": 2800
444
- },
445
- {
446
- "epoch": 0.68,
447
- "learning_rate": 6.403823178016727e-06,
448
- "loss": 0.5585,
449
- "step": 2850
450
- },
451
- {
452
- "epoch": 0.69,
453
- "learning_rate": 6.164874551971327e-06,
454
- "loss": 0.5914,
455
- "step": 2900
456
- },
457
  {
458
  "epoch": 0.7,
459
- "learning_rate": 5.925925925925926e-06,
460
- "loss": 0.5831,
461
- "step": 2950
462
- },
463
- {
464
- "epoch": 0.72,
465
- "learning_rate": 5.686977299880526e-06,
466
- "loss": 0.6036,
467
- "step": 3000
468
  },
469
  {
470
  "epoch": 0.72,
471
- "eval_accuracy": 0.7592800899887514,
472
- "eval_loss": 0.5061925649642944,
473
- "eval_runtime": 431.9996,
474
- "eval_samples_per_second": 2.058,
475
- "eval_steps_per_second": 2.058,
476
- "step": 3000
477
- },
478
- {
479
- "epoch": 0.73,
480
- "learning_rate": 5.4480286738351265e-06,
481
- "loss": 0.4684,
482
- "step": 3050
483
  },
484
  {
485
  "epoch": 0.74,
486
- "learning_rate": 5.209080047789725e-06,
487
- "loss": 0.5459,
488
- "step": 3100
489
- },
490
- {
491
- "epoch": 0.75,
492
- "learning_rate": 4.9701314217443256e-06,
493
- "loss": 0.5358,
494
- "step": 3150
495
  }
496
  ],
497
- "max_steps": 4185,
498
  "num_train_epochs": 1,
499
  "total_flos": 0.0,
500
  "trial_name": null,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7584,
5
+ "global_step": 1896,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.02,
12
+ "learning_rate": 1.9624e-05,
13
+ "loss": 0.9388,
14
  "step": 50
15
  },
16
  {
17
+ "epoch": 0.04,
18
+ "learning_rate": 1.9232e-05,
19
+ "loss": 0.9522,
20
  "step": 100
21
  },
22
  {
23
+ "epoch": 0.06,
24
+ "learning_rate": 1.8832000000000002e-05,
25
+ "loss": 0.8005,
26
  "step": 150
27
  },
28
  {
29
+ "epoch": 0.08,
30
+ "learning_rate": 1.8432000000000002e-05,
31
+ "loss": 0.6908,
32
  "step": 200
33
  },
34
  {
35
+ "epoch": 0.1,
36
+ "learning_rate": 1.8032e-05,
37
+ "loss": 0.7927,
38
  "step": 250
39
  },
40
  {
41
+ "epoch": 0.1,
42
+ "eval_accuracy": 0.6659167604049494,
43
+ "eval_loss": 0.7351371645927429,
44
+ "eval_runtime": 434.6411,
45
+ "eval_samples_per_second": 2.045,
46
+ "eval_steps_per_second": 2.045,
47
  "step": 250
48
  },
49
  {
50
+ "epoch": 0.12,
51
+ "learning_rate": 1.764e-05,
52
+ "loss": 0.5962,
53
  "step": 300
54
  },
55
  {
56
+ "epoch": 0.14,
57
+ "learning_rate": 1.724e-05,
58
+ "loss": 0.6989,
59
  "step": 350
60
  },
61
  {
62
+ "epoch": 0.16,
63
+ "learning_rate": 1.684e-05,
64
+ "loss": 0.7051,
65
  "step": 400
66
  },
67
  {
68
+ "epoch": 0.18,
69
+ "learning_rate": 1.6440000000000002e-05,
70
+ "loss": 0.6664,
71
  "step": 450
72
  },
73
  {
74
+ "epoch": 0.2,
75
+ "learning_rate": 1.6040000000000002e-05,
76
+ "loss": 0.6547,
77
  "step": 500
78
  },
79
  {
80
+ "epoch": 0.2,
81
+ "eval_accuracy": 0.7041619797525309,
82
+ "eval_loss": 0.6934666037559509,
83
+ "eval_runtime": 436.2176,
84
+ "eval_samples_per_second": 2.038,
85
+ "eval_steps_per_second": 2.038,
86
  "step": 500
87
  },
88
  {
89
+ "epoch": 0.22,
90
+ "learning_rate": 1.5640000000000003e-05,
91
+ "loss": 0.665,
92
  "step": 550
93
  },
94
  {
95
+ "epoch": 0.24,
96
+ "learning_rate": 1.5240000000000001e-05,
97
+ "loss": 0.654,
98
  "step": 600
99
  },
100
  {
101
+ "epoch": 0.26,
102
+ "learning_rate": 1.4840000000000002e-05,
103
+ "loss": 0.6714,
104
  "step": 650
105
  },
106
  {
107
+ "epoch": 0.28,
108
+ "learning_rate": 1.444e-05,
109
+ "loss": 0.7395,
110
  "step": 700
111
  },
112
  {
113
+ "epoch": 0.3,
114
+ "learning_rate": 1.4040000000000001e-05,
115
+ "loss": 0.5393,
116
  "step": 750
117
  },
118
  {
119
+ "epoch": 0.3,
120
+ "eval_accuracy": 0.7142857142857143,
121
+ "eval_loss": 0.621578574180603,
122
+ "eval_runtime": 436.3187,
123
+ "eval_samples_per_second": 2.038,
124
+ "eval_steps_per_second": 2.038,
125
  "step": 750
126
  },
127
  {
128
+ "epoch": 0.32,
129
+ "learning_rate": 1.3640000000000002e-05,
130
+ "loss": 0.5185,
131
  "step": 800
132
  },
133
  {
134
+ "epoch": 0.34,
135
+ "learning_rate": 1.3240000000000002e-05,
136
+ "loss": 0.6009,
137
  "step": 850
138
  },
139
  {
140
+ "epoch": 0.36,
141
+ "learning_rate": 1.284e-05,
142
+ "loss": 0.6588,
143
  "step": 900
144
  },
145
  {
146
+ "epoch": 0.38,
147
+ "learning_rate": 1.2440000000000001e-05,
148
+ "loss": 0.6022,
149
  "step": 950
150
  },
151
  {
152
+ "epoch": 0.4,
153
+ "learning_rate": 1.204e-05,
154
+ "loss": 0.7316,
155
  "step": 1000
156
  },
157
  {
158
+ "epoch": 0.4,
159
+ "eval_accuracy": 0.734533183352081,
160
+ "eval_loss": 0.5916205644607544,
161
+ "eval_runtime": 436.6514,
162
+ "eval_samples_per_second": 2.036,
163
+ "eval_steps_per_second": 2.036,
164
  "step": 1000
165
  },
166
  {
167
+ "epoch": 0.42,
168
+ "learning_rate": 1.164e-05,
169
+ "loss": 0.6086,
170
  "step": 1050
171
  },
172
  {
173
+ "epoch": 0.44,
174
+ "learning_rate": 1.1240000000000002e-05,
175
+ "loss": 0.5806,
176
  "step": 1100
177
  },
178
  {
179
+ "epoch": 0.46,
180
+ "learning_rate": 1.0840000000000001e-05,
181
+ "loss": 0.5992,
182
  "step": 1150
183
  },
184
  {
185
+ "epoch": 0.48,
186
+ "learning_rate": 1.0440000000000002e-05,
187
+ "loss": 0.5807,
188
  "step": 1200
189
  },
190
  {
191
+ "epoch": 0.5,
192
+ "learning_rate": 1.004e-05,
193
+ "loss": 0.5667,
194
  "step": 1250
195
  },
196
  {
197
+ "epoch": 0.5,
198
+ "eval_accuracy": 0.734533183352081,
199
+ "eval_loss": 0.5785398483276367,
200
+ "eval_runtime": 436.2096,
201
+ "eval_samples_per_second": 2.038,
202
+ "eval_steps_per_second": 2.038,
203
  "step": 1250
204
  },
205
  {
206
+ "epoch": 0.52,
207
+ "learning_rate": 9.640000000000001e-06,
208
+ "loss": 0.4989,
209
  "step": 1300
210
  },
211
  {
212
+ "epoch": 0.54,
213
+ "learning_rate": 9.240000000000001e-06,
214
+ "loss": 0.6015,
215
  "step": 1350
216
  },
217
  {
218
+ "epoch": 0.56,
219
+ "learning_rate": 8.848e-06,
220
+ "loss": 0.5728,
221
  "step": 1400
222
  },
223
  {
224
+ "epoch": 0.58,
225
+ "learning_rate": 8.448000000000001e-06,
226
+ "loss": 0.6285,
227
  "step": 1450
228
  },
229
  {
230
+ "epoch": 0.6,
231
+ "learning_rate": 8.048e-06,
232
+ "loss": 0.498,
233
  "step": 1500
234
  },
235
  {
236
+ "epoch": 0.6,
237
+ "eval_accuracy": 0.7435320584926884,
238
+ "eval_loss": 0.5632913708686829,
239
+ "eval_runtime": 436.3374,
240
+ "eval_samples_per_second": 2.037,
241
+ "eval_steps_per_second": 2.037,
242
  "step": 1500
243
  },
244
  {
245
+ "epoch": 0.62,
246
+ "learning_rate": 7.648e-06,
247
+ "loss": 0.5134,
248
  "step": 1550
249
  },
250
  {
251
+ "epoch": 0.64,
252
+ "learning_rate": 7.248000000000001e-06,
253
+ "loss": 0.4582,
254
  "step": 1600
255
  },
256
  {
257
+ "epoch": 0.66,
258
+ "learning_rate": 6.848e-06,
259
+ "loss": 0.534,
260
  "step": 1650
261
  },
262
  {
263
+ "epoch": 0.68,
264
+ "learning_rate": 6.448000000000001e-06,
265
+ "loss": 0.6765,
266
  "step": 1700
267
  },
268
  {
269
+ "epoch": 0.7,
270
+ "learning_rate": 6.048e-06,
271
+ "loss": 0.6598,
 
 
 
 
 
 
 
 
 
272
  "step": 1750
273
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  {
275
  "epoch": 0.7,
276
+ "eval_accuracy": 0.7457817772778402,
277
+ "eval_loss": 0.565944254398346,
278
+ "eval_runtime": 435.7582,
279
+ "eval_samples_per_second": 2.04,
280
+ "eval_steps_per_second": 2.04,
281
+ "step": 1750
 
 
 
282
  },
283
  {
284
  "epoch": 0.72,
285
+ "learning_rate": 5.648e-06,
286
+ "loss": 0.5006,
287
+ "step": 1800
 
 
 
 
 
 
 
 
 
288
  },
289
  {
290
  "epoch": 0.74,
291
+ "learning_rate": 5.248000000000001e-06,
292
+ "loss": 0.579,
293
+ "step": 1850
 
 
 
 
 
 
294
  }
295
  ],
296
+ "max_steps": 2500,
297
  "num_train_epochs": 1,
298
  "total_flos": 0.0,
299
  "trial_name": null,