bitsoko committed on
Commit
4e61dfd
·
verified ·
1 Parent(s): dd758b1

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -10,7 +10,7 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 4,
14
  "lora_dropout": 0,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": "unsloth",
22
  "target_modules": [
23
- "q_proj",
24
- "k_proj",
25
  "up_proj",
26
- "gate_proj",
27
- "v_proj",
28
  "down_proj",
29
- "o_proj"
 
 
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 8,
14
  "lora_dropout": 0,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
 
20
  "rank_pattern": {},
21
  "revision": "unsloth",
22
  "target_modules": [
 
 
23
  "up_proj",
24
+ "o_proj",
 
25
  "down_proj",
26
+ "q_proj",
27
+ "v_proj",
28
+ "gate_proj",
29
+ "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b4682692992db075f9024f2c63eac83eb305915bd6c61234d4a5deca1fac75c
3
  size 1912664024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d918b445e9c66a0ce0e4e84d9061fa2984ef4baa2b691afe223994e68094059d
3
  size 1912664024
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42f24b4f52bd0b8f9ac26e16e78879f5a8c4c27264adc49893b3fc5b8423ce1e
3
- size 958697812
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:420ae61f406a1716b041f637974c538814787f0c36b6cb165d38598f7461778f
3
+ size 958697364
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0a61ccdb5b3252599d2e2432ba4f211122102a4126260f1358927b77c71653b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:376fa8cdca401e75879a433c4cf57e5c31b5973d7635f7e690ffeb05b513101b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b183230938bcc01af8f0f0d377dbe65feea9f53394705c31ed6dde5c6bf91e43
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43eaf6e8b9b7e97d802563efbae1976d32c9bf9869f2ff306873ea7672065324
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,537 +1,42 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.04294742008712191,
5
  "eval_steps": 20,
6
- "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0012270691453463403,
13
- "grad_norm": 0.05003070831298828,
14
  "learning_rate": 0.00019981588314717073,
15
- "loss": 2.6972,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.0012270691453463403,
20
- "eval_loss": 2.2967841625213623,
21
- "eval_runtime": 23.2641,
22
- "eval_samples_per_second": 4.298,
23
- "eval_steps_per_second": 0.559,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.0024541382906926807,
28
- "grad_norm": 0.07180789858102798,
29
  "learning_rate": 0.00019957039401006504,
30
- "loss": 2.2022,
31
  "step": 40
32
  },
33
  {
34
  "epoch": 0.0024541382906926807,
35
- "eval_loss": 2.068006992340088,
36
- "eval_runtime": 23.5719,
37
- "eval_samples_per_second": 4.242,
38
- "eval_steps_per_second": 0.552,
39
- "step": 40
40
- },
41
- {
42
- "epoch": 0.003681207436039021,
43
- "grad_norm": 0.08049603551626205,
44
- "learning_rate": 0.00019932490487295938,
45
- "loss": 2.0529,
46
- "step": 60
47
- },
48
- {
49
- "epoch": 0.003681207436039021,
50
- "eval_loss": 1.9338455200195312,
51
- "eval_runtime": 23.1495,
52
- "eval_samples_per_second": 4.32,
53
- "eval_steps_per_second": 0.562,
54
- "step": 60
55
- },
56
- {
57
- "epoch": 0.004908276581385361,
58
- "grad_norm": 0.08653070032596588,
59
- "learning_rate": 0.00019907941573585368,
60
- "loss": 1.9395,
61
- "step": 80
62
- },
63
- {
64
- "epoch": 0.004908276581385361,
65
- "eval_loss": 1.8689138889312744,
66
- "eval_runtime": 23.7275,
67
- "eval_samples_per_second": 4.215,
68
- "eval_steps_per_second": 0.548,
69
- "step": 80
70
- },
71
- {
72
- "epoch": 0.006135345726731701,
73
- "grad_norm": 0.08481493592262268,
74
- "learning_rate": 0.00019883392659874802,
75
- "loss": 1.8773,
76
- "step": 100
77
- },
78
- {
79
- "epoch": 0.006135345726731701,
80
- "eval_loss": 1.8180441856384277,
81
- "eval_runtime": 23.5705,
82
- "eval_samples_per_second": 4.243,
83
- "eval_steps_per_second": 0.552,
84
- "step": 100
85
- },
86
- {
87
- "epoch": 0.007362414872078042,
88
- "grad_norm": 0.11568216979503632,
89
- "learning_rate": 0.00019858843746164233,
90
- "loss": 1.7827,
91
- "step": 120
92
- },
93
- {
94
- "epoch": 0.007362414872078042,
95
- "eval_loss": 1.774067997932434,
96
- "eval_runtime": 23.9722,
97
- "eval_samples_per_second": 4.172,
98
- "eval_steps_per_second": 0.542,
99
- "step": 120
100
- },
101
- {
102
- "epoch": 0.008589484017424381,
103
- "grad_norm": 0.10869361460208893,
104
- "learning_rate": 0.00019834294832453666,
105
- "loss": 1.812,
106
- "step": 140
107
- },
108
- {
109
- "epoch": 0.008589484017424381,
110
- "eval_loss": 1.737804889678955,
111
- "eval_runtime": 23.519,
112
- "eval_samples_per_second": 4.252,
113
- "eval_steps_per_second": 0.553,
114
- "step": 140
115
- },
116
- {
117
- "epoch": 0.009816553162770723,
118
- "grad_norm": 0.0976206362247467,
119
- "learning_rate": 0.00019809745918743097,
120
- "loss": 1.74,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 0.009816553162770723,
125
- "eval_loss": 1.700899600982666,
126
- "eval_runtime": 23.2347,
127
- "eval_samples_per_second": 4.304,
128
- "eval_steps_per_second": 0.56,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.011043622308117063,
133
- "grad_norm": 0.1123971939086914,
134
- "learning_rate": 0.00019785197005032528,
135
- "loss": 1.787,
136
- "step": 180
137
- },
138
- {
139
- "epoch": 0.011043622308117063,
140
- "eval_loss": 1.6765294075012207,
141
- "eval_runtime": 23.6403,
142
- "eval_samples_per_second": 4.23,
143
- "eval_steps_per_second": 0.55,
144
- "step": 180
145
- },
146
- {
147
- "epoch": 0.012270691453463402,
148
- "grad_norm": 0.10320968925952911,
149
- "learning_rate": 0.0001976064809132196,
150
- "loss": 1.7804,
151
- "step": 200
152
- },
153
- {
154
- "epoch": 0.012270691453463402,
155
- "eval_loss": 1.6563650369644165,
156
- "eval_runtime": 23.6381,
157
- "eval_samples_per_second": 4.23,
158
- "eval_steps_per_second": 0.55,
159
- "step": 200
160
- },
161
- {
162
- "epoch": 0.013497760598809742,
163
- "grad_norm": 0.14491896331310272,
164
- "learning_rate": 0.00019736099177611392,
165
- "loss": 1.7043,
166
- "step": 220
167
- },
168
- {
169
- "epoch": 0.013497760598809742,
170
- "eval_loss": 1.6346065998077393,
171
- "eval_runtime": 23.7121,
172
  "eval_samples_per_second": 4.217,
173
  "eval_steps_per_second": 0.548,
174
- "step": 220
175
- },
176
- {
177
- "epoch": 0.014724829744156084,
178
- "grad_norm": 0.12502990663051605,
179
- "learning_rate": 0.00019711550263900825,
180
- "loss": 1.7345,
181
- "step": 240
182
- },
183
- {
184
- "epoch": 0.014724829744156084,
185
- "eval_loss": 1.6147732734680176,
186
- "eval_runtime": 23.5936,
187
- "eval_samples_per_second": 4.238,
188
- "eval_steps_per_second": 0.551,
189
- "step": 240
190
- },
191
- {
192
- "epoch": 0.015951898889502422,
193
- "grad_norm": 0.1230228915810585,
194
- "learning_rate": 0.00019687001350190256,
195
- "loss": 1.7338,
196
- "step": 260
197
- },
198
- {
199
- "epoch": 0.015951898889502422,
200
- "eval_loss": 1.5957908630371094,
201
- "eval_runtime": 23.389,
202
- "eval_samples_per_second": 4.276,
203
- "eval_steps_per_second": 0.556,
204
- "step": 260
205
- },
206
- {
207
- "epoch": 0.017178968034848762,
208
- "grad_norm": 0.12000931799411774,
209
- "learning_rate": 0.00019662452436479687,
210
- "loss": 1.7143,
211
- "step": 280
212
- },
213
- {
214
- "epoch": 0.017178968034848762,
215
- "eval_loss": 1.585697889328003,
216
- "eval_runtime": 23.566,
217
- "eval_samples_per_second": 4.243,
218
- "eval_steps_per_second": 0.552,
219
- "step": 280
220
- },
221
- {
222
- "epoch": 0.018406037180195105,
223
- "grad_norm": 0.1442350149154663,
224
- "learning_rate": 0.00019637903522769118,
225
- "loss": 1.6406,
226
- "step": 300
227
- },
228
- {
229
- "epoch": 0.018406037180195105,
230
- "eval_loss": 1.5710804462432861,
231
- "eval_runtime": 23.5083,
232
- "eval_samples_per_second": 4.254,
233
- "eval_steps_per_second": 0.553,
234
- "step": 300
235
- },
236
- {
237
- "epoch": 0.019633106325541445,
238
- "grad_norm": 0.09555982798337936,
239
- "learning_rate": 0.00019613354609058549,
240
- "loss": 1.6213,
241
- "step": 320
242
- },
243
- {
244
- "epoch": 0.019633106325541445,
245
- "eval_loss": 1.5556869506835938,
246
- "eval_runtime": 23.5239,
247
- "eval_samples_per_second": 4.251,
248
- "eval_steps_per_second": 0.553,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.020860175470887785,
253
- "grad_norm": 0.13320715725421906,
254
- "learning_rate": 0.00019588805695347982,
255
- "loss": 1.6956,
256
- "step": 340
257
- },
258
- {
259
- "epoch": 0.020860175470887785,
260
- "eval_loss": 1.5424914360046387,
261
- "eval_runtime": 23.6064,
262
- "eval_samples_per_second": 4.236,
263
- "eval_steps_per_second": 0.551,
264
- "step": 340
265
- },
266
- {
267
- "epoch": 0.022087244616234125,
268
- "grad_norm": 0.12061001360416412,
269
- "learning_rate": 0.00019564256781637413,
270
- "loss": 1.6589,
271
- "step": 360
272
- },
273
- {
274
- "epoch": 0.022087244616234125,
275
- "eval_loss": 1.528477430343628,
276
- "eval_runtime": 23.6796,
277
- "eval_samples_per_second": 4.223,
278
- "eval_steps_per_second": 0.549,
279
- "step": 360
280
- },
281
- {
282
- "epoch": 0.023314313761580465,
283
- "grad_norm": 0.14327766001224518,
284
- "learning_rate": 0.00019539707867926844,
285
- "loss": 1.5946,
286
- "step": 380
287
- },
288
- {
289
- "epoch": 0.023314313761580465,
290
- "eval_loss": 1.52202570438385,
291
- "eval_runtime": 23.6756,
292
- "eval_samples_per_second": 4.224,
293
- "eval_steps_per_second": 0.549,
294
- "step": 380
295
- },
296
- {
297
- "epoch": 0.024541382906926805,
298
- "grad_norm": 0.12291988730430603,
299
- "learning_rate": 0.00019515158954216277,
300
- "loss": 1.5366,
301
- "step": 400
302
- },
303
- {
304
- "epoch": 0.024541382906926805,
305
- "eval_loss": 1.507960319519043,
306
- "eval_runtime": 23.6216,
307
- "eval_samples_per_second": 4.233,
308
- "eval_steps_per_second": 0.55,
309
- "step": 400
310
- },
311
- {
312
- "epoch": 0.025768452052273145,
313
- "grad_norm": 0.15288175642490387,
314
- "learning_rate": 0.00019490610040505708,
315
- "loss": 1.5829,
316
- "step": 420
317
- },
318
- {
319
- "epoch": 0.025768452052273145,
320
- "eval_loss": 1.4994325637817383,
321
- "eval_runtime": 23.6368,
322
- "eval_samples_per_second": 4.231,
323
- "eval_steps_per_second": 0.55,
324
- "step": 420
325
- },
326
- {
327
- "epoch": 0.026995521197619485,
328
- "grad_norm": 0.13319191336631775,
329
- "learning_rate": 0.0001946606112679514,
330
- "loss": 1.5523,
331
- "step": 440
332
- },
333
- {
334
- "epoch": 0.026995521197619485,
335
- "eval_loss": 1.4956778287887573,
336
- "eval_runtime": 23.6921,
337
- "eval_samples_per_second": 4.221,
338
- "eval_steps_per_second": 0.549,
339
- "step": 440
340
- },
341
- {
342
- "epoch": 0.028222590342965825,
343
- "grad_norm": 0.14759239554405212,
344
- "learning_rate": 0.00019441512213084572,
345
- "loss": 1.5735,
346
- "step": 460
347
- },
348
- {
349
- "epoch": 0.028222590342965825,
350
- "eval_loss": 1.486402988433838,
351
- "eval_runtime": 23.2911,
352
- "eval_samples_per_second": 4.293,
353
- "eval_steps_per_second": 0.558,
354
- "step": 460
355
- },
356
- {
357
- "epoch": 0.029449659488312168,
358
- "grad_norm": 0.11428073793649673,
359
- "learning_rate": 0.00019416963299374006,
360
- "loss": 1.5788,
361
- "step": 480
362
- },
363
- {
364
- "epoch": 0.029449659488312168,
365
- "eval_loss": 1.4712104797363281,
366
- "eval_runtime": 23.4851,
367
- "eval_samples_per_second": 4.258,
368
- "eval_steps_per_second": 0.554,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.030676728633658508,
373
- "grad_norm": 0.11649870127439499,
374
- "learning_rate": 0.00019392414385663436,
375
- "loss": 1.5667,
376
- "step": 500
377
- },
378
- {
379
- "epoch": 0.030676728633658508,
380
- "eval_loss": 1.4620152711868286,
381
- "eval_runtime": 23.5455,
382
- "eval_samples_per_second": 4.247,
383
- "eval_steps_per_second": 0.552,
384
- "step": 500
385
- },
386
- {
387
- "epoch": 0.031903797779004844,
388
- "grad_norm": 0.16019868850708008,
389
- "learning_rate": 0.00019367865471952867,
390
- "loss": 1.4778,
391
- "step": 520
392
- },
393
- {
394
- "epoch": 0.031903797779004844,
395
- "eval_loss": 1.4597880840301514,
396
- "eval_runtime": 23.624,
397
- "eval_samples_per_second": 4.233,
398
- "eval_steps_per_second": 0.55,
399
- "step": 520
400
- },
401
- {
402
- "epoch": 0.03313086692435119,
403
- "grad_norm": 0.1370091289281845,
404
- "learning_rate": 0.00019343316558242298,
405
- "loss": 1.5531,
406
- "step": 540
407
- },
408
- {
409
- "epoch": 0.03313086692435119,
410
- "eval_loss": 1.443243384361267,
411
- "eval_runtime": 23.5537,
412
- "eval_samples_per_second": 4.246,
413
- "eval_steps_per_second": 0.552,
414
- "step": 540
415
- },
416
- {
417
- "epoch": 0.034357936069697524,
418
- "grad_norm": 0.1211417093873024,
419
- "learning_rate": 0.0001931876764453173,
420
- "loss": 1.5879,
421
- "step": 560
422
- },
423
- {
424
- "epoch": 0.034357936069697524,
425
- "eval_loss": 1.4466437101364136,
426
- "eval_runtime": 23.8508,
427
- "eval_samples_per_second": 4.193,
428
- "eval_steps_per_second": 0.545,
429
- "step": 560
430
- },
431
- {
432
- "epoch": 0.03558500521504387,
433
- "grad_norm": 0.14397528767585754,
434
- "learning_rate": 0.00019294218730821162,
435
- "loss": 1.5352,
436
- "step": 580
437
- },
438
- {
439
- "epoch": 0.03558500521504387,
440
- "eval_loss": 1.4339115619659424,
441
- "eval_runtime": 23.649,
442
- "eval_samples_per_second": 4.229,
443
- "eval_steps_per_second": 0.55,
444
- "step": 580
445
- },
446
- {
447
- "epoch": 0.03681207436039021,
448
- "grad_norm": 0.12468410283327103,
449
- "learning_rate": 0.00019269669817110593,
450
- "loss": 1.5045,
451
- "step": 600
452
- },
453
- {
454
- "epoch": 0.03681207436039021,
455
- "eval_loss": 1.4277862310409546,
456
- "eval_runtime": 23.647,
457
- "eval_samples_per_second": 4.229,
458
- "eval_steps_per_second": 0.55,
459
- "step": 600
460
- },
461
- {
462
- "epoch": 0.03803914350573655,
463
- "grad_norm": 0.1577584445476532,
464
- "learning_rate": 0.00019245120903400024,
465
- "loss": 1.5497,
466
- "step": 620
467
- },
468
- {
469
- "epoch": 0.03803914350573655,
470
- "eval_loss": 1.4203659296035767,
471
- "eval_runtime": 23.8622,
472
- "eval_samples_per_second": 4.191,
473
- "eval_steps_per_second": 0.545,
474
- "step": 620
475
- },
476
- {
477
- "epoch": 0.03926621265108289,
478
- "grad_norm": 0.12410438805818558,
479
- "learning_rate": 0.00019220571989689457,
480
- "loss": 1.503,
481
- "step": 640
482
- },
483
- {
484
- "epoch": 0.03926621265108289,
485
- "eval_loss": 1.4154139757156372,
486
- "eval_runtime": 23.4706,
487
- "eval_samples_per_second": 4.261,
488
- "eval_steps_per_second": 0.554,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.04049328179642923,
493
- "grad_norm": 0.13563913106918335,
494
- "learning_rate": 0.00019196023075978888,
495
- "loss": 1.4851,
496
- "step": 660
497
- },
498
- {
499
- "epoch": 0.04049328179642923,
500
- "eval_loss": 1.414802074432373,
501
- "eval_runtime": 23.3961,
502
- "eval_samples_per_second": 4.274,
503
- "eval_steps_per_second": 0.556,
504
- "step": 660
505
- },
506
- {
507
- "epoch": 0.04172035094177557,
508
- "grad_norm": 0.13915061950683594,
509
- "learning_rate": 0.00019171474162268321,
510
- "loss": 1.4847,
511
- "step": 680
512
- },
513
- {
514
- "epoch": 0.04172035094177557,
515
- "eval_loss": 1.4029760360717773,
516
- "eval_runtime": 23.6066,
517
- "eval_samples_per_second": 4.236,
518
- "eval_steps_per_second": 0.551,
519
- "step": 680
520
- },
521
- {
522
- "epoch": 0.04294742008712191,
523
- "grad_norm": 0.14418162405490875,
524
- "learning_rate": 0.00019146925248557752,
525
- "loss": 1.4724,
526
- "step": 700
527
- },
528
- {
529
- "epoch": 0.04294742008712191,
530
- "eval_loss": 1.4029196500778198,
531
- "eval_runtime": 23.5244,
532
- "eval_samples_per_second": 4.251,
533
- "eval_steps_per_second": 0.553,
534
- "step": 700
535
  }
536
  ],
537
  "logging_steps": 20,
@@ -539,7 +44,7 @@
539
  "num_input_tokens_seen": 0,
540
  "num_train_epochs": 1,
541
  "save_steps": 50,
542
- "total_flos": 5.08603168290816e+16,
543
  "train_batch_size": 2,
544
  "trial_name": null,
545
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.0030676728633658506,
5
  "eval_steps": 20,
6
+ "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0012270691453463403,
13
+ "grad_norm": 0.07455573976039886,
14
  "learning_rate": 0.00019981588314717073,
15
+ "loss": 2.6233,
16
  "step": 20
17
  },
18
  {
19
  "epoch": 0.0012270691453463403,
20
+ "eval_loss": 2.2018821239471436,
21
+ "eval_runtime": 23.583,
22
+ "eval_samples_per_second": 4.24,
23
+ "eval_steps_per_second": 0.551,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.0024541382906926807,
28
+ "grad_norm": 0.0842185765504837,
29
  "learning_rate": 0.00019957039401006504,
30
+ "loss": 2.1109,
31
  "step": 40
32
  },
33
  {
34
  "epoch": 0.0024541382906926807,
35
+ "eval_loss": 1.9889661073684692,
36
+ "eval_runtime": 23.7122,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  "eval_samples_per_second": 4.217,
38
  "eval_steps_per_second": 0.548,
39
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }
41
  ],
42
  "logging_steps": 20,
 
44
  "num_input_tokens_seen": 0,
45
  "num_train_epochs": 1,
46
  "save_steps": 50,
47
+ "total_flos": 3276518679244800.0,
48
  "train_batch_size": 2,
49
  "trial_name": null,
50
  "trial_params": null