sedrickkeh committed on
Commit
1efacab
·
verified ·
1 Parent(s): 7598eae

End of training

Browse files
README.md CHANGED
@@ -4,6 +4,7 @@ license: llama3.1
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: OH_DCFT_V3_wo_evol_instruct_140k
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # OH_DCFT_V3_wo_evol_instruct_140k
17
 
18
- This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 0.6134
21
 
 
4
  base_model: meta-llama/Llama-3.1-8B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: OH_DCFT_V3_wo_evol_instruct_140k
 
16
 
17
  # OH_DCFT_V3_wo_evol_instruct_140k
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/OH_DCFT_V3_wo_evol_instruct_140k dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 0.6134
22
 
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.6164913177490234,
4
- "eval_runtime": 203.8494,
5
- "eval_samples_per_second": 49.836,
6
  "eval_steps_per_second": 0.392,
7
  "total_flos": 1894048365281280.0,
8
- "train_loss": 0.5957063028603713,
9
- "train_runtime": 34171.7918,
10
- "train_samples_per_second": 16.944,
11
  "train_steps_per_second": 0.033
12
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.6133805513381958,
4
+ "eval_runtime": 204.0253,
5
+ "eval_samples_per_second": 49.793,
6
  "eval_steps_per_second": 0.392,
7
  "total_flos": 1894048365281280.0,
8
+ "train_loss": 0.5970700682000076,
9
+ "train_runtime": 33949.4437,
10
+ "train_samples_per_second": 17.055,
11
  "train_steps_per_second": 0.033
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.6164913177490234,
4
- "eval_runtime": 203.8494,
5
- "eval_samples_per_second": 49.836,
6
  "eval_steps_per_second": 0.392
7
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.6133805513381958,
4
+ "eval_runtime": 204.0253,
5
+ "eval_samples_per_second": 49.793,
6
  "eval_steps_per_second": 0.392
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1894048365281280.0,
4
- "train_loss": 0.5957063028603713,
5
- "train_runtime": 34171.7918,
6
- "train_samples_per_second": 16.944,
7
  "train_steps_per_second": 0.033
8
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1894048365281280.0,
4
+ "train_loss": 0.5970700682000076,
5
+ "train_runtime": 33949.4437,
6
+ "train_samples_per_second": 17.055,
7
  "train_steps_per_second": 0.033
8
  }
trainer_state.json CHANGED
@@ -10,826 +10,826 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.026525198938992044,
13
- "grad_norm": 2.329897814074136,
14
  "learning_rate": 5e-06,
15
- "loss": 0.8884,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05305039787798409,
20
- "grad_norm": 3.8595726803473838,
21
  "learning_rate": 5e-06,
22
- "loss": 0.7961,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07957559681697612,
27
- "grad_norm": 1.096843401953888,
28
  "learning_rate": 5e-06,
29
- "loss": 0.7478,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10610079575596817,
34
- "grad_norm": 1.1211395081081943,
35
  "learning_rate": 5e-06,
36
- "loss": 0.719,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13262599469496023,
41
- "grad_norm": 1.4282528066175646,
42
  "learning_rate": 5e-06,
43
- "loss": 0.7052,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15915119363395225,
48
- "grad_norm": 0.7596233206271922,
49
  "learning_rate": 5e-06,
50
- "loss": 0.694,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.1856763925729443,
55
- "grad_norm": 1.2777068139599712,
56
  "learning_rate": 5e-06,
57
- "loss": 0.6882,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.21220159151193635,
62
- "grad_norm": 1.1383877936666695,
63
  "learning_rate": 5e-06,
64
- "loss": 0.6771,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.23872679045092837,
69
- "grad_norm": 1.0060703202699885,
70
  "learning_rate": 5e-06,
71
- "loss": 0.6806,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.26525198938992045,
76
- "grad_norm": 1.0179544972489034,
77
  "learning_rate": 5e-06,
78
- "loss": 0.6665,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2917771883289125,
83
- "grad_norm": 0.6720815026983992,
84
  "learning_rate": 5e-06,
85
- "loss": 0.6637,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.3183023872679045,
90
- "grad_norm": 0.9241587286794601,
91
  "learning_rate": 5e-06,
92
- "loss": 0.6546,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3448275862068966,
97
- "grad_norm": 0.8824991950738043,
98
  "learning_rate": 5e-06,
99
- "loss": 0.6522,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3713527851458886,
104
- "grad_norm": 0.8302884563721783,
105
  "learning_rate": 5e-06,
106
- "loss": 0.653,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3978779840848806,
111
- "grad_norm": 0.8327701504024553,
112
  "learning_rate": 5e-06,
113
- "loss": 0.6455,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4244031830238727,
118
- "grad_norm": 0.5492876289580202,
119
  "learning_rate": 5e-06,
120
- "loss": 0.6553,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4509283819628647,
125
- "grad_norm": 0.7119323938911869,
126
  "learning_rate": 5e-06,
127
- "loss": 0.6448,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.47745358090185674,
132
- "grad_norm": 0.8042997280772343,
133
  "learning_rate": 5e-06,
134
- "loss": 0.6403,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5039787798408488,
139
- "grad_norm": 0.7810674802506331,
140
  "learning_rate": 5e-06,
141
- "loss": 0.655,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5305039787798409,
146
- "grad_norm": 0.5391416674289061,
147
  "learning_rate": 5e-06,
148
- "loss": 0.6398,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5570291777188329,
153
- "grad_norm": 0.6885851666539189,
154
  "learning_rate": 5e-06,
155
- "loss": 0.6452,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.583554376657825,
160
- "grad_norm": 0.6168676023297931,
161
  "learning_rate": 5e-06,
162
- "loss": 0.6443,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.610079575596817,
167
- "grad_norm": 0.5971017602698001,
168
  "learning_rate": 5e-06,
169
- "loss": 0.636,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.636604774535809,
174
- "grad_norm": 0.6022582014461996,
175
  "learning_rate": 5e-06,
176
- "loss": 0.642,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6631299734748011,
181
- "grad_norm": 0.500260575545034,
182
  "learning_rate": 5e-06,
183
- "loss": 0.6344,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6896551724137931,
188
- "grad_norm": 0.537336125918856,
189
  "learning_rate": 5e-06,
190
- "loss": 0.639,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7161803713527851,
195
- "grad_norm": 0.5365620197699547,
196
  "learning_rate": 5e-06,
197
- "loss": 0.6403,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.7427055702917772,
202
- "grad_norm": 0.7683273733374665,
203
  "learning_rate": 5e-06,
204
- "loss": 0.6401,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.7692307692307693,
209
- "grad_norm": 0.5276716709186424,
210
  "learning_rate": 5e-06,
211
- "loss": 0.6338,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7957559681697612,
216
- "grad_norm": 0.5074059045052902,
217
  "learning_rate": 5e-06,
218
- "loss": 0.6371,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8222811671087533,
223
- "grad_norm": 0.67204833216127,
224
  "learning_rate": 5e-06,
225
- "loss": 0.639,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.8488063660477454,
230
- "grad_norm": 0.5619464110309291,
231
  "learning_rate": 5e-06,
232
- "loss": 0.6427,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.8753315649867374,
237
- "grad_norm": 0.5724558992220249,
238
  "learning_rate": 5e-06,
239
- "loss": 0.6294,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9018567639257294,
244
- "grad_norm": 0.619777380357849,
245
  "learning_rate": 5e-06,
246
- "loss": 0.6334,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.9283819628647215,
251
- "grad_norm": 0.7434000365286857,
252
  "learning_rate": 5e-06,
253
- "loss": 0.6329,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.9549071618037135,
258
- "grad_norm": 0.5490872236122448,
259
  "learning_rate": 5e-06,
260
- "loss": 0.6286,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.9814323607427056,
265
- "grad_norm": 0.584736306154939,
266
  "learning_rate": 5e-06,
267
- "loss": 0.6296,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 1.0,
272
- "eval_loss": 0.6237567663192749,
273
- "eval_runtime": 202.8984,
274
- "eval_samples_per_second": 50.069,
275
- "eval_steps_per_second": 0.394,
276
  "step": 377
277
  },
278
  {
279
  "epoch": 1.0079575596816976,
280
- "grad_norm": 1.012512560233075,
281
  "learning_rate": 5e-06,
282
- "loss": 0.614,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.0344827586206897,
287
- "grad_norm": 0.6423484790332888,
288
  "learning_rate": 5e-06,
289
- "loss": 0.5872,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.0610079575596818,
294
- "grad_norm": 0.6707809448038629,
295
  "learning_rate": 5e-06,
296
- "loss": 0.5889,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.0875331564986737,
301
- "grad_norm": 0.6650816829620442,
302
  "learning_rate": 5e-06,
303
- "loss": 0.5807,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.1140583554376657,
308
- "grad_norm": 0.6224384353734153,
309
  "learning_rate": 5e-06,
310
- "loss": 0.5915,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.1405835543766578,
315
- "grad_norm": 0.5684723509980193,
316
  "learning_rate": 5e-06,
317
- "loss": 0.5868,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.16710875331565,
322
- "grad_norm": 0.7059253953598508,
323
  "learning_rate": 5e-06,
324
- "loss": 0.5766,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.193633952254642,
329
- "grad_norm": 0.523842539770435,
330
  "learning_rate": 5e-06,
331
- "loss": 0.5848,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.2201591511936338,
336
- "grad_norm": 0.6714226112627073,
337
  "learning_rate": 5e-06,
338
- "loss": 0.5832,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.246684350132626,
343
- "grad_norm": 0.7286525903858807,
344
  "learning_rate": 5e-06,
345
- "loss": 0.589,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.273209549071618,
350
- "grad_norm": 0.52664344746201,
351
  "learning_rate": 5e-06,
352
- "loss": 0.5781,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.29973474801061,
357
- "grad_norm": 0.5394135765946012,
358
  "learning_rate": 5e-06,
359
- "loss": 0.5793,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.3262599469496021,
364
- "grad_norm": 0.5473942505095254,
365
  "learning_rate": 5e-06,
366
- "loss": 0.5865,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.3527851458885942,
371
- "grad_norm": 0.5239209443855906,
372
  "learning_rate": 5e-06,
373
- "loss": 0.5851,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.3793103448275863,
378
- "grad_norm": 0.638268567207473,
379
  "learning_rate": 5e-06,
380
- "loss": 0.5868,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.4058355437665782,
385
- "grad_norm": 0.5569448150591939,
386
  "learning_rate": 5e-06,
387
- "loss": 0.5851,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.4323607427055702,
392
- "grad_norm": 0.5739975276281734,
393
  "learning_rate": 5e-06,
394
- "loss": 0.5807,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.4588859416445623,
399
- "grad_norm": 0.5662395827613842,
400
  "learning_rate": 5e-06,
401
- "loss": 0.5768,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.4854111405835544,
406
- "grad_norm": 0.7163603774804506,
407
  "learning_rate": 5e-06,
408
- "loss": 0.5843,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.5119363395225465,
413
- "grad_norm": 0.7163733609893056,
414
  "learning_rate": 5e-06,
415
- "loss": 0.5917,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.5384615384615383,
420
- "grad_norm": 0.6180542754631709,
421
  "learning_rate": 5e-06,
422
- "loss": 0.5817,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.5649867374005306,
427
- "grad_norm": 0.5638514267037327,
428
  "learning_rate": 5e-06,
429
- "loss": 0.5862,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.5915119363395225,
434
- "grad_norm": 0.561015542053421,
435
  "learning_rate": 5e-06,
436
- "loss": 0.5815,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.6180371352785146,
441
- "grad_norm": 0.5019925924846618,
442
  "learning_rate": 5e-06,
443
- "loss": 0.5854,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.6445623342175066,
448
- "grad_norm": 0.6456145062380878,
449
  "learning_rate": 5e-06,
450
- "loss": 0.584,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.6710875331564987,
455
- "grad_norm": 0.6273758065445275,
456
  "learning_rate": 5e-06,
457
- "loss": 0.5794,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.6976127320954908,
462
- "grad_norm": 0.47537875001219887,
463
  "learning_rate": 5e-06,
464
- "loss": 0.5831,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.7241379310344827,
469
- "grad_norm": 0.5799224686148554,
470
  "learning_rate": 5e-06,
471
- "loss": 0.5876,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.750663129973475,
476
- "grad_norm": 0.6477709788042249,
477
  "learning_rate": 5e-06,
478
- "loss": 0.5786,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.7771883289124668,
483
- "grad_norm": 0.5398722777232823,
484
  "learning_rate": 5e-06,
485
- "loss": 0.5788,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.8037135278514589,
490
- "grad_norm": 0.503790352480121,
491
  "learning_rate": 5e-06,
492
- "loss": 0.5789,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.830238726790451,
497
- "grad_norm": 0.5980642131822352,
498
  "learning_rate": 5e-06,
499
- "loss": 0.5836,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.8567639257294428,
504
- "grad_norm": 0.5144731472377694,
505
  "learning_rate": 5e-06,
506
- "loss": 0.5818,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.8832891246684351,
511
- "grad_norm": 0.748547689970371,
512
  "learning_rate": 5e-06,
513
- "loss": 0.5882,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.909814323607427,
518
- "grad_norm": 0.5454719348703825,
519
  "learning_rate": 5e-06,
520
- "loss": 0.5857,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.936339522546419,
525
- "grad_norm": 0.537701148828446,
526
  "learning_rate": 5e-06,
527
- "loss": 0.5919,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.9628647214854111,
532
- "grad_norm": 0.5333812418899406,
533
  "learning_rate": 5e-06,
534
- "loss": 0.5804,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.9893899204244032,
539
- "grad_norm": 0.5945192673823688,
540
  "learning_rate": 5e-06,
541
- "loss": 0.5875,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 2.0,
546
- "eval_loss": 0.6138430833816528,
547
- "eval_runtime": 204.1448,
548
- "eval_samples_per_second": 49.764,
549
- "eval_steps_per_second": 0.392,
550
  "step": 754
551
  },
552
  {
553
  "epoch": 2.0159151193633953,
554
- "grad_norm": 0.70666737481995,
555
  "learning_rate": 5e-06,
556
- "loss": 0.5507,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.042440318302387,
561
- "grad_norm": 0.6653710012047342,
562
  "learning_rate": 5e-06,
563
- "loss": 0.5366,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.0689655172413794,
568
- "grad_norm": 0.6392050869043902,
569
  "learning_rate": 5e-06,
570
- "loss": 0.5362,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.0954907161803713,
575
- "grad_norm": 0.552417268591106,
576
  "learning_rate": 5e-06,
577
- "loss": 0.537,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.1220159151193636,
582
- "grad_norm": 0.5237418246816455,
583
  "learning_rate": 5e-06,
584
- "loss": 0.5387,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.1485411140583555,
589
- "grad_norm": 0.5931401358357342,
590
  "learning_rate": 5e-06,
591
- "loss": 0.5383,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.1750663129973473,
596
- "grad_norm": 0.5859178511093684,
597
  "learning_rate": 5e-06,
598
- "loss": 0.5385,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.2015915119363396,
603
- "grad_norm": 0.54174318871251,
604
  "learning_rate": 5e-06,
605
- "loss": 0.536,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.2281167108753315,
610
- "grad_norm": 0.6719837144625476,
611
  "learning_rate": 5e-06,
612
- "loss": 0.5443,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.2546419098143238,
617
- "grad_norm": 0.5913384023217347,
618
  "learning_rate": 5e-06,
619
- "loss": 0.5275,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.2811671087533156,
624
- "grad_norm": 0.5589992603235397,
625
  "learning_rate": 5e-06,
626
- "loss": 0.5324,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.3076923076923075,
631
- "grad_norm": 0.7009013738100872,
632
  "learning_rate": 5e-06,
633
- "loss": 0.5358,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.3342175066313,
638
- "grad_norm": 0.5192518963562719,
639
  "learning_rate": 5e-06,
640
- "loss": 0.5397,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.3607427055702916,
645
- "grad_norm": 0.5673317232053819,
646
  "learning_rate": 5e-06,
647
- "loss": 0.5441,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.387267904509284,
652
- "grad_norm": 0.6565705800504016,
653
  "learning_rate": 5e-06,
654
- "loss": 0.5376,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.413793103448276,
659
- "grad_norm": 0.6497220724543952,
660
  "learning_rate": 5e-06,
661
- "loss": 0.5309,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.4403183023872677,
666
- "grad_norm": 0.5907005582908587,
667
  "learning_rate": 5e-06,
668
- "loss": 0.535,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.46684350132626,
673
- "grad_norm": 0.6229830599060672,
674
  "learning_rate": 5e-06,
675
- "loss": 0.5497,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.493368700265252,
680
- "grad_norm": 0.5474753923784902,
681
  "learning_rate": 5e-06,
682
- "loss": 0.5409,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.519893899204244,
687
- "grad_norm": 0.5472523872985887,
688
  "learning_rate": 5e-06,
689
- "loss": 0.536,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.546419098143236,
694
- "grad_norm": 0.9882268568658281,
695
  "learning_rate": 5e-06,
696
- "loss": 0.5375,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.5729442970822283,
701
- "grad_norm": 0.6076358216087694,
702
  "learning_rate": 5e-06,
703
- "loss": 0.5417,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.59946949602122,
708
- "grad_norm": 0.6233857713542503,
709
  "learning_rate": 5e-06,
710
- "loss": 0.543,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.6259946949602124,
715
- "grad_norm": 0.662156931341361,
716
  "learning_rate": 5e-06,
717
- "loss": 0.5437,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.6525198938992043,
722
- "grad_norm": 0.520667113721229,
723
  "learning_rate": 5e-06,
724
- "loss": 0.5392,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.679045092838196,
729
- "grad_norm": 0.5971607456495881,
730
  "learning_rate": 5e-06,
731
- "loss": 0.5357,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.7055702917771884,
736
- "grad_norm": 0.5939890330376003,
737
  "learning_rate": 5e-06,
738
- "loss": 0.5371,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.7320954907161803,
743
- "grad_norm": 0.5773060764723967,
744
  "learning_rate": 5e-06,
745
- "loss": 0.5359,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.7586206896551726,
750
- "grad_norm": 0.5827565949490094,
751
  "learning_rate": 5e-06,
752
- "loss": 0.5431,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.7851458885941645,
757
- "grad_norm": 0.6314529885890559,
758
  "learning_rate": 5e-06,
759
- "loss": 0.5444,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.8116710875331563,
764
- "grad_norm": 0.5422009039427094,
765
  "learning_rate": 5e-06,
766
- "loss": 0.54,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.8381962864721486,
771
- "grad_norm": 0.6028981522236778,
772
  "learning_rate": 5e-06,
773
- "loss": 0.5433,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.8647214854111405,
778
- "grad_norm": 0.5970004720986751,
779
  "learning_rate": 5e-06,
780
- "loss": 0.5461,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.8912466843501328,
785
- "grad_norm": 0.574798524733574,
786
  "learning_rate": 5e-06,
787
- "loss": 0.5397,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.9177718832891246,
792
- "grad_norm": 0.5790776127397061,
793
  "learning_rate": 5e-06,
794
- "loss": 0.5404,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.9442970822281165,
799
- "grad_norm": 0.5249016599382924,
800
  "learning_rate": 5e-06,
801
- "loss": 0.5461,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.970822281167109,
806
- "grad_norm": 0.6578189789009287,
807
  "learning_rate": 5e-06,
808
- "loss": 0.5387,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.9973474801061006,
813
- "grad_norm": 0.5226173945421619,
814
  "learning_rate": 5e-06,
815
- "loss": 0.5391,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 3.0,
820
- "eval_loss": 0.6164913177490234,
821
- "eval_runtime": 204.5208,
822
- "eval_samples_per_second": 49.672,
823
- "eval_steps_per_second": 0.391,
824
  "step": 1131
825
  },
826
  {
827
  "epoch": 3.0,
828
  "step": 1131,
829
  "total_flos": 1894048365281280.0,
830
- "train_loss": 0.5957063028603713,
831
- "train_runtime": 34171.7918,
832
- "train_samples_per_second": 16.944,
833
  "train_steps_per_second": 0.033
834
  }
835
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.026525198938992044,
13
+ "grad_norm": 2.780399544451733,
14
  "learning_rate": 5e-06,
15
+ "loss": 0.8608,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.05305039787798409,
20
+ "grad_norm": 1.606321904164578,
21
  "learning_rate": 5e-06,
22
+ "loss": 0.7691,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.07957559681697612,
27
+ "grad_norm": 1.4733236101897926,
28
  "learning_rate": 5e-06,
29
+ "loss": 0.731,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.10610079575596817,
34
+ "grad_norm": 1.0466130239794242,
35
  "learning_rate": 5e-06,
36
+ "loss": 0.7107,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13262599469496023,
41
+ "grad_norm": 0.9068095404759675,
42
  "learning_rate": 5e-06,
43
+ "loss": 0.6977,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.15915119363395225,
48
+ "grad_norm": 0.9537695534962456,
49
  "learning_rate": 5e-06,
50
+ "loss": 0.6882,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.1856763925729443,
55
+ "grad_norm": 0.6703381757321946,
56
  "learning_rate": 5e-06,
57
+ "loss": 0.6835,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.21220159151193635,
62
+ "grad_norm": 1.00882019650449,
63
  "learning_rate": 5e-06,
64
+ "loss": 0.6724,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.23872679045092837,
69
+ "grad_norm": 1.0611084342807717,
70
  "learning_rate": 5e-06,
71
+ "loss": 0.6764,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.26525198938992045,
76
+ "grad_norm": 0.8654567865290824,
77
  "learning_rate": 5e-06,
78
+ "loss": 0.663,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.2917771883289125,
83
+ "grad_norm": 0.8547675860872707,
84
  "learning_rate": 5e-06,
85
+ "loss": 0.6602,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.3183023872679045,
90
+ "grad_norm": 0.6764470147171867,
91
  "learning_rate": 5e-06,
92
+ "loss": 0.6511,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3448275862068966,
97
+ "grad_norm": 0.4996824578122701,
98
  "learning_rate": 5e-06,
99
+ "loss": 0.649,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.3713527851458886,
104
+ "grad_norm": 0.5555661729991889,
105
  "learning_rate": 5e-06,
106
+ "loss": 0.6499,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.3978779840848806,
111
+ "grad_norm": 0.662354493585409,
112
  "learning_rate": 5e-06,
113
+ "loss": 0.6428,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.4244031830238727,
118
+ "grad_norm": 0.47582415706089126,
119
  "learning_rate": 5e-06,
120
+ "loss": 0.6527,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.4509283819628647,
125
+ "grad_norm": 1.3542813303379895,
126
  "learning_rate": 5e-06,
127
+ "loss": 0.6423,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.47745358090185674,
132
+ "grad_norm": 0.5202115494812097,
133
  "learning_rate": 5e-06,
134
+ "loss": 0.6379,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.5039787798408488,
139
+ "grad_norm": 0.5270125702790065,
140
  "learning_rate": 5e-06,
141
+ "loss": 0.6523,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.5305039787798409,
146
+ "grad_norm": 0.4787233056340413,
147
  "learning_rate": 5e-06,
148
+ "loss": 0.6375,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.5570291777188329,
153
+ "grad_norm": 0.5279042698103554,
154
  "learning_rate": 5e-06,
155
+ "loss": 0.643,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.583554376657825,
160
+ "grad_norm": 0.7066879723483964,
161
  "learning_rate": 5e-06,
162
+ "loss": 0.6424,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.610079575596817,
167
+ "grad_norm": 0.7948472968894127,
168
  "learning_rate": 5e-06,
169
+ "loss": 0.6341,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.636604774535809,
174
+ "grad_norm": 0.5390839840399401,
175
  "learning_rate": 5e-06,
176
+ "loss": 0.64,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.6631299734748011,
181
+ "grad_norm": 0.6650275203652714,
182
  "learning_rate": 5e-06,
183
+ "loss": 0.6325,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.6896551724137931,
188
+ "grad_norm": 0.5941647978617659,
189
  "learning_rate": 5e-06,
190
+ "loss": 0.6373,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.7161803713527851,
195
+ "grad_norm": 0.5582608177644663,
196
  "learning_rate": 5e-06,
197
+ "loss": 0.6385,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.7427055702917772,
202
+ "grad_norm": 0.5805453901667452,
203
  "learning_rate": 5e-06,
204
+ "loss": 0.638,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.7692307692307693,
209
+ "grad_norm": 0.4514952002612208,
210
  "learning_rate": 5e-06,
211
+ "loss": 0.6321,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.7957559681697612,
216
+ "grad_norm": 0.5044213856224046,
217
  "learning_rate": 5e-06,
218
+ "loss": 0.6355,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.8222811671087533,
223
+ "grad_norm": 0.5268464944312369,
224
  "learning_rate": 5e-06,
225
+ "loss": 0.6374,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.8488063660477454,
230
+ "grad_norm": 0.47822319615489534,
231
  "learning_rate": 5e-06,
232
+ "loss": 0.6409,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.8753315649867374,
237
+ "grad_norm": 0.6797410044487332,
238
  "learning_rate": 5e-06,
239
+ "loss": 0.6278,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.9018567639257294,
244
+ "grad_norm": 0.5170899802801728,
245
  "learning_rate": 5e-06,
246
+ "loss": 0.6318,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.9283819628647215,
251
+ "grad_norm": 0.6765922367059989,
252
  "learning_rate": 5e-06,
253
+ "loss": 0.6314,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.9549071618037135,
258
+ "grad_norm": 0.6459795123201603,
259
  "learning_rate": 5e-06,
260
+ "loss": 0.6274,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.9814323607427056,
265
+ "grad_norm": 0.44017798959016546,
266
  "learning_rate": 5e-06,
267
+ "loss": 0.6282,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 1.0,
272
+ "eval_loss": 0.622368574142456,
273
+ "eval_runtime": 203.4748,
274
+ "eval_samples_per_second": 49.928,
275
+ "eval_steps_per_second": 0.393,
276
  "step": 377
277
  },
278
  {
279
  "epoch": 1.0079575596816976,
280
+ "grad_norm": 0.8095066775828493,
281
  "learning_rate": 5e-06,
282
+ "loss": 0.6139,
283
  "step": 380
284
  },
285
  {
286
  "epoch": 1.0344827586206897,
287
+ "grad_norm": 0.5482836222129767,
288
  "learning_rate": 5e-06,
289
+ "loss": 0.5899,
290
  "step": 390
291
  },
292
  {
293
  "epoch": 1.0610079575596818,
294
+ "grad_norm": 0.5390710199271481,
295
  "learning_rate": 5e-06,
296
+ "loss": 0.5918,
297
  "step": 400
298
  },
299
  {
300
  "epoch": 1.0875331564986737,
301
+ "grad_norm": 0.5745480856709776,
302
  "learning_rate": 5e-06,
303
+ "loss": 0.5836,
304
  "step": 410
305
  },
306
  {
307
  "epoch": 1.1140583554376657,
308
+ "grad_norm": 0.5786473175674526,
309
  "learning_rate": 5e-06,
310
+ "loss": 0.5942,
311
  "step": 420
312
  },
313
  {
314
  "epoch": 1.1405835543766578,
315
+ "grad_norm": 0.4754533900180701,
316
  "learning_rate": 5e-06,
317
+ "loss": 0.5894,
318
  "step": 430
319
  },
320
  {
321
  "epoch": 1.16710875331565,
322
+ "grad_norm": 0.626662902645802,
323
  "learning_rate": 5e-06,
324
+ "loss": 0.5792,
325
  "step": 440
326
  },
327
  {
328
  "epoch": 1.193633952254642,
329
+ "grad_norm": 0.4520996771126915,
330
  "learning_rate": 5e-06,
331
+ "loss": 0.5877,
332
  "step": 450
333
  },
334
  {
335
  "epoch": 1.2201591511936338,
336
+ "grad_norm": 0.48945343637681654,
337
  "learning_rate": 5e-06,
338
+ "loss": 0.5859,
339
  "step": 460
340
  },
341
  {
342
  "epoch": 1.246684350132626,
343
+ "grad_norm": 0.464325446341802,
344
  "learning_rate": 5e-06,
345
+ "loss": 0.5915,
346
  "step": 470
347
  },
348
  {
349
  "epoch": 1.273209549071618,
350
+ "grad_norm": 0.46045963914114574,
351
  "learning_rate": 5e-06,
352
+ "loss": 0.5807,
353
  "step": 480
354
  },
355
  {
356
  "epoch": 1.29973474801061,
357
+ "grad_norm": 0.49722516653390847,
358
  "learning_rate": 5e-06,
359
+ "loss": 0.5816,
360
  "step": 490
361
  },
362
  {
363
  "epoch": 1.3262599469496021,
364
+ "grad_norm": 0.5455802289844164,
365
  "learning_rate": 5e-06,
366
+ "loss": 0.5887,
367
  "step": 500
368
  },
369
  {
370
  "epoch": 1.3527851458885942,
371
+ "grad_norm": 0.4427674181447264,
372
  "learning_rate": 5e-06,
373
+ "loss": 0.5875,
374
  "step": 510
375
  },
376
  {
377
  "epoch": 1.3793103448275863,
378
+ "grad_norm": 0.6045736193729062,
379
  "learning_rate": 5e-06,
380
+ "loss": 0.5891,
381
  "step": 520
382
  },
383
  {
384
  "epoch": 1.4058355437665782,
385
+ "grad_norm": 0.45208550166771494,
386
  "learning_rate": 5e-06,
387
+ "loss": 0.5876,
388
  "step": 530
389
  },
390
  {
391
  "epoch": 1.4323607427055702,
392
+ "grad_norm": 0.5217117204135557,
393
  "learning_rate": 5e-06,
394
+ "loss": 0.5831,
395
  "step": 540
396
  },
397
  {
398
  "epoch": 1.4588859416445623,
399
+ "grad_norm": 0.5393363462032307,
400
  "learning_rate": 5e-06,
401
+ "loss": 0.579,
402
  "step": 550
403
  },
404
  {
405
  "epoch": 1.4854111405835544,
406
+ "grad_norm": 0.6515785188868304,
407
  "learning_rate": 5e-06,
408
+ "loss": 0.5867,
409
  "step": 560
410
  },
411
  {
412
  "epoch": 1.5119363395225465,
413
+ "grad_norm": 0.6394000137643647,
414
  "learning_rate": 5e-06,
415
+ "loss": 0.5939,
416
  "step": 570
417
  },
418
  {
419
  "epoch": 1.5384615384615383,
420
+ "grad_norm": 0.5074512553300523,
421
  "learning_rate": 5e-06,
422
+ "loss": 0.5839,
423
  "step": 580
424
  },
425
  {
426
  "epoch": 1.5649867374005306,
427
+ "grad_norm": 0.5172554765605915,
428
  "learning_rate": 5e-06,
429
+ "loss": 0.5886,
430
  "step": 590
431
  },
432
  {
433
  "epoch": 1.5915119363395225,
434
+ "grad_norm": 0.4946796812261883,
435
  "learning_rate": 5e-06,
436
+ "loss": 0.5837,
437
  "step": 600
438
  },
439
  {
440
  "epoch": 1.6180371352785146,
441
+ "grad_norm": 0.45275057698879645,
442
  "learning_rate": 5e-06,
443
+ "loss": 0.5875,
444
  "step": 610
445
  },
446
  {
447
  "epoch": 1.6445623342175066,
448
+ "grad_norm": 0.6141728231437198,
449
  "learning_rate": 5e-06,
450
+ "loss": 0.5863,
451
  "step": 620
452
  },
453
  {
454
  "epoch": 1.6710875331564987,
455
+ "grad_norm": 0.5764229361365383,
456
  "learning_rate": 5e-06,
457
+ "loss": 0.5818,
458
  "step": 630
459
  },
460
  {
461
  "epoch": 1.6976127320954908,
462
+ "grad_norm": 0.43948836202023195,
463
  "learning_rate": 5e-06,
464
+ "loss": 0.5854,
465
  "step": 640
466
  },
467
  {
468
  "epoch": 1.7241379310344827,
469
+ "grad_norm": 0.6212110766567985,
470
  "learning_rate": 5e-06,
471
+ "loss": 0.5899,
472
  "step": 650
473
  },
474
  {
475
  "epoch": 1.750663129973475,
476
+ "grad_norm": 0.5230421236325133,
477
  "learning_rate": 5e-06,
478
+ "loss": 0.5809,
479
  "step": 660
480
  },
481
  {
482
  "epoch": 1.7771883289124668,
483
+ "grad_norm": 0.42596513265710756,
484
  "learning_rate": 5e-06,
485
+ "loss": 0.581,
486
  "step": 670
487
  },
488
  {
489
  "epoch": 1.8037135278514589,
490
+ "grad_norm": 0.5019437503087736,
491
  "learning_rate": 5e-06,
492
+ "loss": 0.5811,
493
  "step": 680
494
  },
495
  {
496
  "epoch": 1.830238726790451,
497
+ "grad_norm": 0.5695097062913548,
498
  "learning_rate": 5e-06,
499
+ "loss": 0.5858,
500
  "step": 690
501
  },
502
  {
503
  "epoch": 1.8567639257294428,
504
+ "grad_norm": 0.48071601140752834,
505
  "learning_rate": 5e-06,
506
+ "loss": 0.584,
507
  "step": 700
508
  },
509
  {
510
  "epoch": 1.8832891246684351,
511
+ "grad_norm": 0.6322423860046313,
512
  "learning_rate": 5e-06,
513
+ "loss": 0.5901,
514
  "step": 710
515
  },
516
  {
517
  "epoch": 1.909814323607427,
518
+ "grad_norm": 0.504386977138495,
519
  "learning_rate": 5e-06,
520
+ "loss": 0.5878,
521
  "step": 720
522
  },
523
  {
524
  "epoch": 1.936339522546419,
525
+ "grad_norm": 0.5132407169571725,
526
  "learning_rate": 5e-06,
527
+ "loss": 0.5939,
528
  "step": 730
529
  },
530
  {
531
  "epoch": 1.9628647214854111,
532
+ "grad_norm": 0.49747549542012004,
533
  "learning_rate": 5e-06,
534
+ "loss": 0.5824,
535
  "step": 740
536
  },
537
  {
538
  "epoch": 1.9893899204244032,
539
+ "grad_norm": 0.6053057729936807,
540
  "learning_rate": 5e-06,
541
+ "loss": 0.5895,
542
  "step": 750
543
  },
544
  {
545
  "epoch": 2.0,
546
+ "eval_loss": 0.6123443841934204,
547
+ "eval_runtime": 203.0015,
548
+ "eval_samples_per_second": 50.044,
549
+ "eval_steps_per_second": 0.394,
550
  "step": 754
551
  },
552
  {
553
  "epoch": 2.0159151193633953,
554
+ "grad_norm": 0.6355273791432589,
555
  "learning_rate": 5e-06,
556
+ "loss": 0.5558,
557
  "step": 760
558
  },
559
  {
560
  "epoch": 2.042440318302387,
561
+ "grad_norm": 0.5643951304254625,
562
  "learning_rate": 5e-06,
563
+ "loss": 0.5439,
564
  "step": 770
565
  },
566
  {
567
  "epoch": 2.0689655172413794,
568
+ "grad_norm": 0.4960175423111283,
569
  "learning_rate": 5e-06,
570
+ "loss": 0.5434,
571
  "step": 780
572
  },
573
  {
574
  "epoch": 2.0954907161803713,
575
+ "grad_norm": 0.5284981337718996,
576
  "learning_rate": 5e-06,
577
+ "loss": 0.5441,
578
  "step": 790
579
  },
580
  {
581
  "epoch": 2.1220159151193636,
582
+ "grad_norm": 0.5187766913446101,
583
  "learning_rate": 5e-06,
584
+ "loss": 0.5457,
585
  "step": 800
586
  },
587
  {
588
  "epoch": 2.1485411140583555,
589
+ "grad_norm": 0.4676393193911655,
590
  "learning_rate": 5e-06,
591
+ "loss": 0.5453,
592
  "step": 810
593
  },
594
  {
595
  "epoch": 2.1750663129973473,
596
+ "grad_norm": 0.447245777123748,
597
  "learning_rate": 5e-06,
598
+ "loss": 0.5454,
599
  "step": 820
600
  },
601
  {
602
  "epoch": 2.2015915119363396,
603
+ "grad_norm": 0.4803013114072548,
604
  "learning_rate": 5e-06,
605
+ "loss": 0.5427,
606
  "step": 830
607
  },
608
  {
609
  "epoch": 2.2281167108753315,
610
+ "grad_norm": 0.6305967900882212,
611
  "learning_rate": 5e-06,
612
+ "loss": 0.5512,
613
  "step": 840
614
  },
615
  {
616
  "epoch": 2.2546419098143238,
617
+ "grad_norm": 0.4660714087302693,
618
  "learning_rate": 5e-06,
619
+ "loss": 0.5337,
620
  "step": 850
621
  },
622
  {
623
  "epoch": 2.2811671087533156,
624
+ "grad_norm": 0.5499183474925715,
625
  "learning_rate": 5e-06,
626
+ "loss": 0.539,
627
  "step": 860
628
  },
629
  {
630
  "epoch": 2.3076923076923075,
631
+ "grad_norm": 0.6029739750306148,
632
  "learning_rate": 5e-06,
633
+ "loss": 0.5423,
634
  "step": 870
635
  },
636
  {
637
  "epoch": 2.3342175066313,
638
+ "grad_norm": 0.5459822127048733,
639
  "learning_rate": 5e-06,
640
+ "loss": 0.5462,
641
  "step": 880
642
  },
643
  {
644
  "epoch": 2.3607427055702916,
645
+ "grad_norm": 0.4843657467722299,
646
  "learning_rate": 5e-06,
647
+ "loss": 0.5504,
648
  "step": 890
649
  },
650
  {
651
  "epoch": 2.387267904509284,
652
+ "grad_norm": 0.6497699445469315,
653
  "learning_rate": 5e-06,
654
+ "loss": 0.5436,
655
  "step": 900
656
  },
657
  {
658
  "epoch": 2.413793103448276,
659
+ "grad_norm": 0.4716521124138746,
660
  "learning_rate": 5e-06,
661
+ "loss": 0.537,
662
  "step": 910
663
  },
664
  {
665
  "epoch": 2.4403183023872677,
666
+ "grad_norm": 0.5180881653993036,
667
  "learning_rate": 5e-06,
668
+ "loss": 0.5412,
669
  "step": 920
670
  },
671
  {
672
  "epoch": 2.46684350132626,
673
+ "grad_norm": 0.6139310358311121,
674
  "learning_rate": 5e-06,
675
+ "loss": 0.5563,
676
  "step": 930
677
  },
678
  {
679
  "epoch": 2.493368700265252,
680
+ "grad_norm": 0.5199070076942287,
681
  "learning_rate": 5e-06,
682
+ "loss": 0.547,
683
  "step": 940
684
  },
685
  {
686
  "epoch": 2.519893899204244,
687
+ "grad_norm": 0.47708958463613943,
688
  "learning_rate": 5e-06,
689
+ "loss": 0.5419,
690
  "step": 950
691
  },
692
  {
693
  "epoch": 2.546419098143236,
694
+ "grad_norm": 0.6389465583810601,
695
  "learning_rate": 5e-06,
696
+ "loss": 0.5433,
697
  "step": 960
698
  },
699
  {
700
  "epoch": 2.5729442970822283,
701
+ "grad_norm": 0.46642283536695006,
702
  "learning_rate": 5e-06,
703
+ "loss": 0.5476,
704
  "step": 970
705
  },
706
  {
707
  "epoch": 2.59946949602122,
708
+ "grad_norm": 0.5177895417014805,
709
  "learning_rate": 5e-06,
710
+ "loss": 0.549,
711
  "step": 980
712
  },
713
  {
714
  "epoch": 2.6259946949602124,
715
+ "grad_norm": 0.5889398648766363,
716
  "learning_rate": 5e-06,
717
+ "loss": 0.5497,
718
  "step": 990
719
  },
720
  {
721
  "epoch": 2.6525198938992043,
722
+ "grad_norm": 0.49922378313664745,
723
  "learning_rate": 5e-06,
724
+ "loss": 0.5451,
725
  "step": 1000
726
  },
727
  {
728
  "epoch": 2.679045092838196,
729
+ "grad_norm": 0.5386720121866165,
730
  "learning_rate": 5e-06,
731
+ "loss": 0.5415,
732
  "step": 1010
733
  },
734
  {
735
  "epoch": 2.7055702917771884,
736
+ "grad_norm": 0.500783203019607,
737
  "learning_rate": 5e-06,
738
+ "loss": 0.5428,
739
  "step": 1020
740
  },
741
  {
742
  "epoch": 2.7320954907161803,
743
+ "grad_norm": 0.4931032038157372,
744
  "learning_rate": 5e-06,
745
+ "loss": 0.5416,
746
  "step": 1030
747
  },
748
  {
749
  "epoch": 2.7586206896551726,
750
+ "grad_norm": 0.5391229756876829,
751
  "learning_rate": 5e-06,
752
+ "loss": 0.5487,
753
  "step": 1040
754
  },
755
  {
756
  "epoch": 2.7851458885941645,
757
+ "grad_norm": 0.5489614966946841,
758
  "learning_rate": 5e-06,
759
+ "loss": 0.5499,
760
  "step": 1050
761
  },
762
  {
763
  "epoch": 2.8116710875331563,
764
+ "grad_norm": 0.4779229991518947,
765
  "learning_rate": 5e-06,
766
+ "loss": 0.5457,
767
  "step": 1060
768
  },
769
  {
770
  "epoch": 2.8381962864721486,
771
+ "grad_norm": 0.52170064853256,
772
  "learning_rate": 5e-06,
773
+ "loss": 0.549,
774
  "step": 1070
775
  },
776
  {
777
  "epoch": 2.8647214854111405,
778
+ "grad_norm": 0.5091309568444623,
779
  "learning_rate": 5e-06,
780
+ "loss": 0.552,
781
  "step": 1080
782
  },
783
  {
784
  "epoch": 2.8912466843501328,
785
+ "grad_norm": 0.45335922455307415,
786
  "learning_rate": 5e-06,
787
+ "loss": 0.5453,
788
  "step": 1090
789
  },
790
  {
791
  "epoch": 2.9177718832891246,
792
+ "grad_norm": 0.565478737240957,
793
  "learning_rate": 5e-06,
794
+ "loss": 0.546,
795
  "step": 1100
796
  },
797
  {
798
  "epoch": 2.9442970822281165,
799
+ "grad_norm": 0.5673285594805738,
800
  "learning_rate": 5e-06,
801
+ "loss": 0.5517,
802
  "step": 1110
803
  },
804
  {
805
  "epoch": 2.970822281167109,
806
+ "grad_norm": 0.503611299440423,
807
  "learning_rate": 5e-06,
808
+ "loss": 0.5444,
809
  "step": 1120
810
  },
811
  {
812
  "epoch": 2.9973474801061006,
813
+ "grad_norm": 0.5499097253524401,
814
  "learning_rate": 5e-06,
815
+ "loss": 0.5447,
816
  "step": 1130
817
  },
818
  {
819
  "epoch": 3.0,
820
+ "eval_loss": 0.6133805513381958,
821
+ "eval_runtime": 203.8433,
822
+ "eval_samples_per_second": 49.837,
823
+ "eval_steps_per_second": 0.392,
824
  "step": 1131
825
  },
826
  {
827
  "epoch": 3.0,
828
  "step": 1131,
829
  "total_flos": 1894048365281280.0,
830
+ "train_loss": 0.5970700682000076,
831
+ "train_runtime": 33949.4437,
832
+ "train_samples_per_second": 17.055,
833
  "train_steps_per_second": 0.033
834
  }
835
  ],
training_eval_loss.png CHANGED
training_loss.png CHANGED