amuvarma commited on
Commit
5ff9fe9
·
verified ·
1 Parent(s): e50e4bf

Update model

Browse files
Files changed (1) hide show
  1. trainer_state.json +993 -993
trainer_state.json CHANGED
@@ -10,2234 +10,2234 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.004032258064516129,
13
- "grad_norm": 107.96044921875,
14
  "learning_rate": 4.999799414013322e-06,
15
- "loss": 3.1763,
16
- "mean_token_accuracy": 0.48242291808128357,
17
- "num_tokens": 1851.0,
18
  "step": 1
19
  },
20
  {
21
  "epoch": 0.008064516129032258,
22
- "grad_norm": 44.63172149658203,
23
  "learning_rate": 4.999197688241076e-06,
24
- "loss": 2.6469,
25
- "mean_token_accuracy": 0.4947773516178131,
26
- "num_tokens": 3672.0,
27
  "step": 2
28
  },
29
  {
30
  "epoch": 0.012096774193548387,
31
- "grad_norm": 34.54632568359375,
32
  "learning_rate": 4.998194919241471e-06,
33
- "loss": 1.9867,
34
- "mean_token_accuracy": 0.5458515286445618,
35
- "num_tokens": 5506.0,
36
  "step": 3
37
  },
38
  {
39
  "epoch": 0.016129032258064516,
40
- "grad_norm": 12.74654769897461,
41
  "learning_rate": 4.996791267927632e-06,
42
- "loss": 1.7959,
43
- "mean_token_accuracy": 0.5700883269309998,
44
- "num_tokens": 7320.0,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.020161290322580645,
49
- "grad_norm": 24.671205520629883,
50
  "learning_rate": 4.994986959541788e-06,
51
- "loss": 1.946,
52
- "mean_token_accuracy": 0.5244379043579102,
53
- "num_tokens": 9368.0,
54
  "step": 5
55
  },
56
  {
57
  "epoch": 0.024193548387096774,
58
- "grad_norm": 8.909770011901855,
59
  "learning_rate": 4.9927822836191185e-06,
60
- "loss": 1.7736,
61
- "mean_token_accuracy": 0.5628244876861572,
62
- "num_tokens": 11296.0,
63
  "step": 6
64
  },
65
  {
66
  "epoch": 0.028225806451612902,
67
- "grad_norm": 8.721368789672852,
68
  "learning_rate": 4.990177593941303e-06,
69
- "loss": 1.7561,
70
- "mean_token_accuracy": 0.5577777624130249,
71
- "num_tokens": 13098.0,
72
  "step": 7
73
  },
74
  {
75
  "epoch": 0.03225806451612903,
76
- "grad_norm": 8.095498085021973,
77
  "learning_rate": 4.987173308479738e-06,
78
- "loss": 1.6833,
79
- "mean_token_accuracy": 0.5811339020729065,
80
- "num_tokens": 15146.0,
81
  "step": 8
82
  },
83
  {
84
  "epoch": 0.036290322580645164,
85
- "grad_norm": 7.460132598876953,
86
  "learning_rate": 4.9837699093284765e-06,
87
- "loss": 1.6155,
88
- "mean_token_accuracy": 0.5812332630157471,
89
- "num_tokens": 17013.0,
90
  "step": 9
91
  },
92
  {
93
  "epoch": 0.04032258064516129,
94
- "grad_norm": 11.340461730957031,
95
  "learning_rate": 4.9799679426268575e-06,
96
- "loss": 1.7714,
97
- "mean_token_accuracy": 0.5604277849197388,
98
- "num_tokens": 18885.0,
99
  "step": 10
100
  },
101
  {
102
  "epoch": 0.04435483870967742,
103
- "grad_norm": 7.0455827713012695,
104
  "learning_rate": 4.975768018471877e-06,
105
- "loss": 1.6119,
106
- "mean_token_accuracy": 0.5844630002975464,
107
- "num_tokens": 20805.0,
108
  "step": 11
109
  },
110
  {
111
  "epoch": 0.04838709677419355,
112
- "grad_norm": 10.100363731384277,
113
  "learning_rate": 4.971170810820279e-06,
114
- "loss": 1.6173,
115
- "mean_token_accuracy": 0.5839415788650513,
116
- "num_tokens": 22451.0,
117
  "step": 12
118
  },
119
  {
120
  "epoch": 0.05241935483870968,
121
- "grad_norm": 7.86726713180542,
122
  "learning_rate": 4.966177057380409e-06,
123
- "loss": 1.6026,
124
- "mean_token_accuracy": 0.5944099426269531,
125
- "num_tokens": 24063.0,
126
  "step": 13
127
  },
128
  {
129
  "epoch": 0.056451612903225805,
130
- "grad_norm": 7.701871871948242,
131
  "learning_rate": 4.960787559493836e-06,
132
- "loss": 1.544,
133
- "mean_token_accuracy": 0.5955132842063904,
134
- "num_tokens": 25536.0,
135
  "step": 14
136
  },
137
  {
138
  "epoch": 0.06048387096774194,
139
- "grad_norm": 6.793758869171143,
140
  "learning_rate": 4.955003182006761e-06,
141
- "loss": 1.5746,
142
- "mean_token_accuracy": 0.5850556492805481,
143
- "num_tokens": 27425.0,
144
  "step": 15
145
  },
146
  {
147
  "epoch": 0.06451612903225806,
148
- "grad_norm": 7.042947292327881,
149
  "learning_rate": 4.948824853131237e-06,
150
- "loss": 1.6941,
151
- "mean_token_accuracy": 0.5564345717430115,
152
- "num_tokens": 29323.0,
153
  "step": 16
154
  },
155
  {
156
  "epoch": 0.06854838709677419,
157
- "grad_norm": 6.403198719024658,
158
  "learning_rate": 4.942253564296217e-06,
159
- "loss": 1.491,
160
- "mean_token_accuracy": 0.6023738980293274,
161
- "num_tokens": 31010.0,
162
  "step": 17
163
  },
164
  {
165
  "epoch": 0.07258064516129033,
166
- "grad_norm": 6.8317694664001465,
167
  "learning_rate": 4.935290369988468e-06,
168
- "loss": 1.6276,
169
- "mean_token_accuracy": 0.5830005407333374,
170
- "num_tokens": 32765.0,
171
  "step": 18
172
  },
173
  {
174
  "epoch": 0.07661290322580645,
175
- "grad_norm": 6.971025466918945,
176
  "learning_rate": 4.927936387583348e-06,
177
- "loss": 1.6332,
178
- "mean_token_accuracy": 0.5976441502571106,
179
- "num_tokens": 34380.0,
180
  "step": 19
181
  },
182
  {
183
  "epoch": 0.08064516129032258,
184
- "grad_norm": 6.04213809967041,
185
  "learning_rate": 4.920192797165511e-06,
186
- "loss": 1.6032,
187
- "mean_token_accuracy": 0.5835776925086975,
188
- "num_tokens": 36428.0,
189
  "step": 20
190
  },
191
  {
192
  "epoch": 0.0846774193548387,
193
- "grad_norm": 6.803706169128418,
194
  "learning_rate": 4.912060841339536e-06,
195
- "loss": 1.5274,
196
- "mean_token_accuracy": 0.607098400592804,
197
- "num_tokens": 38036.0,
198
  "step": 21
199
  },
200
  {
201
  "epoch": 0.08870967741935484,
202
- "grad_norm": 7.13438606262207,
203
  "learning_rate": 4.9035418250305314e-06,
204
- "loss": 1.616,
205
- "mean_token_accuracy": 0.576968252658844,
206
- "num_tokens": 39740.0,
207
  "step": 22
208
  },
209
  {
210
  "epoch": 0.09274193548387097,
211
- "grad_norm": 6.349973678588867,
212
  "learning_rate": 4.894637115274728e-06,
213
- "loss": 1.5672,
214
- "mean_token_accuracy": 0.592720091342926,
215
- "num_tokens": 41775.0,
216
  "step": 23
217
  },
218
  {
219
  "epoch": 0.0967741935483871,
220
- "grad_norm": 6.1270012855529785,
221
  "learning_rate": 4.8853481410001225e-06,
222
- "loss": 1.611,
223
- "mean_token_accuracy": 0.5788690447807312,
224
- "num_tokens": 43793.0,
225
  "step": 24
226
  },
227
  {
228
  "epoch": 0.10080645161290322,
229
- "grad_norm": 6.584856986999512,
230
  "learning_rate": 4.875676392797169e-06,
231
- "loss": 1.507,
232
- "mean_token_accuracy": 0.6083052754402161,
233
- "num_tokens": 45577.0,
234
  "step": 25
235
  },
236
  {
237
  "epoch": 0.10483870967741936,
238
- "grad_norm": 6.3819260597229,
239
  "learning_rate": 4.865623422679593e-06,
240
- "loss": 1.5968,
241
- "mean_token_accuracy": 0.595588207244873,
242
- "num_tokens": 47483.0,
243
  "step": 26
244
  },
245
  {
246
  "epoch": 0.10887096774193548,
247
- "grad_norm": 7.157993793487549,
248
  "learning_rate": 4.855190843835338e-06,
249
- "loss": 1.4946,
250
- "mean_token_accuracy": 0.6049535870552063,
251
- "num_tokens": 49100.0,
252
  "step": 27
253
  },
254
  {
255
  "epoch": 0.11290322580645161,
256
- "grad_norm": 6.9199981689453125,
257
  "learning_rate": 4.844380330367701e-06,
258
- "loss": 1.4496,
259
- "mean_token_accuracy": 0.6191275119781494,
260
- "num_tokens": 50890.0,
261
  "step": 28
262
  },
263
  {
264
  "epoch": 0.11693548387096774,
265
- "grad_norm": 6.741199970245361,
266
  "learning_rate": 4.833193617026692e-06,
267
- "loss": 1.6398,
268
- "mean_token_accuracy": 0.5777652859687805,
269
- "num_tokens": 52673.0,
270
  "step": 29
271
  },
272
  {
273
  "epoch": 0.12096774193548387,
274
- "grad_norm": 6.519247531890869,
275
  "learning_rate": 4.821632498930656e-06,
276
- "loss": 1.5068,
277
- "mean_token_accuracy": 0.6147789359092712,
278
- "num_tokens": 54326.0,
279
  "step": 30
280
  },
281
  {
282
  "epoch": 0.125,
283
- "grad_norm": 6.327451705932617,
284
  "learning_rate": 4.809698831278217e-06,
285
- "loss": 1.6198,
286
- "mean_token_accuracy": 0.585949182510376,
287
- "num_tokens": 56335.0,
288
  "step": 31
289
  },
290
  {
291
  "epoch": 0.12903225806451613,
292
- "grad_norm": 6.036173343658447,
293
  "learning_rate": 4.797394529050577e-06,
294
- "loss": 1.5194,
295
- "mean_token_accuracy": 0.606656551361084,
296
- "num_tokens": 58320.0,
297
  "step": 32
298
  },
299
  {
300
  "epoch": 0.13306451612903225,
301
- "grad_norm": 7.737890720367432,
302
  "learning_rate": 4.784721566704217e-06,
303
- "loss": 1.6831,
304
- "mean_token_accuracy": 0.5820170044898987,
305
- "num_tokens": 59968.0,
306
  "step": 33
307
  },
308
  {
309
  "epoch": 0.13709677419354838,
310
- "grad_norm": 6.581600189208984,
311
  "learning_rate": 4.771681977854062e-06,
312
- "loss": 1.5672,
313
- "mean_token_accuracy": 0.592617928981781,
314
- "num_tokens": 61433.0,
315
  "step": 34
316
  },
317
  {
318
  "epoch": 0.14112903225806453,
319
- "grad_norm": 6.525332927703857,
320
  "learning_rate": 4.75827785494715e-06,
321
- "loss": 1.6133,
322
- "mean_token_accuracy": 0.5874263048171997,
323
- "num_tokens": 63471.0,
324
  "step": 35
325
  },
326
  {
327
  "epoch": 0.14516129032258066,
328
- "grad_norm": 6.5660905838012695,
329
  "learning_rate": 4.744511348926855e-06,
330
- "loss": 1.4583,
331
- "mean_token_accuracy": 0.6023869514465332,
332
- "num_tokens": 65065.0,
333
  "step": 36
334
  },
335
  {
336
  "epoch": 0.14919354838709678,
337
- "grad_norm": 5.877732753753662,
338
  "learning_rate": 4.730384668887731e-06,
339
- "loss": 1.5884,
340
- "mean_token_accuracy": 0.575619637966156,
341
- "num_tokens": 67044.0,
342
  "step": 37
343
  },
344
  {
345
  "epoch": 0.1532258064516129,
346
- "grad_norm": 6.293325901031494,
347
  "learning_rate": 4.715900081721021e-06,
348
- "loss": 1.5918,
349
- "mean_token_accuracy": 0.5884907841682434,
350
- "num_tokens": 68888.0,
351
  "step": 38
352
  },
353
  {
354
  "epoch": 0.15725806451612903,
355
- "grad_norm": 5.8657426834106445,
356
  "learning_rate": 4.7010599117508936e-06,
357
- "loss": 1.481,
358
- "mean_token_accuracy": 0.6016027331352234,
359
- "num_tokens": 70637.0,
360
  "step": 39
361
  },
362
  {
363
  "epoch": 0.16129032258064516,
364
- "grad_norm": 6.863683223724365,
365
  "learning_rate": 4.685866540361456e-06,
366
- "loss": 1.6415,
367
- "mean_token_accuracy": 0.5896656513214111,
368
- "num_tokens": 72284.0,
369
  "step": 40
370
  },
371
  {
372
  "epoch": 0.16532258064516128,
373
- "grad_norm": 6.902022838592529,
374
  "learning_rate": 4.670322405614621e-06,
375
- "loss": 1.496,
376
- "mean_token_accuracy": 0.6054931282997131,
377
- "num_tokens": 73888.0,
378
  "step": 41
379
  },
380
  {
381
  "epoch": 0.1693548387096774,
382
- "grad_norm": 5.99281120300293,
383
  "learning_rate": 4.654430001858874e-06,
384
- "loss": 1.514,
385
- "mean_token_accuracy": 0.6034939289093018,
386
- "num_tokens": 75779.0,
387
  "step": 42
388
  },
389
  {
390
  "epoch": 0.17338709677419356,
391
- "grad_norm": 6.133821487426758,
392
  "learning_rate": 4.638191879329005e-06,
393
- "loss": 1.5616,
394
- "mean_token_accuracy": 0.6049129962921143,
395
- "num_tokens": 77735.0,
396
  "step": 43
397
  },
398
  {
399
  "epoch": 0.1774193548387097,
400
- "grad_norm": 6.4369282722473145,
401
  "learning_rate": 4.621610643736878e-06,
402
- "loss": 1.4358,
403
- "mean_token_accuracy": 0.6190981268882751,
404
- "num_tokens": 79622.0,
405
  "step": 44
406
  },
407
  {
408
  "epoch": 0.1814516129032258,
409
- "grad_norm": 11.392876625061035,
410
  "learning_rate": 4.6046889558532925e-06,
411
- "loss": 1.2083,
412
- "mean_token_accuracy": 0.6125714182853699,
413
- "num_tokens": 80499.0,
414
  "step": 45
415
  },
416
  {
417
  "epoch": 0.18548387096774194,
418
- "grad_norm": 5.731788158416748,
419
  "learning_rate": 4.587429531081019e-06,
420
- "loss": 1.5584,
421
- "mean_token_accuracy": 0.5816226601600647,
422
- "num_tokens": 82547.0,
423
  "step": 46
424
  },
425
  {
426
  "epoch": 0.18951612903225806,
427
- "grad_norm": 6.6911821365356445,
428
  "learning_rate": 4.569835139019054e-06,
429
- "loss": 1.5732,
430
- "mean_token_accuracy": 0.6054380536079407,
431
- "num_tokens": 84204.0,
432
  "step": 47
433
  },
434
  {
435
  "epoch": 0.1935483870967742,
436
- "grad_norm": 5.956648349761963,
437
  "learning_rate": 4.551908603018191e-06,
438
- "loss": 1.4579,
439
- "mean_token_accuracy": 0.6021164059638977,
440
- "num_tokens": 86096.0,
441
  "step": 48
442
  },
443
  {
444
  "epoch": 0.1975806451612903,
445
- "grad_norm": 5.765907287597656,
446
  "learning_rate": 4.53365279972796e-06,
447
- "loss": 1.5831,
448
- "mean_token_accuracy": 0.5886736512184143,
449
- "num_tokens": 88111.0,
450
  "step": 49
451
  },
452
  {
453
  "epoch": 0.20161290322580644,
454
- "grad_norm": 5.812353134155273,
455
  "learning_rate": 4.515070658635013e-06,
456
- "loss": 1.5898,
457
- "mean_token_accuracy": 0.5925537347793579,
458
- "num_tokens": 90020.0,
459
  "step": 50
460
  },
461
  {
462
  "epoch": 0.2056451612903226,
463
- "grad_norm": 6.308623313903809,
464
  "learning_rate": 4.4961651615930344e-06,
465
- "loss": 1.5961,
466
- "mean_token_accuracy": 0.575691819190979,
467
- "num_tokens": 91865.0,
468
  "step": 51
469
  },
470
  {
471
  "epoch": 0.20967741935483872,
472
- "grad_norm": 5.844722270965576,
473
  "learning_rate": 4.476939342344246e-06,
474
- "loss": 1.5373,
475
- "mean_token_accuracy": 0.6010256409645081,
476
- "num_tokens": 93817.0,
477
  "step": 52
478
  },
479
  {
480
  "epoch": 0.21370967741935484,
481
- "grad_norm": 5.775147438049316,
482
  "learning_rate": 4.457396286032589e-06,
483
- "loss": 1.5615,
484
- "mean_token_accuracy": 0.5845070481300354,
485
- "num_tokens": 95807.0,
486
  "step": 53
487
  },
488
  {
489
  "epoch": 0.21774193548387097,
490
- "grad_norm": 6.662364959716797,
491
  "learning_rate": 4.437539128708647e-06,
492
- "loss": 1.7192,
493
- "mean_token_accuracy": 0.5652173757553101,
494
- "num_tokens": 97695.0,
495
  "step": 54
496
  },
497
  {
498
  "epoch": 0.2217741935483871,
499
- "grad_norm": 6.161769390106201,
500
  "learning_rate": 4.417371056826417e-06,
501
- "loss": 1.4881,
502
- "mean_token_accuracy": 0.6129223704338074,
503
- "num_tokens": 99384.0,
504
  "step": 55
505
  },
506
  {
507
  "epoch": 0.22580645161290322,
508
- "grad_norm": 5.920022487640381,
509
  "learning_rate": 4.396895306731978e-06,
510
- "loss": 1.5958,
511
- "mean_token_accuracy": 0.5787709355354309,
512
- "num_tokens": 101176.0,
513
  "step": 56
514
  },
515
  {
516
  "epoch": 0.22983870967741934,
517
- "grad_norm": 6.026185035705566,
518
  "learning_rate": 4.376115164144157e-06,
519
- "loss": 1.4987,
520
- "mean_token_accuracy": 0.610188901424408,
521
- "num_tokens": 102925.0,
522
  "step": 57
523
  },
524
  {
525
  "epoch": 0.23387096774193547,
526
- "grad_norm": 6.1018853187561035,
527
  "learning_rate": 4.355033963627277e-06,
528
- "loss": 1.4963,
529
- "mean_token_accuracy": 0.5976627469062805,
530
- "num_tokens": 104724.0,
531
  "step": 58
532
  },
533
  {
534
  "epoch": 0.23790322580645162,
535
- "grad_norm": 5.7301926612854,
536
  "learning_rate": 4.333655088056065e-06,
537
- "loss": 1.5099,
538
- "mean_token_accuracy": 0.5928535461425781,
539
- "num_tokens": 106713.0,
540
  "step": 59
541
  },
542
  {
543
  "epoch": 0.24193548387096775,
544
- "grad_norm": 6.505590915679932,
545
  "learning_rate": 4.3119819680728e-06,
546
- "loss": 1.5809,
547
- "mean_token_accuracy": 0.5960374474525452,
548
- "num_tokens": 108532.0,
549
  "step": 60
550
  },
551
  {
552
  "epoch": 0.24596774193548387,
553
- "grad_norm": 5.877953052520752,
554
  "learning_rate": 4.290018081536807e-06,
555
- "loss": 1.5485,
556
- "mean_token_accuracy": 0.5997865796089172,
557
- "num_tokens": 110408.0,
558
  "step": 61
559
  },
560
  {
561
  "epoch": 0.25,
562
- "grad_norm": 6.6848344802856445,
563
  "learning_rate": 4.267766952966369e-06,
564
- "loss": 1.6196,
565
- "mean_token_accuracy": 0.5770323872566223,
566
- "num_tokens": 112169.0,
567
  "step": 62
568
  },
569
  {
570
  "epoch": 0.2540322580645161,
571
- "grad_norm": 6.186676979064941,
572
  "learning_rate": 4.245232152973148e-06,
573
- "loss": 1.5803,
574
- "mean_token_accuracy": 0.5931076407432556,
575
- "num_tokens": 113825.0,
576
  "step": 63
577
  },
578
  {
579
  "epoch": 0.25806451612903225,
580
- "grad_norm": 6.039516448974609,
581
  "learning_rate": 4.222417297689217e-06,
582
- "loss": 1.5961,
583
- "mean_token_accuracy": 0.5947338342666626,
584
- "num_tokens": 115574.0,
585
  "step": 64
586
  },
587
  {
588
  "epoch": 0.2620967741935484,
589
- "grad_norm": 5.791101455688477,
590
  "learning_rate": 4.199326048186783e-06,
591
- "loss": 1.5687,
592
- "mean_token_accuracy": 0.5985589027404785,
593
- "num_tokens": 117519.0,
594
  "step": 65
595
  },
596
  {
597
  "epoch": 0.2661290322580645,
598
- "grad_norm": 6.39521598815918,
599
  "learning_rate": 4.175962109890697e-06,
600
- "loss": 1.4316,
601
- "mean_token_accuracy": 0.6014805436134338,
602
- "num_tokens": 119142.0,
603
  "step": 66
604
  },
605
  {
606
  "epoch": 0.2701612903225806,
607
- "grad_norm": 7.0771164894104,
608
  "learning_rate": 4.152329231983852e-06,
609
- "loss": 1.476,
610
- "mean_token_accuracy": 0.606376051902771,
611
- "num_tokens": 120681.0,
612
  "step": 67
613
  },
614
  {
615
  "epoch": 0.27419354838709675,
616
- "grad_norm": 6.331075668334961,
617
  "learning_rate": 4.128431206805556e-06,
618
- "loss": 1.411,
619
- "mean_token_accuracy": 0.6274386644363403,
620
- "num_tokens": 122272.0,
621
  "step": 68
622
  },
623
  {
624
  "epoch": 0.2782258064516129,
625
- "grad_norm": 5.685318470001221,
626
  "learning_rate": 4.104271869242975e-06,
627
- "loss": 1.5592,
628
- "mean_token_accuracy": 0.5986500382423401,
629
- "num_tokens": 124200.0,
630
  "step": 69
631
  },
632
  {
633
  "epoch": 0.28225806451612906,
634
- "grad_norm": 6.1029181480407715,
635
  "learning_rate": 4.07985509611576e-06,
636
- "loss": 1.4496,
637
- "mean_token_accuracy": 0.6037967205047607,
638
- "num_tokens": 125835.0,
639
  "step": 70
640
  },
641
  {
642
  "epoch": 0.2862903225806452,
643
- "grad_norm": 5.715236663818359,
644
  "learning_rate": 4.0551848055539345e-06,
645
- "loss": 1.4618,
646
- "mean_token_accuracy": 0.6144641041755676,
647
- "num_tokens": 127759.0,
648
  "step": 71
649
  },
650
  {
651
  "epoch": 0.2903225806451613,
652
- "grad_norm": 6.0016326904296875,
653
  "learning_rate": 4.030264956369158e-06,
654
- "loss": 1.4941,
655
- "mean_token_accuracy": 0.5982388257980347,
656
- "num_tokens": 129578.0,
657
  "step": 72
658
  },
659
  {
660
  "epoch": 0.29435483870967744,
661
- "grad_norm": 5.980151653289795,
662
  "learning_rate": 4.005099547419458e-06,
663
- "loss": 1.5329,
664
- "mean_token_accuracy": 0.6055718660354614,
665
- "num_tokens": 131626.0,
666
  "step": 73
667
  },
668
  {
669
  "epoch": 0.29838709677419356,
670
- "grad_norm": 6.0370402336120605,
671
  "learning_rate": 3.979692616967543e-06,
672
- "loss": 1.6052,
673
- "mean_token_accuracy": 0.5672669410705566,
674
- "num_tokens": 133516.0,
675
  "step": 74
676
  },
677
  {
678
  "epoch": 0.3024193548387097,
679
- "grad_norm": 6.851926803588867,
680
  "learning_rate": 3.9540482420327845e-06,
681
- "loss": 1.7212,
682
- "mean_token_accuracy": 0.564525306224823,
683
- "num_tokens": 135277.0,
684
  "step": 75
685
  },
686
  {
687
  "epoch": 0.3064516129032258,
688
- "grad_norm": 6.928350925445557,
689
  "learning_rate": 3.9281705377369814e-06,
690
- "loss": 1.4745,
691
- "mean_token_accuracy": 0.607520580291748,
692
- "num_tokens": 136981.0,
693
  "step": 76
694
  },
695
  {
696
  "epoch": 0.31048387096774194,
697
- "grad_norm": 6.389596462249756,
698
  "learning_rate": 3.902063656644012e-06,
699
- "loss": 1.5524,
700
- "mean_token_accuracy": 0.5836354494094849,
701
- "num_tokens": 138914.0,
702
  "step": 77
703
  },
704
  {
705
  "epoch": 0.31451612903225806,
706
- "grad_norm": 6.1231184005737305,
707
  "learning_rate": 3.875731788093478e-06,
708
- "loss": 1.5595,
709
- "mean_token_accuracy": 0.5914567112922668,
710
- "num_tokens": 140742.0,
711
  "step": 78
712
  },
713
  {
714
  "epoch": 0.3185483870967742,
715
- "grad_norm": 6.400425910949707,
716
  "learning_rate": 3.84917915752845e-06,
717
- "loss": 1.4809,
718
- "mean_token_accuracy": 0.5991902947425842,
719
- "num_tokens": 142226.0,
720
  "step": 79
721
  },
722
  {
723
  "epoch": 0.3225806451612903,
724
- "grad_norm": 5.729289531707764,
725
  "learning_rate": 3.8224100258174066e-06,
726
- "loss": 1.508,
727
- "mean_token_accuracy": 0.5901980996131897,
728
- "num_tokens": 144146.0,
729
  "step": 80
730
  },
731
  {
732
  "epoch": 0.32661290322580644,
733
- "grad_norm": 6.0282745361328125,
734
  "learning_rate": 3.795428688570505e-06,
735
- "loss": 1.5134,
736
- "mean_token_accuracy": 0.5803259611129761,
737
- "num_tokens": 145866.0,
738
  "step": 81
739
  },
740
  {
741
  "epoch": 0.33064516129032256,
742
- "grad_norm": 6.180048942565918,
743
  "learning_rate": 3.7682394754502687e-06,
744
- "loss": 1.5839,
745
- "mean_token_accuracy": 0.5930232405662537,
746
- "num_tokens": 147760.0,
747
  "step": 82
748
  },
749
  {
750
  "epoch": 0.3346774193548387,
751
- "grad_norm": 6.051394462585449,
752
  "learning_rate": 3.7408467494768104e-06,
753
- "loss": 1.5338,
754
- "mean_token_accuracy": 0.5994550585746765,
755
- "num_tokens": 149597.0,
756
  "step": 83
757
  },
758
  {
759
  "epoch": 0.3387096774193548,
760
- "grad_norm": 6.172335147857666,
761
  "learning_rate": 3.7132549063277033e-06,
762
- "loss": 1.5743,
763
- "mean_token_accuracy": 0.589076042175293,
764
- "num_tokens": 151558.0,
765
  "step": 84
766
  },
767
  {
768
  "epoch": 0.34274193548387094,
769
- "grad_norm": 6.0946173667907715,
770
  "learning_rate": 3.685468373632613e-06,
771
- "loss": 1.6523,
772
- "mean_token_accuracy": 0.5849376916885376,
773
- "num_tokens": 153326.0,
774
  "step": 85
775
  },
776
  {
777
  "epoch": 0.3467741935483871,
778
- "grad_norm": 5.9409685134887695,
779
  "learning_rate": 3.657491610262802e-06,
780
- "loss": 1.5014,
781
- "mean_token_accuracy": 0.5996649861335754,
782
- "num_tokens": 155119.0,
783
  "step": 86
784
  },
785
  {
786
  "epoch": 0.35080645161290325,
787
- "grad_norm": 5.449697017669678,
788
  "learning_rate": 3.6293291056156178e-06,
789
- "loss": 1.4594,
790
- "mean_token_accuracy": 0.6026719212532043,
791
- "num_tokens": 157142.0,
792
  "step": 87
793
  },
794
  {
795
  "epoch": 0.3548387096774194,
796
- "grad_norm": 5.716015338897705,
797
  "learning_rate": 3.600985378894086e-06,
798
- "loss": 1.574,
799
- "mean_token_accuracy": 0.5821114182472229,
800
- "num_tokens": 159190.0,
801
  "step": 88
802
  },
803
  {
804
  "epoch": 0.3588709677419355,
805
- "grad_norm": 5.608844757080078,
806
  "learning_rate": 3.572464978381719e-06,
807
- "loss": 1.4763,
808
- "mean_token_accuracy": 0.6000996232032776,
809
- "num_tokens": 161200.0,
810
  "step": 89
811
  },
812
  {
813
  "epoch": 0.3629032258064516,
814
- "grad_norm": 6.2373504638671875,
815
  "learning_rate": 3.5437724807126583e-06,
816
- "loss": 1.507,
817
- "mean_token_accuracy": 0.6010498404502869,
818
- "num_tokens": 163107.0,
819
  "step": 90
820
  },
821
  {
822
  "epoch": 0.36693548387096775,
823
- "grad_norm": 5.765131950378418,
824
  "learning_rate": 3.514912490137268e-06,
825
- "loss": 1.3591,
826
- "mean_token_accuracy": 0.6406074166297913,
827
- "num_tokens": 164887.0,
828
  "step": 91
829
  },
830
  {
831
  "epoch": 0.3709677419354839,
832
- "grad_norm": 6.011144638061523,
833
  "learning_rate": 3.4858896377832966e-06,
834
- "loss": 1.5002,
835
- "mean_token_accuracy": 0.6023555994033813,
836
- "num_tokens": 166672.0,
837
  "step": 92
838
  },
839
  {
840
  "epoch": 0.375,
841
- "grad_norm": 5.688225269317627,
842
  "learning_rate": 3.4567085809127247e-06,
843
- "loss": 1.5489,
844
- "mean_token_accuracy": 0.5985626578330994,
845
- "num_tokens": 168622.0,
846
  "step": 93
847
  },
848
  {
849
  "epoch": 0.3790322580645161,
850
- "grad_norm": 6.043911933898926,
851
  "learning_rate": 3.42737400217442e-06,
852
- "loss": 1.6204,
853
- "mean_token_accuracy": 0.5738548636436462,
854
- "num_tokens": 170567.0,
855
  "step": 94
856
  },
857
  {
858
  "epoch": 0.38306451612903225,
859
- "grad_norm": 6.654932498931885,
860
  "learning_rate": 3.397890608852718e-06,
861
- "loss": 1.5397,
862
- "mean_token_accuracy": 0.5841359496116638,
863
- "num_tokens": 172334.0,
864
  "step": 95
865
  },
866
  {
867
  "epoch": 0.3870967741935484,
868
- "grad_norm": 6.631567001342773,
869
  "learning_rate": 3.3682631321120507e-06,
870
- "loss": 1.4801,
871
- "mean_token_accuracy": 0.6001339554786682,
872
- "num_tokens": 173829.0,
873
  "step": 96
874
  },
875
  {
876
  "epoch": 0.3911290322580645,
877
- "grad_norm": 6.069915771484375,
878
  "learning_rate": 3.3384963262377434e-06,
879
- "loss": 1.445,
880
- "mean_token_accuracy": 0.6049661636352539,
881
- "num_tokens": 175603.0,
882
  "step": 97
883
  },
884
  {
885
  "epoch": 0.3951612903225806,
886
- "grad_norm": 5.770754814147949,
887
  "learning_rate": 3.3085949678730953e-06,
888
- "loss": 1.3734,
889
- "mean_token_accuracy": 0.6360360383987427,
890
- "num_tokens": 177270.0,
891
  "step": 98
892
  },
893
  {
894
  "epoch": 0.39919354838709675,
895
- "grad_norm": 6.306713104248047,
896
  "learning_rate": 3.278563855252885e-06,
897
- "loss": 1.3932,
898
- "mean_token_accuracy": 0.6313309669494629,
899
- "num_tokens": 178970.0,
900
  "step": 99
901
  },
902
  {
903
  "epoch": 0.4032258064516129,
904
- "grad_norm": 6.053974628448486,
905
  "learning_rate": 3.248407807433396e-06,
906
- "loss": 1.503,
907
- "mean_token_accuracy": 0.596764862537384,
908
- "num_tokens": 180703.0,
909
  "step": 100
910
  },
911
  {
912
  "epoch": 0.40725806451612906,
913
- "grad_norm": 6.127834796905518,
914
  "learning_rate": 3.2181316635191125e-06,
915
- "loss": 1.4419,
916
- "mean_token_accuracy": 0.6175858378410339,
917
- "num_tokens": 182627.0,
918
  "step": 101
919
  },
920
  {
921
  "epoch": 0.4112903225806452,
922
- "grad_norm": 5.551910877227783,
923
  "learning_rate": 3.1877402818861954e-06,
924
- "loss": 1.4529,
925
- "mean_token_accuracy": 0.6182200908660889,
926
- "num_tokens": 184528.0,
927
  "step": 102
928
  },
929
  {
930
  "epoch": 0.4153225806451613,
931
- "grad_norm": 5.696438312530518,
932
  "learning_rate": 3.157238539402862e-06,
933
- "loss": 1.4396,
934
- "mean_token_accuracy": 0.5976768732070923,
935
- "num_tokens": 186424.0,
936
  "step": 103
937
  },
938
  {
939
  "epoch": 0.41935483870967744,
940
- "grad_norm": 6.004664897918701,
941
  "learning_rate": 3.1266313306468018e-06,
942
- "loss": 1.4536,
943
- "mean_token_accuracy": 0.6109510064125061,
944
- "num_tokens": 188161.0,
945
  "step": 104
946
  },
947
  {
948
  "epoch": 0.42338709677419356,
949
- "grad_norm": 5.998103141784668,
950
  "learning_rate": 3.095923567119748e-06,
951
- "loss": 1.5062,
952
- "mean_token_accuracy": 0.5847597718238831,
953
- "num_tokens": 189974.0,
954
  "step": 105
955
  },
956
  {
957
  "epoch": 0.4274193548387097,
958
- "grad_norm": 5.998319625854492,
959
  "learning_rate": 3.0651201764593375e-06,
960
- "loss": 1.5443,
961
- "mean_token_accuracy": 0.6007860898971558,
962
- "num_tokens": 191757.0,
963
  "step": 106
964
  },
965
  {
966
  "epoch": 0.4314516129032258,
967
- "grad_norm": 5.7663469314575195,
968
  "learning_rate": 3.034226101648377e-06,
969
- "loss": 1.5604,
970
- "mean_token_accuracy": 0.5880815386772156,
971
- "num_tokens": 193672.0,
972
  "step": 107
973
  },
974
  {
975
  "epoch": 0.43548387096774194,
976
- "grad_norm": 5.39485502243042,
977
  "learning_rate": 3.0032463002216504e-06,
978
- "loss": 1.3204,
979
- "mean_token_accuracy": 0.626024603843689,
980
- "num_tokens": 195626.0,
981
  "step": 108
982
  },
983
  {
984
  "epoch": 0.43951612903225806,
985
- "grad_norm": 5.5706071853637695,
986
  "learning_rate": 2.972185743470386e-06,
987
- "loss": 1.488,
988
- "mean_token_accuracy": 0.6072655916213989,
989
- "num_tokens": 197665.0,
990
  "step": 109
991
  },
992
  {
993
  "epoch": 0.4435483870967742,
994
- "grad_norm": 6.337919235229492,
995
  "learning_rate": 2.941049415644522e-06,
996
- "loss": 1.4812,
997
- "mean_token_accuracy": 0.6153846383094788,
998
- "num_tokens": 199188.0,
999
  "step": 110
1000
  },
1001
  {
1002
  "epoch": 0.4475806451612903,
1003
- "grad_norm": 5.9900031089782715,
1004
  "learning_rate": 2.909842313152888e-06,
1005
- "loss": 1.5086,
1006
- "mean_token_accuracy": 0.602787435054779,
1007
- "num_tokens": 200912.0,
1008
  "step": 111
1009
  },
1010
  {
1011
  "epoch": 0.45161290322580644,
1012
- "grad_norm": 5.438904285430908,
1013
  "learning_rate": 2.878569443761442e-06,
1014
- "loss": 1.4478,
1015
- "mean_token_accuracy": 0.6037735939025879,
1016
- "num_tokens": 202928.0,
1017
  "step": 112
1018
  },
1019
  {
1020
  "epoch": 0.45564516129032256,
1021
- "grad_norm": 5.8790202140808105,
1022
  "learning_rate": 2.847235825789673e-06,
1023
- "loss": 1.4787,
1024
- "mean_token_accuracy": 0.6100966930389404,
1025
- "num_tokens": 204792.0,
1026
  "step": 113
1027
  },
1028
  {
1029
  "epoch": 0.4596774193548387,
1030
- "grad_norm": 6.044961452484131,
1031
  "learning_rate": 2.8158464873053236e-06,
1032
- "loss": 1.5838,
1033
- "mean_token_accuracy": 0.5734103918075562,
1034
- "num_tokens": 206524.0,
1035
  "step": 114
1036
  },
1037
  {
1038
  "epoch": 0.4637096774193548,
1039
- "grad_norm": 6.442505836486816,
1040
  "learning_rate": 2.784406465317538e-06,
1041
- "loss": 1.3986,
1042
- "mean_token_accuracy": 0.6293754577636719,
1043
- "num_tokens": 207983.0,
1044
  "step": 115
1045
  },
1046
  {
1047
  "epoch": 0.46774193548387094,
1048
- "grad_norm": 5.556701183319092,
1049
  "learning_rate": 2.752920804968581e-06,
1050
- "loss": 1.4771,
1051
- "mean_token_accuracy": 0.5975479483604431,
1052
- "num_tokens": 209861.0,
1053
  "step": 116
1054
  },
1055
  {
1056
  "epoch": 0.4717741935483871,
1057
- "grad_norm": 5.995906829833984,
1058
  "learning_rate": 2.7213945587242507e-06,
1059
- "loss": 1.5932,
1060
- "mean_token_accuracy": 0.5834970474243164,
1061
- "num_tokens": 211899.0,
1062
  "step": 117
1063
  },
1064
  {
1065
  "epoch": 0.47580645161290325,
1066
- "grad_norm": 5.339012622833252,
1067
  "learning_rate": 2.689832785563116e-06,
1068
- "loss": 1.5322,
1069
- "mean_token_accuracy": 0.5938416719436646,
1070
- "num_tokens": 213947.0,
1071
  "step": 118
1072
  },
1073
  {
1074
  "epoch": 0.4798387096774194,
1075
- "grad_norm": 5.315260887145996,
1076
  "learning_rate": 2.658240550164704e-06,
1077
- "loss": 1.5155,
1078
- "mean_token_accuracy": 0.6036168336868286,
1079
- "num_tokens": 215995.0,
1080
  "step": 119
1081
  },
1082
  {
1083
  "epoch": 0.4838709677419355,
1084
- "grad_norm": 5.855271816253662,
1085
  "learning_rate": 2.626622922096782e-06,
1086
- "loss": 1.528,
1087
- "mean_token_accuracy": 0.5960825681686401,
1088
- "num_tokens": 217886.0,
1089
  "step": 120
1090
  },
1091
  {
1092
  "epoch": 0.4879032258064516,
1093
- "grad_norm": 5.652218818664551,
1094
  "learning_rate": 2.5949849750018486e-06,
1095
- "loss": 1.3299,
1096
- "mean_token_accuracy": 0.6447288393974304,
1097
- "num_tokens": 219529.0,
1098
  "step": 121
1099
  },
1100
  {
1101
  "epoch": 0.49193548387096775,
1102
- "grad_norm": 5.632993221282959,
1103
  "learning_rate": 2.56333178578297e-06,
1104
- "loss": 1.6213,
1105
- "mean_token_accuracy": 0.5840754508972168,
1106
- "num_tokens": 221440.0,
1107
  "step": 122
1108
  },
1109
  {
1110
  "epoch": 0.4959677419354839,
1111
- "grad_norm": 5.944242000579834,
1112
  "learning_rate": 2.5316684337891005e-06,
1113
- "loss": 1.3382,
1114
- "mean_token_accuracy": 0.6456548571586609,
1115
- "num_tokens": 223076.0,
1116
  "step": 123
1117
  },
1118
  {
1119
  "epoch": 0.5,
1120
- "grad_norm": 6.07872200012207,
1121
  "learning_rate": 2.5e-06,
1122
- "loss": 1.4408,
1123
- "mean_token_accuracy": 0.6191074848175049,
1124
- "num_tokens": 224669.0,
1125
  "step": 124
1126
  },
1127
  {
1128
  "epoch": 0.5040322580645161,
1129
- "grad_norm": 5.755851745605469,
1130
  "learning_rate": 2.4683315662109003e-06,
1131
- "loss": 1.5241,
1132
- "mean_token_accuracy": 0.5973829627037048,
1133
- "num_tokens": 226658.0,
1134
  "step": 125
1135
  },
1136
  {
1137
  "epoch": 0.5080645161290323,
1138
- "grad_norm": 5.648890495300293,
1139
  "learning_rate": 2.436668214217031e-06,
1140
- "loss": 1.4993,
1141
- "mean_token_accuracy": 0.5903743505477905,
1142
- "num_tokens": 228530.0,
1143
  "step": 126
1144
  },
1145
  {
1146
  "epoch": 0.5120967741935484,
1147
- "grad_norm": 5.533483982086182,
1148
  "learning_rate": 2.4050150249981522e-06,
1149
- "loss": 1.4514,
1150
- "mean_token_accuracy": 0.603741466999054,
1151
- "num_tokens": 230296.0,
1152
  "step": 127
1153
  },
1154
  {
1155
  "epoch": 0.5161290322580645,
1156
- "grad_norm": 5.521024227142334,
1157
  "learning_rate": 2.3733770779032185e-06,
1158
- "loss": 1.6105,
1159
- "mean_token_accuracy": 0.5796676278114319,
1160
- "num_tokens": 232344.0,
1161
  "step": 128
1162
  },
1163
  {
1164
  "epoch": 0.5201612903225806,
1165
- "grad_norm": 5.75255012512207,
1166
  "learning_rate": 2.341759449835297e-06,
1167
- "loss": 1.4744,
1168
- "mean_token_accuracy": 0.6114369630813599,
1169
- "num_tokens": 234392.0,
1170
  "step": 129
1171
  },
1172
  {
1173
  "epoch": 0.5241935483870968,
1174
- "grad_norm": 5.678014755249023,
1175
  "learning_rate": 2.310167214436885e-06,
1176
- "loss": 1.499,
1177
- "mean_token_accuracy": 0.6071619391441345,
1178
- "num_tokens": 236265.0,
1179
  "step": 130
1180
  },
1181
  {
1182
  "epoch": 0.5282258064516129,
1183
- "grad_norm": 5.571830749511719,
1184
  "learning_rate": 2.27860544127575e-06,
1185
- "loss": 1.4699,
1186
- "mean_token_accuracy": 0.5967920422554016,
1187
- "num_tokens": 238075.0,
1188
  "step": 131
1189
  },
1190
  {
1191
  "epoch": 0.532258064516129,
1192
- "grad_norm": 5.447475433349609,
1193
  "learning_rate": 2.24707919503142e-06,
1194
- "loss": 1.3888,
1195
- "mean_token_accuracy": 0.6393354535102844,
1196
- "num_tokens": 239943.0,
1197
  "step": 132
1198
  },
1199
  {
1200
  "epoch": 0.5362903225806451,
1201
- "grad_norm": 5.604481220245361,
1202
  "learning_rate": 2.2155935346824634e-06,
1203
- "loss": 1.4842,
1204
- "mean_token_accuracy": 0.6134101748466492,
1205
- "num_tokens": 241854.0,
1206
  "step": 133
1207
  },
1208
  {
1209
  "epoch": 0.5403225806451613,
1210
- "grad_norm": 6.123833179473877,
1211
  "learning_rate": 2.1841535126946777e-06,
1212
- "loss": 1.5383,
1213
- "mean_token_accuracy": 0.6017410159111023,
1214
- "num_tokens": 243694.0,
1215
  "step": 134
1216
  },
1217
  {
1218
  "epoch": 0.5443548387096774,
1219
- "grad_norm": 5.753880977630615,
1220
  "learning_rate": 2.1527641742103282e-06,
1221
- "loss": 1.3975,
1222
- "mean_token_accuracy": 0.6189274191856384,
1223
- "num_tokens": 245281.0,
1224
  "step": 135
1225
  },
1226
  {
1227
  "epoch": 0.5483870967741935,
1228
- "grad_norm": 6.030308246612549,
1229
  "learning_rate": 2.1214305562385592e-06,
1230
- "loss": 1.6492,
1231
- "mean_token_accuracy": 0.5873786211013794,
1232
- "num_tokens": 247137.0,
1233
  "step": 136
1234
  },
1235
  {
1236
  "epoch": 0.5524193548387096,
1237
- "grad_norm": 5.8744072914123535,
1238
  "learning_rate": 2.0901576868471125e-06,
1239
- "loss": 1.6567,
1240
- "mean_token_accuracy": 0.5649746060371399,
1241
- "num_tokens": 249109.0,
1242
  "step": 137
1243
  },
1244
  {
1245
  "epoch": 0.5564516129032258,
1246
- "grad_norm": 5.910853385925293,
1247
  "learning_rate": 2.05895058435548e-06,
1248
- "loss": 1.3588,
1249
- "mean_token_accuracy": 0.6202830076217651,
1250
- "num_tokens": 250807.0,
1251
  "step": 138
1252
  },
1253
  {
1254
  "epoch": 0.5604838709677419,
1255
- "grad_norm": 5.257957935333252,
1256
  "learning_rate": 2.0278142565296153e-06,
1257
- "loss": 1.4603,
1258
- "mean_token_accuracy": 0.6119257211685181,
1259
- "num_tokens": 252855.0,
1260
  "step": 139
1261
  },
1262
  {
1263
  "epoch": 0.5645161290322581,
1264
- "grad_norm": 5.51725959777832,
1265
  "learning_rate": 1.9967536997783495e-06,
1266
- "loss": 1.4978,
1267
- "mean_token_accuracy": 0.6072477698326111,
1268
- "num_tokens": 254899.0,
1269
  "step": 140
1270
  },
1271
  {
1272
  "epoch": 0.5685483870967742,
1273
- "grad_norm": 5.905327796936035,
1274
  "learning_rate": 1.9657738983516227e-06,
1275
- "loss": 1.4752,
1276
- "mean_token_accuracy": 0.6073774695396423,
1277
- "num_tokens": 256880.0,
1278
  "step": 141
1279
  },
1280
  {
1281
  "epoch": 0.5725806451612904,
1282
- "grad_norm": 5.558453559875488,
1283
  "learning_rate": 1.934879823540663e-06,
1284
- "loss": 1.4172,
1285
- "mean_token_accuracy": 0.6178310513496399,
1286
- "num_tokens": 258800.0,
1287
  "step": 142
1288
  },
1289
  {
1290
  "epoch": 0.5766129032258065,
1291
- "grad_norm": 5.4025092124938965,
1292
  "learning_rate": 1.9040764328802523e-06,
1293
- "loss": 1.2946,
1294
- "mean_token_accuracy": 0.633697509765625,
1295
- "num_tokens": 260541.0,
1296
  "step": 143
1297
  },
1298
  {
1299
  "epoch": 0.5806451612903226,
1300
- "grad_norm": 5.80867338180542,
1301
  "learning_rate": 1.8733686693531986e-06,
1302
- "loss": 1.5323,
1303
- "mean_token_accuracy": 0.5953912138938904,
1304
- "num_tokens": 262409.0,
1305
  "step": 144
1306
  },
1307
  {
1308
  "epoch": 0.5846774193548387,
1309
- "grad_norm": 5.8681960105896,
1310
  "learning_rate": 1.842761460597138e-06,
1311
- "loss": 1.4026,
1312
- "mean_token_accuracy": 0.6375969052314758,
1313
- "num_tokens": 263959.0,
1314
  "step": 145
1315
  },
1316
  {
1317
  "epoch": 0.5887096774193549,
1318
- "grad_norm": 7.02825403213501,
1319
  "learning_rate": 1.812259718113805e-06,
1320
- "loss": 1.5044,
1321
- "mean_token_accuracy": 0.5856146216392517,
1322
- "num_tokens": 265824.0,
1323
  "step": 146
1324
  },
1325
  {
1326
  "epoch": 0.592741935483871,
1327
- "grad_norm": 6.359186172485352,
1328
  "learning_rate": 1.7818683364808883e-06,
1329
- "loss": 1.6111,
1330
- "mean_token_accuracy": 0.5825688242912292,
1331
- "num_tokens": 267570.0,
1332
  "step": 147
1333
  },
1334
  {
1335
  "epoch": 0.5967741935483871,
1336
- "grad_norm": 6.150012493133545,
1337
  "learning_rate": 1.7515921925666053e-06,
1338
- "loss": 1.522,
1339
- "mean_token_accuracy": 0.6050228476524353,
1340
- "num_tokens": 269324.0,
1341
  "step": 148
1342
  },
1343
  {
1344
  "epoch": 0.6008064516129032,
1345
- "grad_norm": 5.904049873352051,
1346
  "learning_rate": 1.7214361447471156e-06,
1347
- "loss": 1.5305,
1348
- "mean_token_accuracy": 0.5844994783401489,
1349
- "num_tokens": 271184.0,
1350
  "step": 149
1351
  },
1352
  {
1353
  "epoch": 0.6048387096774194,
1354
- "grad_norm": 5.567264080047607,
1355
  "learning_rate": 1.6914050321269049e-06,
1356
- "loss": 1.3721,
1357
- "mean_token_accuracy": 0.6215351819992065,
1358
- "num_tokens": 273062.0,
1359
  "step": 150
1360
  },
1361
  {
1362
  "epoch": 0.6088709677419355,
1363
- "grad_norm": 6.086479187011719,
1364
  "learning_rate": 1.6615036737622574e-06,
1365
- "loss": 1.5675,
1366
- "mean_token_accuracy": 0.5931184887886047,
1367
- "num_tokens": 274895.0,
1368
  "step": 151
1369
  },
1370
  {
1371
  "epoch": 0.6129032258064516,
1372
- "grad_norm": 5.990588188171387,
1373
  "learning_rate": 1.6317368678879497e-06,
1374
- "loss": 1.5735,
1375
- "mean_token_accuracy": 0.5785550475120544,
1376
- "num_tokens": 276641.0,
1377
  "step": 152
1378
  },
1379
  {
1380
  "epoch": 0.6169354838709677,
1381
- "grad_norm": 5.858160972595215,
1382
  "learning_rate": 1.6021093911472825e-06,
1383
- "loss": 1.5325,
1384
- "mean_token_accuracy": 0.605381190776825,
1385
- "num_tokens": 278427.0,
1386
  "step": 153
1387
  },
1388
  {
1389
  "epoch": 0.6209677419354839,
1390
- "grad_norm": 5.8571929931640625,
1391
  "learning_rate": 1.572625997825581e-06,
1392
- "loss": 1.4536,
1393
- "mean_token_accuracy": 0.6045584082603455,
1394
- "num_tokens": 280184.0,
1395
  "step": 154
1396
  },
1397
  {
1398
  "epoch": 0.625,
1399
- "grad_norm": 5.734030246734619,
1400
  "learning_rate": 1.5432914190872757e-06,
1401
- "loss": 1.3984,
1402
- "mean_token_accuracy": 0.6304709315299988,
1403
- "num_tokens": 281991.0,
1404
  "step": 155
1405
  },
1406
  {
1407
  "epoch": 0.6290322580645161,
1408
- "grad_norm": 5.276788234710693,
1409
  "learning_rate": 1.5141103622167042e-06,
1410
- "loss": 1.4146,
1411
- "mean_token_accuracy": 0.6243386268615723,
1412
- "num_tokens": 283883.0,
1413
  "step": 156
1414
  },
1415
  {
1416
  "epoch": 0.6330645161290323,
1417
- "grad_norm": 6.132949352264404,
1418
  "learning_rate": 1.4850875098627326e-06,
1419
- "loss": 1.5479,
1420
- "mean_token_accuracy": 0.6136772632598877,
1421
- "num_tokens": 285552.0,
1422
  "step": 157
1423
  },
1424
  {
1425
  "epoch": 0.6370967741935484,
1426
- "grad_norm": 5.336827754974365,
1427
  "learning_rate": 1.456227519287343e-06,
1428
- "loss": 1.4708,
1429
- "mean_token_accuracy": 0.6063033938407898,
1430
- "num_tokens": 287426.0,
1431
  "step": 158
1432
  },
1433
  {
1434
  "epoch": 0.6411290322580645,
1435
- "grad_norm": 5.892852306365967,
1436
  "learning_rate": 1.4275350216182824e-06,
1437
- "loss": 1.3438,
1438
- "mean_token_accuracy": 0.628995418548584,
1439
- "num_tokens": 289180.0,
1440
  "step": 159
1441
  },
1442
  {
1443
  "epoch": 0.6451612903225806,
1444
- "grad_norm": 5.262267112731934,
1445
  "learning_rate": 1.3990146211059141e-06,
1446
- "loss": 1.4585,
1447
- "mean_token_accuracy": 0.6095186471939087,
1448
- "num_tokens": 291031.0,
1449
  "step": 160
1450
  },
1451
  {
1452
  "epoch": 0.6491935483870968,
1453
- "grad_norm": 5.676732063293457,
1454
  "learning_rate": 1.3706708943843822e-06,
1455
- "loss": 1.4394,
1456
- "mean_token_accuracy": 0.6081903576850891,
1457
- "num_tokens": 292840.0,
1458
  "step": 161
1459
  },
1460
  {
1461
  "epoch": 0.6532258064516129,
1462
- "grad_norm": 5.9136505126953125,
1463
  "learning_rate": 1.3425083897371983e-06,
1464
- "loss": 1.4198,
1465
- "mean_token_accuracy": 0.6248492002487183,
1466
- "num_tokens": 294500.0,
1467
  "step": 162
1468
  },
1469
  {
1470
  "epoch": 0.657258064516129,
1471
- "grad_norm": 5.905851364135742,
1472
  "learning_rate": 1.3145316263673874e-06,
1473
- "loss": 1.5303,
1474
- "mean_token_accuracy": 0.5844226479530334,
1475
- "num_tokens": 296338.0,
1476
  "step": 163
1477
  },
1478
  {
1479
  "epoch": 0.6612903225806451,
1480
- "grad_norm": 5.872440338134766,
1481
  "learning_rate": 1.286745093672298e-06,
1482
- "loss": 1.4809,
1483
- "mean_token_accuracy": 0.6081771850585938,
1484
- "num_tokens": 298101.0,
1485
  "step": 164
1486
  },
1487
  {
1488
  "epoch": 0.6653225806451613,
1489
- "grad_norm": 5.722828388214111,
1490
  "learning_rate": 1.2591532505231906e-06,
1491
- "loss": 1.4752,
1492
- "mean_token_accuracy": 0.5961538553237915,
1493
- "num_tokens": 300131.0,
1494
  "step": 165
1495
  },
1496
  {
1497
  "epoch": 0.6693548387096774,
1498
- "grad_norm": 5.679011821746826,
1499
  "learning_rate": 1.2317605245497324e-06,
1500
- "loss": 1.4752,
1501
- "mean_token_accuracy": 0.5989610552787781,
1502
- "num_tokens": 302058.0,
1503
  "step": 166
1504
  },
1505
  {
1506
  "epoch": 0.6733870967741935,
1507
- "grad_norm": 5.3445844650268555,
1508
  "learning_rate": 1.204571311429496e-06,
1509
- "loss": 1.4345,
1510
- "mean_token_accuracy": 0.5995935201644897,
1511
- "num_tokens": 304028.0,
1512
  "step": 167
1513
  },
1514
  {
1515
  "epoch": 0.6774193548387096,
1516
- "grad_norm": 6.3857421875,
1517
  "learning_rate": 1.1775899741825947e-06,
1518
- "loss": 1.4429,
1519
- "mean_token_accuracy": 0.6027944087982178,
1520
- "num_tokens": 305533.0,
1521
  "step": 168
1522
  },
1523
  {
1524
  "epoch": 0.6814516129032258,
1525
- "grad_norm": 6.405001640319824,
1526
  "learning_rate": 1.1508208424715511e-06,
1527
- "loss": 1.5413,
1528
- "mean_token_accuracy": 0.5933043956756592,
1529
- "num_tokens": 307148.0,
1530
  "step": 169
1531
  },
1532
  {
1533
  "epoch": 0.6854838709677419,
1534
- "grad_norm": 5.383761882781982,
1535
  "learning_rate": 1.1242682119065217e-06,
1536
- "loss": 1.3979,
1537
- "mean_token_accuracy": 0.6169666051864624,
1538
- "num_tokens": 309095.0,
1539
  "step": 170
1540
  },
1541
  {
1542
  "epoch": 0.6895161290322581,
1543
- "grad_norm": 5.891480445861816,
1544
  "learning_rate": 1.0979363433559892e-06,
1545
- "loss": 1.4874,
1546
- "mean_token_accuracy": 0.6015037298202515,
1547
- "num_tokens": 310826.0,
1548
  "step": 171
1549
  },
1550
  {
1551
  "epoch": 0.6935483870967742,
1552
- "grad_norm": 6.480978488922119,
1553
  "learning_rate": 1.0718294622630188e-06,
1554
- "loss": 1.4797,
1555
- "mean_token_accuracy": 0.6166783571243286,
1556
- "num_tokens": 312255.0,
1557
  "step": 172
1558
  },
1559
  {
1560
  "epoch": 0.6975806451612904,
1561
- "grad_norm": 5.803843021392822,
1562
  "learning_rate": 1.045951757967215e-06,
1563
- "loss": 1.4709,
1564
- "mean_token_accuracy": 0.6082949042320251,
1565
- "num_tokens": 314210.0,
1566
  "step": 173
1567
  },
1568
  {
1569
  "epoch": 0.7016129032258065,
1570
- "grad_norm": 6.1190619468688965,
1571
  "learning_rate": 1.0203073830324566e-06,
1572
- "loss": 1.412,
1573
- "mean_token_accuracy": 0.6229611039161682,
1574
- "num_tokens": 315806.0,
1575
  "step": 174
1576
  },
1577
  {
1578
  "epoch": 0.7056451612903226,
1579
- "grad_norm": 5.825779438018799,
1580
  "learning_rate": 9.949004525805423e-07,
1581
- "loss": 1.3616,
1582
- "mean_token_accuracy": 0.6321473717689514,
1583
- "num_tokens": 317382.0,
1584
  "step": 175
1585
  },
1586
  {
1587
  "epoch": 0.7096774193548387,
1588
- "grad_norm": 6.114853858947754,
1589
  "learning_rate": 9.697350436308428e-07,
1590
- "loss": 1.369,
1591
- "mean_token_accuracy": 0.6252273917198181,
1592
- "num_tokens": 319033.0,
1593
  "step": 176
1594
  },
1595
  {
1596
  "epoch": 0.7137096774193549,
1597
- "grad_norm": 6.001349449157715,
1598
  "learning_rate": 9.448151944460657e-07,
1599
- "loss": 1.5904,
1600
- "mean_token_accuracy": 0.5789205431938171,
1601
- "num_tokens": 320999.0,
1602
  "step": 177
1603
  },
1604
  {
1605
  "epoch": 0.717741935483871,
1606
- "grad_norm": 5.847578525543213,
1607
  "learning_rate": 9.201449038842403e-07,
1608
- "loss": 1.5322,
1609
- "mean_token_accuracy": 0.6073808073997498,
1610
- "num_tokens": 322952.0,
1611
  "step": 178
1612
  },
1613
  {
1614
  "epoch": 0.7217741935483871,
1615
- "grad_norm": 6.095054626464844,
1616
  "learning_rate": 8.957281307570254e-07,
1617
- "loss": 1.4049,
1618
- "mean_token_accuracy": 0.6155827641487122,
1619
- "num_tokens": 324507.0,
1620
  "step": 179
1621
  },
1622
  {
1623
  "epoch": 0.7258064516129032,
1624
- "grad_norm": 5.704720497131348,
1625
  "learning_rate": 8.71568793194445e-07,
1626
- "loss": 1.4795,
1627
- "mean_token_accuracy": 0.6071779727935791,
1628
- "num_tokens": 326543.0,
1629
  "step": 180
1630
  },
1631
  {
1632
  "epoch": 0.7298387096774194,
1633
- "grad_norm": 5.767334461212158,
1634
  "learning_rate": 8.476707680161486e-07,
1635
- "loss": 1.4597,
1636
- "mean_token_accuracy": 0.6028138399124146,
1637
- "num_tokens": 328393.0,
1638
  "step": 181
1639
  },
1640
  {
1641
  "epoch": 0.7338709677419355,
1642
- "grad_norm": 5.690002918243408,
1643
  "learning_rate": 8.240378901093035e-07,
1644
- "loss": 1.5312,
1645
- "mean_token_accuracy": 0.6054233312606812,
1646
- "num_tokens": 330202.0,
1647
  "step": 182
1648
  },
1649
  {
1650
  "epoch": 0.7379032258064516,
1651
- "grad_norm": 6.5543599128723145,
1652
  "learning_rate": 8.006739518132179e-07,
1653
- "loss": 1.5492,
1654
- "mean_token_accuracy": 0.5909899473190308,
1655
- "num_tokens": 331891.0,
1656
  "step": 183
1657
  },
1658
  {
1659
  "epoch": 0.7419354838709677,
1660
- "grad_norm": 6.568686008453369,
1661
  "learning_rate": 7.775827023107835e-07,
1662
- "loss": 1.4621,
1663
- "mean_token_accuracy": 0.6218130588531494,
1664
- "num_tokens": 333305.0,
1665
  "step": 184
1666
  },
1667
  {
1668
  "epoch": 0.7459677419354839,
1669
- "grad_norm": 5.713389873504639,
1670
  "learning_rate": 7.547678470268526e-07,
1671
- "loss": 1.4554,
1672
- "mean_token_accuracy": 0.6157278418540955,
1673
- "num_tokens": 335100.0,
1674
  "step": 185
1675
  },
1676
  {
1677
  "epoch": 0.75,
1678
- "grad_norm": 5.921985626220703,
1679
  "learning_rate": 7.322330470336314e-07,
1680
- "loss": 1.5999,
1681
- "mean_token_accuracy": 0.5684261918067932,
1682
- "num_tokens": 337148.0,
1683
  "step": 186
1684
  },
1685
  {
1686
  "epoch": 0.7540322580645161,
1687
- "grad_norm": 5.191997051239014,
1688
  "learning_rate": 7.099819184631929e-07,
1689
- "loss": 1.4016,
1690
- "mean_token_accuracy": 0.6217559576034546,
1691
- "num_tokens": 338961.0,
1692
  "step": 187
1693
  },
1694
  {
1695
  "epoch": 0.7580645161290323,
1696
- "grad_norm": 5.541125297546387,
1697
  "learning_rate": 6.880180319272006e-07,
1698
- "loss": 1.3727,
1699
- "mean_token_accuracy": 0.6247433423995972,
1700
- "num_tokens": 340911.0,
1701
  "step": 188
1702
  },
1703
  {
1704
  "epoch": 0.7620967741935484,
1705
- "grad_norm": 5.37374210357666,
1706
  "learning_rate": 6.663449119439358e-07,
1707
- "loss": 1.4549,
1708
- "mean_token_accuracy": 0.6195766925811768,
1709
- "num_tokens": 342803.0,
1710
  "step": 189
1711
  },
1712
  {
1713
  "epoch": 0.7661290322580645,
1714
- "grad_norm": 5.33998966217041,
1715
  "learning_rate": 6.449660363727236e-07,
1716
- "loss": 1.4693,
1717
- "mean_token_accuracy": 0.6066945791244507,
1718
- "num_tokens": 344717.0,
1719
  "step": 190
1720
  },
1721
  {
1722
  "epoch": 0.7701612903225806,
1723
- "grad_norm": 6.046936511993408,
1724
  "learning_rate": 6.238848358558439e-07,
1725
- "loss": 1.4392,
1726
- "mean_token_accuracy": 0.6037507653236389,
1727
- "num_tokens": 346372.0,
1728
  "step": 191
1729
  },
1730
  {
1731
  "epoch": 0.7741935483870968,
1732
- "grad_norm": 6.447813034057617,
1733
  "learning_rate": 6.031046932680229e-07,
1734
- "loss": 1.4287,
1735
- "mean_token_accuracy": 0.6267870664596558,
1736
- "num_tokens": 347703.0,
1737
  "step": 192
1738
  },
1739
  {
1740
  "epoch": 0.7782258064516129,
1741
- "grad_norm": 5.176486492156982,
1742
  "learning_rate": 5.826289431735832e-07,
1743
- "loss": 1.4166,
1744
- "mean_token_accuracy": 0.6180344223976135,
1745
- "num_tokens": 349679.0,
1746
  "step": 193
1747
  },
1748
  {
1749
  "epoch": 0.782258064516129,
1750
- "grad_norm": 6.081844329833984,
1751
  "learning_rate": 5.624608712913531e-07,
1752
- "loss": 1.3443,
1753
- "mean_token_accuracy": 0.6172370314598083,
1754
- "num_tokens": 351259.0,
1755
  "step": 194
1756
  },
1757
  {
1758
  "epoch": 0.7862903225806451,
1759
- "grad_norm": 5.20806884765625,
1760
  "learning_rate": 5.426037139674117e-07,
1761
- "loss": 1.4234,
1762
- "mean_token_accuracy": 0.6199596524238586,
1763
- "num_tokens": 353245.0,
1764
  "step": 195
1765
  },
1766
  {
1767
  "epoch": 0.7903225806451613,
1768
- "grad_norm": 5.874090671539307,
1769
  "learning_rate": 5.23060657655754e-07,
1770
- "loss": 1.4924,
1771
- "mean_token_accuracy": 0.6056056022644043,
1772
- "num_tokens": 355245.0,
1773
  "step": 196
1774
  },
1775
  {
1776
  "epoch": 0.7943548387096774,
1777
- "grad_norm": 5.100249767303467,
1778
  "learning_rate": 5.038348384069663e-07,
1779
- "loss": 1.3869,
1780
- "mean_token_accuracy": 0.6326630711555481,
1781
- "num_tokens": 357286.0,
1782
  "step": 197
1783
  },
1784
  {
1785
  "epoch": 0.7983870967741935,
1786
- "grad_norm": 5.905794620513916,
1787
  "learning_rate": 4.84929341364988e-07,
1788
- "loss": 1.4721,
1789
- "mean_token_accuracy": 0.6025428175926208,
1790
- "num_tokens": 359097.0,
1791
  "step": 198
1792
  },
1793
  {
1794
  "epoch": 0.8024193548387096,
1795
- "grad_norm": 5.788403034210205,
1796
  "learning_rate": 4.6634720027204093e-07,
1797
- "loss": 1.4298,
1798
- "mean_token_accuracy": 0.6162790656089783,
1799
- "num_tokens": 360905.0,
1800
  "step": 199
1801
  },
1802
  {
1803
  "epoch": 0.8064516129032258,
1804
- "grad_norm": 5.737428188323975,
1805
  "learning_rate": 4.480913969818099e-07,
1806
- "loss": 1.4908,
1807
- "mean_token_accuracy": 0.5982195734977722,
1808
- "num_tokens": 362592.0,
1809
  "step": 200
1810
  },
1811
  {
1812
  "epoch": 0.8104838709677419,
1813
- "grad_norm": 5.1596360206604,
1814
  "learning_rate": 4.3016486098094667e-07,
1815
- "loss": 1.4329,
1816
- "mean_token_accuracy": 0.6217008829116821,
1817
- "num_tokens": 364640.0,
1818
  "step": 201
1819
  },
1820
  {
1821
  "epoch": 0.8145161290322581,
1822
- "grad_norm": 5.4481964111328125,
1823
  "learning_rate": 4.125704689189819e-07,
1824
- "loss": 1.4815,
1825
- "mean_token_accuracy": 0.5962854623794556,
1826
- "num_tokens": 366688.0,
1827
  "step": 202
1828
  },
1829
  {
1830
  "epoch": 0.8185483870967742,
1831
- "grad_norm": 5.777848243713379,
1832
  "learning_rate": 3.953110441467073e-07,
1833
- "loss": 1.3747,
1834
- "mean_token_accuracy": 0.6329723000526428,
1835
- "num_tokens": 368352.0,
1836
  "step": 203
1837
  },
1838
  {
1839
  "epoch": 0.8225806451612904,
1840
- "grad_norm": 5.9793500900268555,
1841
  "learning_rate": 3.7838935626312246e-07,
1842
- "loss": 1.4336,
1843
- "mean_token_accuracy": 0.6180328130722046,
1844
- "num_tokens": 370184.0,
1845
  "step": 204
1846
  },
1847
  {
1848
  "epoch": 0.8266129032258065,
1849
- "grad_norm": 5.642481327056885,
1850
  "learning_rate": 3.6180812067099477e-07,
1851
- "loss": 1.3679,
1852
- "mean_token_accuracy": 0.6316390633583069,
1853
- "num_tokens": 371937.0,
1854
  "step": 205
1855
  },
1856
  {
1857
  "epoch": 0.8306451612903226,
1858
- "grad_norm": 5.729284763336182,
1859
  "learning_rate": 3.455699981411259e-07,
1860
- "loss": 1.5629,
1861
- "mean_token_accuracy": 0.5879765152931213,
1862
- "num_tokens": 373985.0,
1863
  "step": 206
1864
  },
1865
  {
1866
  "epoch": 0.8346774193548387,
1867
- "grad_norm": 5.723310947418213,
1868
  "learning_rate": 3.296775943853789e-07,
1869
- "loss": 1.3882,
1870
- "mean_token_accuracy": 0.6189857125282288,
1871
- "num_tokens": 375525.0,
1872
  "step": 207
1873
  },
1874
  {
1875
  "epoch": 0.8387096774193549,
1876
- "grad_norm": 5.779016494750977,
1877
  "learning_rate": 3.141334596385448e-07,
1878
- "loss": 1.461,
1879
- "mean_token_accuracy": 0.6056337952613831,
1880
- "num_tokens": 377373.0,
1881
  "step": 208
1882
  },
1883
  {
1884
  "epoch": 0.842741935483871,
1885
- "grad_norm": 5.4905500411987305,
1886
  "learning_rate": 2.9894008824910726e-07,
1887
- "loss": 1.521,
1888
- "mean_token_accuracy": 0.5928640961647034,
1889
- "num_tokens": 379421.0,
1890
  "step": 209
1891
  },
1892
  {
1893
  "epoch": 0.8467741935483871,
1894
- "grad_norm": 5.458898544311523,
1895
  "learning_rate": 2.840999182789797e-07,
1896
- "loss": 1.3915,
1897
- "mean_token_accuracy": 0.6187683343887329,
1898
- "num_tokens": 381469.0,
1899
  "step": 210
1900
  },
1901
  {
1902
  "epoch": 0.8508064516129032,
1903
- "grad_norm": 5.632810592651367,
1904
  "learning_rate": 2.696153311122704e-07,
1905
- "loss": 1.6075,
1906
- "mean_token_accuracy": 0.5782422423362732,
1907
- "num_tokens": 383337.0,
1908
  "step": 211
1909
  },
1910
  {
1911
  "epoch": 0.8548387096774194,
1912
- "grad_norm": 5.911554336547852,
1913
  "learning_rate": 2.5548865107314606e-07,
1914
- "loss": 1.4003,
1915
- "mean_token_accuracy": 0.6060606241226196,
1916
- "num_tokens": 385055.0,
1917
  "step": 212
1918
  },
1919
  {
1920
  "epoch": 0.8588709677419355,
1921
- "grad_norm": 5.624124050140381,
1922
  "learning_rate": 2.4172214505285006e-07,
1923
- "loss": 1.4583,
1924
- "mean_token_accuracy": 0.6126724481582642,
1925
- "num_tokens": 387014.0,
1926
  "step": 213
1927
  },
1928
  {
1929
  "epoch": 0.8629032258064516,
1930
- "grad_norm": 5.617734432220459,
1931
  "learning_rate": 2.2831802214593774e-07,
1932
- "loss": 1.5721,
1933
- "mean_token_accuracy": 0.5764281749725342,
1934
- "num_tokens": 388959.0,
1935
  "step": 214
1936
  },
1937
  {
1938
  "epoch": 0.8669354838709677,
1939
- "grad_norm": 5.280562877655029,
1940
  "learning_rate": 2.1527843329578328e-07,
1941
- "loss": 1.5147,
1942
- "mean_token_accuracy": 0.6034653186798096,
1943
- "num_tokens": 390981.0,
1944
  "step": 215
1945
  },
1946
  {
1947
  "epoch": 0.8709677419354839,
1948
- "grad_norm": 5.687208652496338,
1949
  "learning_rate": 2.026054709494235e-07,
1950
- "loss": 1.4534,
1951
- "mean_token_accuracy": 0.6150367259979248,
1952
- "num_tokens": 392752.0,
1953
  "step": 216
1954
  },
1955
  {
1956
  "epoch": 0.875,
1957
- "grad_norm": 5.705509185791016,
1958
  "learning_rate": 1.9030116872178317e-07,
1959
- "loss": 1.4152,
1960
- "mean_token_accuracy": 0.6127232313156128,
1961
- "num_tokens": 394546.0,
1962
  "step": 217
1963
  },
1964
  {
1965
  "epoch": 0.8790322580645161,
1966
- "grad_norm": 5.683897972106934,
1967
  "learning_rate": 1.7836750106934475e-07,
1968
- "loss": 1.4271,
1969
- "mean_token_accuracy": 0.6148231625556946,
1970
- "num_tokens": 396329.0,
1971
  "step": 218
1972
  },
1973
  {
1974
  "epoch": 0.8830645161290323,
1975
- "grad_norm": 6.243759632110596,
1976
  "learning_rate": 1.6680638297330854e-07,
1977
- "loss": 1.4465,
1978
- "mean_token_accuracy": 0.5965217351913452,
1979
- "num_tokens": 398056.0,
1980
  "step": 219
1981
  },
1982
  {
1983
  "epoch": 0.8870967741935484,
1984
- "grad_norm": 6.134514331817627,
1985
  "learning_rate": 1.5561966963229925e-07,
1986
- "loss": 1.3789,
1987
- "mean_token_accuracy": 0.6306666731834412,
1988
- "num_tokens": 399558.0,
1989
  "step": 220
1990
  },
1991
  {
1992
  "epoch": 0.8911290322580645,
1993
- "grad_norm": 5.514354228973389,
1994
  "learning_rate": 1.448091561646628e-07,
1995
- "loss": 1.4516,
1996
- "mean_token_accuracy": 0.629768967628479,
1997
- "num_tokens": 401421.0,
1998
  "step": 221
1999
  },
2000
  {
2001
  "epoch": 0.8951612903225806,
2002
- "grad_norm": 6.08192253112793,
2003
  "learning_rate": 1.3437657732040783e-07,
2004
- "loss": 1.3599,
2005
- "mean_token_accuracy": 0.627013623714447,
2006
- "num_tokens": 403037.0,
2007
  "step": 222
2008
  },
2009
  {
2010
  "epoch": 0.8991935483870968,
2011
- "grad_norm": 5.9470624923706055,
2012
  "learning_rate": 1.243236072028317e-07,
2013
- "loss": 1.4422,
2014
- "mean_token_accuracy": 0.6150983572006226,
2015
- "num_tokens": 404920.0,
2016
  "step": 223
2017
  },
2018
  {
2019
  "epoch": 0.9032258064516129,
2020
- "grad_norm": 5.818566799163818,
2021
  "learning_rate": 1.1465185899987797e-07,
2022
- "loss": 1.5879,
2023
- "mean_token_accuracy": 0.5864583253860474,
2024
- "num_tokens": 406842.0,
2025
  "step": 224
2026
  },
2027
  {
2028
  "epoch": 0.907258064516129,
2029
- "grad_norm": 5.502862930297852,
2030
  "learning_rate": 1.0536288472527162e-07,
2031
- "loss": 1.5814,
2032
- "mean_token_accuracy": 0.6052631735801697,
2033
- "num_tokens": 408782.0,
2034
  "step": 225
2035
  },
2036
  {
2037
  "epoch": 0.9112903225806451,
2038
- "grad_norm": 5.415470123291016,
2039
  "learning_rate": 9.645817496946902e-08,
2040
- "loss": 1.444,
2041
- "mean_token_accuracy": 0.6195147037506104,
2042
- "num_tokens": 410721.0,
2043
  "step": 226
2044
  },
2045
  {
2046
  "epoch": 0.9153225806451613,
2047
- "grad_norm": 5.986811637878418,
2048
  "learning_rate": 8.79391586604636e-08,
2049
- "loss": 1.5418,
2050
- "mean_token_accuracy": 0.5859031081199646,
2051
- "num_tokens": 412539.0,
2052
  "step": 227
2053
  },
2054
  {
2055
  "epoch": 0.9193548387096774,
2056
- "grad_norm": 5.945112228393555,
2057
  "learning_rate": 7.980720283448957e-08,
2058
- "loss": 1.6551,
2059
- "mean_token_accuracy": 0.5674486756324768,
2060
- "num_tokens": 414587.0,
2061
  "step": 228
2062
  },
2063
  {
2064
  "epoch": 0.9233870967741935,
2065
- "grad_norm": 6.400421619415283,
2066
  "learning_rate": 7.206361241665266e-08,
2067
- "loss": 1.4483,
2068
- "mean_token_accuracy": 0.6121463179588318,
2069
- "num_tokens": 416038.0,
2070
  "step": 229
2071
  },
2072
  {
2073
  "epoch": 0.9274193548387096,
2074
- "grad_norm": 5.438772678375244,
2075
  "learning_rate": 6.470963001153268e-08,
2076
- "loss": 1.4672,
2077
- "mean_token_accuracy": 0.6242893934249878,
2078
- "num_tokens": 417975.0,
2079
  "step": 230
2080
  },
2081
  {
2082
  "epoch": 0.9314516129032258,
2083
- "grad_norm": 5.7500715255737305,
2084
  "learning_rate": 5.774643570378296e-08,
2085
- "loss": 1.2723,
2086
- "mean_token_accuracy": 0.6415220499038696,
2087
- "num_tokens": 419475.0,
2088
  "step": 231
2089
  },
2090
  {
2091
  "epoch": 0.9354838709677419,
2092
- "grad_norm": 5.149988651275635,
2093
  "learning_rate": 5.117514686876379e-08,
2094
- "loss": 1.4171,
2095
- "mean_token_accuracy": 0.6119639873504639,
2096
- "num_tokens": 421366.0,
2097
  "step": 232
2098
  },
2099
  {
2100
  "epoch": 0.9395161290322581,
2101
- "grad_norm": 5.256503105163574,
2102
  "learning_rate": 4.4996817993239464e-08,
2103
- "loss": 1.5328,
2104
- "mean_token_accuracy": 0.5879765152931213,
2105
- "num_tokens": 423414.0,
2106
  "step": 233
2107
  },
2108
  {
2109
  "epoch": 0.9435483870967742,
2110
- "grad_norm": 6.25666618347168,
2111
  "learning_rate": 3.9212440506164465e-08,
2112
- "loss": 1.4684,
2113
- "mean_token_accuracy": 0.6087219715118408,
2114
- "num_tokens": 425067.0,
2115
  "step": 234
2116
  },
2117
  {
2118
  "epoch": 0.9475806451612904,
2119
- "grad_norm": 5.981427192687988,
2120
  "learning_rate": 3.382294261959157e-08,
2121
- "loss": 1.5281,
2122
- "mean_token_accuracy": 0.6133871674537659,
2123
- "num_tokens": 426802.0,
2124
  "step": 235
2125
  },
2126
  {
2127
  "epoch": 0.9516129032258065,
2128
- "grad_norm": 6.321535587310791,
2129
  "learning_rate": 2.8829189179721552e-08,
2130
- "loss": 1.4104,
2131
- "mean_token_accuracy": 0.6142162680625916,
2132
- "num_tokens": 428450.0,
2133
  "step": 236
2134
  },
2135
  {
2136
  "epoch": 0.9556451612903226,
2137
- "grad_norm": 5.302191257476807,
2138
  "learning_rate": 2.423198152812306e-08,
2139
- "loss": 1.3648,
2140
- "mean_token_accuracy": 0.6049119234085083,
2141
- "num_tokens": 430325.0,
2142
  "step": 237
2143
  },
2144
  {
2145
  "epoch": 0.9596774193548387,
2146
- "grad_norm": 5.734409809112549,
2147
  "learning_rate": 2.0032057373142453e-08,
2148
- "loss": 1.4529,
2149
- "mean_token_accuracy": 0.6051889657974243,
2150
- "num_tokens": 432100.0,
2151
  "step": 238
2152
  },
2153
  {
2154
  "epoch": 0.9637096774193549,
2155
- "grad_norm": 6.134054660797119,
2156
  "learning_rate": 1.6230090671524312e-08,
2157
- "loss": 1.4594,
2158
- "mean_token_accuracy": 0.606150209903717,
2159
- "num_tokens": 433793.0,
2160
  "step": 239
2161
  },
2162
  {
2163
  "epoch": 0.967741935483871,
2164
- "grad_norm": 6.0663161277771,
2165
  "learning_rate": 1.2826691520262114e-08,
2166
- "loss": 1.5932,
2167
- "mean_token_accuracy": 0.594725489616394,
2168
- "num_tokens": 435653.0,
2169
  "step": 240
2170
  },
2171
  {
2172
  "epoch": 0.9717741935483871,
2173
- "grad_norm": 5.548215866088867,
2174
  "learning_rate": 9.822406058697665e-09,
2175
- "loss": 1.5462,
2176
- "mean_token_accuracy": 0.6009804010391235,
2177
- "num_tokens": 437695.0,
2178
  "step": 241
2179
  },
2180
  {
2181
  "epoch": 0.9758064516129032,
2182
- "grad_norm": 5.297504901885986,
2183
  "learning_rate": 7.217716380881479e-09,
2184
- "loss": 1.4292,
2185
- "mean_token_accuracy": 0.605381190776825,
2186
- "num_tokens": 439704.0,
2187
  "step": 242
2188
  },
2189
  {
2190
  "epoch": 0.9798387096774194,
2191
- "grad_norm": 6.025217533111572,
2192
  "learning_rate": 5.0130404582127144e-09,
2193
- "loss": 1.354,
2194
- "mean_token_accuracy": 0.6391494870185852,
2195
- "num_tokens": 441305.0,
2196
  "step": 243
2197
  },
2198
  {
2199
  "epoch": 0.9838709677419355,
2200
- "grad_norm": 5.9209885597229,
2201
  "learning_rate": 3.208732072368104e-09,
2202
- "loss": 1.5128,
2203
- "mean_token_accuracy": 0.6028446555137634,
2204
- "num_tokens": 443135.0,
2205
  "step": 244
2206
  },
2207
  {
2208
  "epoch": 0.9879032258064516,
2209
- "grad_norm": 5.476851463317871,
2210
  "learning_rate": 1.8050807585293095e-09,
2211
- "loss": 1.4337,
2212
- "mean_token_accuracy": 0.5929054021835327,
2213
- "num_tokens": 444913.0,
2214
  "step": 245
2215
  },
2216
  {
2217
  "epoch": 0.9919354838709677,
2218
- "grad_norm": 5.278493881225586,
2219
  "learning_rate": 8.023117589237017e-10,
2220
- "loss": 1.4681,
2221
- "mean_token_accuracy": 0.6177908182144165,
2222
- "num_tokens": 446961.0,
2223
  "step": 246
2224
  },
2225
  {
2226
  "epoch": 0.9959677419354839,
2227
- "grad_norm": 5.931204319000244,
2228
  "learning_rate": 2.0058598667854755e-10,
2229
- "loss": 1.3652,
2230
- "mean_token_accuracy": 0.6268472671508789,
2231
- "num_tokens": 448587.0,
2232
  "step": 247
2233
  },
2234
  {
2235
  "epoch": 1.0,
2236
- "grad_norm": 6.229298114776611,
2237
  "learning_rate": 0.0,
2238
- "loss": 1.3911,
2239
- "mean_token_accuracy": 0.5912636518478394,
2240
- "num_tokens": 449229.0,
2241
  "step": 248
2242
  }
2243
  ],
@@ -2258,7 +2258,7 @@
2258
  "attributes": {}
2259
  }
2260
  },
2261
- "total_flos": 1.0128604195192832e+16,
2262
  "train_batch_size": 1,
2263
  "trial_name": null,
2264
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.004032258064516129,
13
+ "grad_norm": 99.0960693359375,
14
  "learning_rate": 4.999799414013322e-06,
15
+ "loss": 3.2792,
16
+ "mean_token_accuracy": 0.4705558717250824,
17
+ "num_tokens": 1819.0,
18
  "step": 1
19
  },
20
  {
21
  "epoch": 0.008064516129032258,
22
+ "grad_norm": 47.43061828613281,
23
  "learning_rate": 4.999197688241076e-06,
24
+ "loss": 2.6949,
25
+ "mean_token_accuracy": 0.4918949007987976,
26
+ "num_tokens": 3610.0,
27
  "step": 2
28
  },
29
  {
30
  "epoch": 0.012096774193548387,
31
+ "grad_norm": 40.37574005126953,
32
  "learning_rate": 4.998194919241471e-06,
33
+ "loss": 2.1322,
34
+ "mean_token_accuracy": 0.5253481864929199,
35
+ "num_tokens": 5407.0,
36
  "step": 3
37
  },
38
  {
39
  "epoch": 0.016129032258064516,
40
+ "grad_norm": 21.76544189453125,
41
  "learning_rate": 4.996791267927632e-06,
42
+ "loss": 1.924,
43
+ "mean_token_accuracy": 0.5501400828361511,
44
+ "num_tokens": 7194.0,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.020161290322580645,
49
+ "grad_norm": 30.451719284057617,
50
  "learning_rate": 4.994986959541788e-06,
51
+ "loss": 2.0233,
52
+ "mean_token_accuracy": 0.5161290168762207,
53
+ "num_tokens": 9242.0,
54
  "step": 5
55
  },
56
  {
57
  "epoch": 0.024193548387096774,
58
+ "grad_norm": 8.43026065826416,
59
  "learning_rate": 4.9927822836191185e-06,
60
+ "loss": 1.8731,
61
+ "mean_token_accuracy": 0.5502092242240906,
62
+ "num_tokens": 11156.0,
63
  "step": 6
64
  },
65
  {
66
  "epoch": 0.028225806451612902,
67
+ "grad_norm": 8.538537979125977,
68
  "learning_rate": 4.990177593941303e-06,
69
+ "loss": 1.8274,
70
+ "mean_token_accuracy": 0.5549915432929993,
71
+ "num_tokens": 12931.0,
72
  "step": 7
73
  },
74
  {
75
  "epoch": 0.03225806451612903,
76
+ "grad_norm": 8.681036949157715,
77
  "learning_rate": 4.987173308479738e-06,
78
+ "loss": 1.8148,
79
+ "mean_token_accuracy": 0.563049852848053,
80
+ "num_tokens": 14979.0,
81
  "step": 8
82
  },
83
  {
84
  "epoch": 0.036290322580645164,
85
+ "grad_norm": 7.441074848175049,
86
  "learning_rate": 4.9837699093284765e-06,
87
+ "loss": 1.7176,
88
+ "mean_token_accuracy": 0.572820782661438,
89
+ "num_tokens": 16828.0,
90
  "step": 9
91
  },
92
  {
93
  "epoch": 0.04032258064516129,
94
+ "grad_norm": 7.5016093254089355,
95
  "learning_rate": 4.9799679426268575e-06,
96
+ "loss": 1.826,
97
+ "mean_token_accuracy": 0.5537056922912598,
98
+ "num_tokens": 18692.0,
99
  "step": 10
100
  },
101
  {
102
  "epoch": 0.04435483870967742,
103
+ "grad_norm": 7.0608062744140625,
104
  "learning_rate": 4.975768018471877e-06,
105
+ "loss": 1.6788,
106
+ "mean_token_accuracy": 0.5741225481033325,
107
+ "num_tokens": 20603.0,
108
  "step": 11
109
  },
110
  {
111
  "epoch": 0.04838709677419355,
112
+ "grad_norm": 7.283655166625977,
113
  "learning_rate": 4.971170810820279e-06,
114
+ "loss": 1.6694,
115
+ "mean_token_accuracy": 0.5708954930305481,
116
+ "num_tokens": 22213.0,
117
  "step": 12
118
  },
119
  {
120
  "epoch": 0.05241935483870968,
121
+ "grad_norm": 7.800687313079834,
122
  "learning_rate": 4.966177057380409e-06,
123
+ "loss": 1.7202,
124
+ "mean_token_accuracy": 0.5798588991165161,
125
+ "num_tokens": 23774.0,
126
  "step": 13
127
  },
128
  {
129
  "epoch": 0.056451612903225805,
130
+ "grad_norm": 7.64439058303833,
131
  "learning_rate": 4.960787559493836e-06,
132
+ "loss": 1.6408,
133
+ "mean_token_accuracy": 0.5935754179954529,
134
+ "num_tokens": 25208.0,
135
  "step": 14
136
  },
137
  {
138
  "epoch": 0.06048387096774194,
139
+ "grad_norm": 6.813995361328125,
140
  "learning_rate": 4.955003182006761e-06,
141
+ "loss": 1.6397,
142
+ "mean_token_accuracy": 0.5809217691421509,
143
+ "num_tokens": 27076.0,
144
  "step": 15
145
  },
146
  {
147
  "epoch": 0.06451612903225806,
148
+ "grad_norm": 7.024792194366455,
149
  "learning_rate": 4.948824853131237e-06,
150
+ "loss": 1.7807,
151
+ "mean_token_accuracy": 0.5546666383743286,
152
+ "num_tokens": 28953.0,
153
  "step": 16
154
  },
155
  {
156
  "epoch": 0.06854838709677419,
157
+ "grad_norm": 6.979730606079102,
158
  "learning_rate": 4.942253564296217e-06,
159
+ "loss": 1.5861,
160
+ "mean_token_accuracy": 0.5993921160697937,
161
+ "num_tokens": 30600.0,
162
  "step": 17
163
  },
164
  {
165
  "epoch": 0.07258064516129033,
166
+ "grad_norm": 6.971430778503418,
167
  "learning_rate": 4.935290369988468e-06,
168
+ "loss": 1.7106,
169
+ "mean_token_accuracy": 0.5747259259223938,
170
+ "num_tokens": 32335.0,
171
  "step": 18
172
  },
173
  {
174
  "epoch": 0.07661290322580645,
175
+ "grad_norm": 7.085752487182617,
176
  "learning_rate": 4.927936387583348e-06,
177
+ "loss": 1.7075,
178
+ "mean_token_accuracy": 0.5898089408874512,
179
+ "num_tokens": 33907.0,
180
  "step": 19
181
  },
182
  {
183
  "epoch": 0.08064516129032258,
184
+ "grad_norm": 6.59229040145874,
185
  "learning_rate": 4.920192797165511e-06,
186
+ "loss": 1.7039,
187
+ "mean_token_accuracy": 0.569328784942627,
188
+ "num_tokens": 35950.0,
189
  "step": 20
190
  },
191
  {
192
  "epoch": 0.0846774193548387,
193
+ "grad_norm": 6.934664249420166,
194
  "learning_rate": 4.912060841339536e-06,
195
+ "loss": 1.582,
196
+ "mean_token_accuracy": 0.6080306172370911,
197
+ "num_tokens": 37521.0,
198
  "step": 21
199
  },
200
  {
201
  "epoch": 0.08870967741935484,
202
+ "grad_norm": 7.222076416015625,
203
  "learning_rate": 4.9035418250305314e-06,
204
+ "loss": 1.6762,
205
+ "mean_token_accuracy": 0.5719424486160278,
206
+ "num_tokens": 39191.0,
207
  "step": 22
208
  },
209
  {
210
  "epoch": 0.09274193548387097,
211
+ "grad_norm": 6.332627773284912,
212
  "learning_rate": 4.894637115274728e-06,
213
+ "loss": 1.6376,
214
+ "mean_token_accuracy": 0.5898959636688232,
215
+ "num_tokens": 41212.0,
216
  "step": 23
217
  },
218
  {
219
  "epoch": 0.0967741935483871,
220
+ "grad_norm": 6.263752460479736,
221
  "learning_rate": 4.8853481410001225e-06,
222
+ "loss": 1.6816,
223
+ "mean_token_accuracy": 0.5732259750366211,
224
+ "num_tokens": 43201.0,
225
  "step": 24
226
  },
227
  {
228
  "epoch": 0.10080645161290322,
229
+ "grad_norm": 6.8030476570129395,
230
  "learning_rate": 4.875676392797169e-06,
231
+ "loss": 1.5712,
232
+ "mean_token_accuracy": 0.6030839681625366,
233
+ "num_tokens": 44954.0,
234
  "step": 25
235
  },
236
  {
237
  "epoch": 0.10483870967741936,
238
+ "grad_norm": 6.477639198303223,
239
  "learning_rate": 4.865623422679593e-06,
240
+ "loss": 1.6535,
241
+ "mean_token_accuracy": 0.5837808847427368,
242
+ "num_tokens": 46818.0,
243
  "step": 26
244
  },
245
  {
246
  "epoch": 0.10887096774193548,
247
+ "grad_norm": 8.109735488891602,
248
  "learning_rate": 4.855190843835338e-06,
249
+ "loss": 1.5428,
250
+ "mean_token_accuracy": 0.60636305809021,
251
+ "num_tokens": 48423.0,
252
  "step": 27
253
  },
254
  {
255
  "epoch": 0.11290322580645161,
256
+ "grad_norm": 6.8654046058654785,
257
  "learning_rate": 4.844380330367701e-06,
258
+ "loss": 1.5027,
259
+ "mean_token_accuracy": 0.6173325777053833,
260
+ "num_tokens": 50202.0,
261
  "step": 28
262
  },
263
  {
264
  "epoch": 0.11693548387096774,
265
+ "grad_norm": 6.878509044647217,
266
  "learning_rate": 4.833193617026692e-06,
267
+ "loss": 1.6905,
268
+ "mean_token_accuracy": 0.5734870433807373,
269
+ "num_tokens": 51939.0,
270
  "step": 29
271
  },
272
  {
273
  "epoch": 0.12096774193548387,
274
+ "grad_norm": 6.781070709228516,
275
  "learning_rate": 4.821632498930656e-06,
276
+ "loss": 1.584,
277
+ "mean_token_accuracy": 0.5995055437088013,
278
+ "num_tokens": 53559.0,
279
  "step": 30
280
  },
281
  {
282
  "epoch": 0.125,
283
+ "grad_norm": 6.2921271324157715,
284
  "learning_rate": 4.809698831278217e-06,
285
+ "loss": 1.6781,
286
+ "mean_token_accuracy": 0.582112193107605,
287
+ "num_tokens": 55540.0,
288
  "step": 31
289
  },
290
  {
291
  "epoch": 0.12903225806451613,
292
+ "grad_norm": 6.02636194229126,
293
  "learning_rate": 4.797394529050577e-06,
294
+ "loss": 1.5655,
295
+ "mean_token_accuracy": 0.6006144285202026,
296
+ "num_tokens": 57495.0,
297
  "step": 32
298
  },
299
  {
300
  "epoch": 0.13306451612903225,
301
+ "grad_norm": 7.5229902267456055,
302
  "learning_rate": 4.784721566704217e-06,
303
+ "loss": 1.7261,
304
+ "mean_token_accuracy": 0.5760197639465332,
305
+ "num_tokens": 59115.0,
306
  "step": 33
307
  },
308
  {
309
  "epoch": 0.13709677419354838,
310
+ "grad_norm": 6.765624046325684,
311
  "learning_rate": 4.771681977854062e-06,
312
+ "loss": 1.625,
313
+ "mean_token_accuracy": 0.5822873115539551,
314
+ "num_tokens": 60551.0,
315
  "step": 34
316
  },
317
  {
318
  "epoch": 0.14112903225806453,
319
+ "grad_norm": 6.864407062530518,
320
  "learning_rate": 4.75827785494715e-06,
321
+ "loss": 1.6879,
322
+ "mean_token_accuracy": 0.5782891511917114,
323
+ "num_tokens": 62552.0,
324
  "step": 35
325
  },
326
  {
327
  "epoch": 0.14516129032258066,
328
+ "grad_norm": 6.733999252319336,
329
  "learning_rate": 4.744511348926855e-06,
330
+ "loss": 1.5214,
331
+ "mean_token_accuracy": 0.6006430983543396,
332
+ "num_tokens": 64109.0,
333
  "step": 36
334
  },
335
  {
336
  "epoch": 0.14919354838709678,
337
+ "grad_norm": 5.977267265319824,
338
  "learning_rate": 4.730384668887731e-06,
339
+ "loss": 1.6555,
340
+ "mean_token_accuracy": 0.575398862361908,
341
+ "num_tokens": 66054.0,
342
  "step": 37
343
  },
344
  {
345
  "epoch": 0.1532258064516129,
346
+ "grad_norm": 6.319035053253174,
347
  "learning_rate": 4.715900081721021e-06,
348
+ "loss": 1.657,
349
+ "mean_token_accuracy": 0.5774725079536438,
350
+ "num_tokens": 67876.0,
351
  "step": 38
352
  },
353
  {
354
  "epoch": 0.15725806451612903,
355
+ "grad_norm": 6.248798847198486,
356
  "learning_rate": 4.7010599117508936e-06,
357
+ "loss": 1.5393,
358
+ "mean_token_accuracy": 0.5967366099357605,
359
+ "num_tokens": 69594.0,
360
  "step": 39
361
  },
362
  {
363
  "epoch": 0.16129032258064516,
364
+ "grad_norm": 6.949793815612793,
365
  "learning_rate": 4.685866540361456e-06,
366
+ "loss": 1.6971,
367
+ "mean_token_accuracy": 0.5807833671569824,
368
+ "num_tokens": 71230.0,
369
  "step": 40
370
  },
371
  {
372
  "epoch": 0.16532258064516128,
373
+ "grad_norm": 7.044243812561035,
374
  "learning_rate": 4.670322405614621e-06,
375
+ "loss": 1.5476,
376
+ "mean_token_accuracy": 0.6068921685218811,
377
+ "num_tokens": 72799.0,
378
  "step": 41
379
  },
380
  {
381
  "epoch": 0.1693548387096774,
382
+ "grad_norm": 6.1895670890808105,
383
  "learning_rate": 4.654430001858874e-06,
384
+ "loss": 1.5685,
385
+ "mean_token_accuracy": 0.5987124443054199,
386
+ "num_tokens": 74665.0,
387
  "step": 42
388
  },
389
  {
390
  "epoch": 0.17338709677419356,
391
+ "grad_norm": 6.357958793640137,
392
  "learning_rate": 4.638191879329005e-06,
393
+ "loss": 1.6369,
394
+ "mean_token_accuracy": 0.6065057516098022,
395
+ "num_tokens": 76573.0,
396
  "step": 43
397
  },
398
  {
399
  "epoch": 0.1774193548387097,
400
+ "grad_norm": 6.44213342666626,
401
  "learning_rate": 4.621610643736878e-06,
402
+ "loss": 1.477,
403
+ "mean_token_accuracy": 0.6165943741798401,
404
+ "num_tokens": 78419.0,
405
  "step": 44
406
  },
407
  {
408
  "epoch": 0.1814516129032258,
409
+ "grad_norm": 11.772004127502441,
410
  "learning_rate": 4.6046889558532925e-06,
411
+ "loss": 1.2458,
412
+ "mean_token_accuracy": 0.6105882525444031,
413
+ "num_tokens": 79271.0,
414
  "step": 45
415
  },
416
  {
417
  "epoch": 0.18548387096774194,
418
+ "grad_norm": 5.835907936096191,
419
  "learning_rate": 4.587429531081019e-06,
420
+ "loss": 1.5937,
421
+ "mean_token_accuracy": 0.5787923336029053,
422
+ "num_tokens": 81310.0,
423
  "step": 46
424
  },
425
  {
426
  "epoch": 0.18951612903225806,
427
+ "grad_norm": 6.897541046142578,
428
  "learning_rate": 4.569835139019054e-06,
429
+ "loss": 1.6269,
430
+ "mean_token_accuracy": 0.5900123119354248,
431
+ "num_tokens": 82934.0,
432
  "step": 47
433
  },
434
  {
435
  "epoch": 0.1935483870967742,
436
+ "grad_norm": 6.150733947753906,
437
  "learning_rate": 4.551908603018191e-06,
438
+ "loss": 1.515,
439
+ "mean_token_accuracy": 0.5997865796089172,
440
+ "num_tokens": 84810.0,
441
  "step": 48
442
  },
443
  {
444
  "epoch": 0.1975806451612903,
445
+ "grad_norm": 5.933169841766357,
446
  "learning_rate": 4.53365279972796e-06,
447
+ "loss": 1.6289,
448
+ "mean_token_accuracy": 0.5821572542190552,
449
+ "num_tokens": 86796.0,
450
  "step": 49
451
  },
452
  {
453
  "epoch": 0.20161290322580644,
454
+ "grad_norm": 5.93269681930542,
455
  "learning_rate": 4.515070658635013e-06,
456
+ "loss": 1.6608,
457
+ "mean_token_accuracy": 0.5925726294517517,
458
+ "num_tokens": 88656.0,
459
  "step": 50
460
  },
461
  {
462
  "epoch": 0.2056451612903226,
463
+ "grad_norm": 6.56980037689209,
464
  "learning_rate": 4.4961651615930344e-06,
465
+ "loss": 1.6368,
466
+ "mean_token_accuracy": 0.5721311569213867,
467
+ "num_tokens": 90488.0,
468
  "step": 51
469
  },
470
  {
471
  "epoch": 0.20967741935483872,
472
+ "grad_norm": 5.973373889923096,
473
  "learning_rate": 4.476939342344246e-06,
474
+ "loss": 1.5851,
475
+ "mean_token_accuracy": 0.586832582950592,
476
+ "num_tokens": 92419.0,
477
  "step": 52
478
  },
479
  {
480
  "epoch": 0.21370967741935484,
481
+ "grad_norm": 6.0411553382873535,
482
  "learning_rate": 4.457396286032589e-06,
483
+ "loss": 1.6096,
484
+ "mean_token_accuracy": 0.5805962681770325,
485
+ "num_tokens": 94400.0,
486
  "step": 53
487
  },
488
  {
489
  "epoch": 0.21774193548387097,
490
+ "grad_norm": 6.7846221923828125,
491
  "learning_rate": 4.437539128708647e-06,
492
+ "loss": 1.7701,
493
+ "mean_token_accuracy": 0.5684491991996765,
494
+ "num_tokens": 96272.0,
495
  "step": 54
496
  },
497
  {
498
  "epoch": 0.2217741935483871,
499
+ "grad_norm": 6.365209579467773,
500
  "learning_rate": 4.417371056826417e-06,
501
+ "loss": 1.5444,
502
+ "mean_token_accuracy": 0.6007281541824341,
503
+ "num_tokens": 97922.0,
504
  "step": 55
505
  },
506
  {
507
  "epoch": 0.22580645161290322,
508
+ "grad_norm": 6.159636974334717,
509
  "learning_rate": 4.396895306731978e-06,
510
+ "loss": 1.6374,
511
+ "mean_token_accuracy": 0.5812357068061829,
512
+ "num_tokens": 99672.0,
513
  "step": 56
514
  },
515
  {
516
  "epoch": 0.22983870967741934,
517
+ "grad_norm": 6.037807464599609,
518
  "learning_rate": 4.376115164144157e-06,
519
+ "loss": 1.5427,
520
+ "mean_token_accuracy": 0.6033584475517273,
521
+ "num_tokens": 101401.0,
522
  "step": 57
523
  },
524
  {
525
  "epoch": 0.23387096774193547,
526
+ "grad_norm": 6.224438190460205,
527
  "learning_rate": 4.355033963627277e-06,
528
+ "loss": 1.5835,
529
+ "mean_token_accuracy": 0.5897436141967773,
530
+ "num_tokens": 103158.0,
531
  "step": 58
532
  },
533
  {
534
  "epoch": 0.23790322580645162,
535
+ "grad_norm": 5.788248062133789,
536
  "learning_rate": 4.333655088056065e-06,
537
+ "loss": 1.5512,
538
+ "mean_token_accuracy": 0.5878172516822815,
539
+ "num_tokens": 105130.0,
540
  "step": 59
541
  },
542
  {
543
  "epoch": 0.24193548387096775,
544
+ "grad_norm": 6.5106024742126465,
545
  "learning_rate": 4.3119819680728e-06,
546
+ "loss": 1.6323,
547
+ "mean_token_accuracy": 0.5971143245697021,
548
+ "num_tokens": 106934.0,
549
  "step": 60
550
  },
551
  {
552
  "epoch": 0.24596774193548387,
553
+ "grad_norm": 5.9823737144470215,
554
  "learning_rate": 4.290018081536807e-06,
555
+ "loss": 1.5736,
556
+ "mean_token_accuracy": 0.5935134887695312,
557
+ "num_tokens": 108786.0,
558
  "step": 61
559
  },
560
  {
561
  "epoch": 0.25,
562
+ "grad_norm": 6.844268798828125,
563
  "learning_rate": 4.267766952966369e-06,
564
+ "loss": 1.6744,
565
+ "mean_token_accuracy": 0.5678654313087463,
566
+ "num_tokens": 110512.0,
567
  "step": 62
568
  },
569
  {
570
  "epoch": 0.2540322580645161,
571
+ "grad_norm": 6.168908596038818,
572
  "learning_rate": 4.245232152973148e-06,
573
+ "loss": 1.6208,
574
+ "mean_token_accuracy": 0.5953115224838257,
575
+ "num_tokens": 112135.0,
576
  "step": 63
577
  },
578
  {
579
  "epoch": 0.25806451612903225,
580
+ "grad_norm": 6.162073135375977,
581
  "learning_rate": 4.222417297689217e-06,
582
+ "loss": 1.6455,
583
+ "mean_token_accuracy": 0.6012843251228333,
584
+ "num_tokens": 113850.0,
585
  "step": 64
586
  },
587
  {
588
  "epoch": 0.2620967741935484,
589
+ "grad_norm": 5.909740447998047,
590
  "learning_rate": 4.199326048186783e-06,
591
+ "loss": 1.6408,
592
+ "mean_token_accuracy": 0.5889984369277954,
593
+ "num_tokens": 115779.0,
594
  "step": 65
595
  },
596
  {
597
  "epoch": 0.2661290322580645,
598
+ "grad_norm": 6.424287796020508,
599
  "learning_rate": 4.175962109890697e-06,
600
+ "loss": 1.4827,
601
+ "mean_token_accuracy": 0.6072772741317749,
602
+ "num_tokens": 117375.0,
603
  "step": 66
604
  },
605
  {
606
  "epoch": 0.2701612903225806,
607
+ "grad_norm": 7.354248523712158,
608
  "learning_rate": 4.152329231983852e-06,
609
+ "loss": 1.5259,
610
+ "mean_token_accuracy": 0.6074517369270325,
611
+ "num_tokens": 118880.0,
612
  "step": 67
613
  },
614
  {
615
  "epoch": 0.27419354838709675,
616
+ "grad_norm": 6.386468887329102,
617
  "learning_rate": 4.128431206805556e-06,
618
+ "loss": 1.4579,
619
+ "mean_token_accuracy": 0.6266149878501892,
620
+ "num_tokens": 120430.0,
621
  "step": 68
622
  },
623
  {
624
  "epoch": 0.2782258064516129,
625
+ "grad_norm": 5.8017168045043945,
626
  "learning_rate": 4.104271869242975e-06,
627
+ "loss": 1.5997,
628
+ "mean_token_accuracy": 0.5914376378059387,
629
+ "num_tokens": 122324.0,
630
  "step": 69
631
  },
632
  {
633
  "epoch": 0.28225806451612906,
634
+ "grad_norm": 6.298964500427246,
635
  "learning_rate": 4.07985509611576e-06,
636
+ "loss": 1.5175,
637
+ "mean_token_accuracy": 0.6027742624282837,
638
+ "num_tokens": 123912.0,
639
  "step": 70
640
  },
641
  {
642
  "epoch": 0.2862903225806452,
643
+ "grad_norm": 5.610908031463623,
644
  "learning_rate": 4.0551848055539345e-06,
645
+ "loss": 1.4782,
646
+ "mean_token_accuracy": 0.6146214008331299,
647
+ "num_tokens": 125829.0,
648
  "step": 71
649
  },
650
  {
651
  "epoch": 0.2903225806451613,
652
+ "grad_norm": 5.9410014152526855,
653
  "learning_rate": 4.030264956369158e-06,
654
+ "loss": 1.5216,
655
+ "mean_token_accuracy": 0.6010044813156128,
656
+ "num_tokens": 127623.0,
657
  "step": 72
658
  },
659
  {
660
  "epoch": 0.29435483870967744,
661
+ "grad_norm": 6.027655601501465,
662
  "learning_rate": 4.005099547419458e-06,
663
+ "loss": 1.5922,
664
+ "mean_token_accuracy": 0.604594349861145,
665
+ "num_tokens": 129671.0,
666
  "step": 73
667
  },
668
  {
669
  "epoch": 0.29838709677419356,
670
+ "grad_norm": 6.194504261016846,
671
  "learning_rate": 3.979692616967543e-06,
672
+ "loss": 1.6518,
673
+ "mean_token_accuracy": 0.5671883225440979,
674
+ "num_tokens": 131526.0,
675
  "step": 74
676
  },
677
  {
678
  "epoch": 0.3024193548387097,
679
+ "grad_norm": 6.842575550079346,
680
  "learning_rate": 3.9540482420327845e-06,
681
+ "loss": 1.7604,
682
+ "mean_token_accuracy": 0.5674124956130981,
683
+ "num_tokens": 133271.0,
684
  "step": 75
685
  },
686
  {
687
  "epoch": 0.3064516129032258,
688
+ "grad_norm": 7.071008682250977,
689
  "learning_rate": 3.9281705377369814e-06,
690
+ "loss": 1.5444,
691
+ "mean_token_accuracy": 0.592570424079895,
692
+ "num_tokens": 134942.0,
693
  "step": 76
694
  },
695
  {
696
  "epoch": 0.31048387096774194,
697
+ "grad_norm": 6.561822414398193,
698
  "learning_rate": 3.902063656644012e-06,
699
+ "loss": 1.6206,
700
+ "mean_token_accuracy": 0.5793358087539673,
701
+ "num_tokens": 136841.0,
702
  "step": 77
703
  },
704
  {
705
  "epoch": 0.31451612903225806,
706
+ "grad_norm": 6.077663421630859,
707
  "learning_rate": 3.875731788093478e-06,
708
+ "loss": 1.5986,
709
+ "mean_token_accuracy": 0.591160237789154,
710
+ "num_tokens": 138653.0,
711
  "step": 78
712
  },
713
  {
714
  "epoch": 0.3185483870967742,
715
+ "grad_norm": 6.567859649658203,
716
  "learning_rate": 3.84917915752845e-06,
717
+ "loss": 1.5392,
718
+ "mean_token_accuracy": 0.5913461446762085,
719
+ "num_tokens": 140111.0,
720
  "step": 79
721
  },
722
  {
723
  "epoch": 0.3225806451612903,
724
+ "grad_norm": 5.775151252746582,
725
  "learning_rate": 3.8224100258174066e-06,
726
+ "loss": 1.5684,
727
+ "mean_token_accuracy": 0.5914602279663086,
728
+ "num_tokens": 142010.0,
729
  "step": 80
730
  },
731
  {
732
  "epoch": 0.32661290322580644,
733
+ "grad_norm": 6.075804233551025,
734
  "learning_rate": 3.795428688570505e-06,
735
+ "loss": 1.5588,
736
+ "mean_token_accuracy": 0.5793555974960327,
737
+ "num_tokens": 143688.0,
738
  "step": 81
739
  },
740
  {
741
  "epoch": 0.33064516129032256,
742
+ "grad_norm": 6.182637691497803,
743
  "learning_rate": 3.7682394754502687e-06,
744
+ "loss": 1.6354,
745
+ "mean_token_accuracy": 0.5826601982116699,
746
+ "num_tokens": 145547.0,
747
  "step": 82
748
  },
749
  {
750
  "epoch": 0.3346774193548387,
751
+ "grad_norm": 6.141689777374268,
752
  "learning_rate": 3.7408467494768104e-06,
753
+ "loss": 1.5793,
754
+ "mean_token_accuracy": 0.5980230569839478,
755
+ "num_tokens": 147370.0,
756
  "step": 83
757
  },
758
  {
759
  "epoch": 0.3387096774193548,
760
+ "grad_norm": 5.8948140144348145,
761
  "learning_rate": 3.7132549063277033e-06,
762
+ "loss": 1.6276,
763
+ "mean_token_accuracy": 0.5829015374183655,
764
+ "num_tokens": 149302.0,
765
  "step": 84
766
  },
767
  {
768
  "epoch": 0.34274193548387094,
769
+ "grad_norm": 6.238958358764648,
770
  "learning_rate": 3.685468373632613e-06,
771
+ "loss": 1.6904,
772
+ "mean_token_accuracy": 0.5823293328285217,
773
+ "num_tokens": 151047.0,
774
  "step": 85
775
  },
776
  {
777
  "epoch": 0.3467741935483871,
778
+ "grad_norm": 5.967896461486816,
779
  "learning_rate": 3.657491610262802e-06,
780
+ "loss": 1.5618,
781
+ "mean_token_accuracy": 0.5967926979064941,
782
+ "num_tokens": 152795.0,
783
  "step": 86
784
  },
785
  {
786
  "epoch": 0.35080645161290325,
787
+ "grad_norm": 5.502566337585449,
788
  "learning_rate": 3.6293291056156178e-06,
789
+ "loss": 1.5015,
790
+ "mean_token_accuracy": 0.5971802473068237,
791
+ "num_tokens": 154783.0,
792
  "step": 87
793
  },
794
  {
795
  "epoch": 0.3548387096774194,
796
+ "grad_norm": 5.598520755767822,
797
  "learning_rate": 3.600985378894086e-06,
798
+ "loss": 1.6132,
799
+ "mean_token_accuracy": 0.5847665667533875,
800
+ "num_tokens": 156820.0,
801
  "step": 88
802
  },
803
  {
804
  "epoch": 0.3588709677419355,
805
+ "grad_norm": 5.761752605438232,
806
  "learning_rate": 3.572464978381719e-06,
807
+ "loss": 1.5378,
808
+ "mean_token_accuracy": 0.5998993515968323,
809
+ "num_tokens": 158809.0,
810
  "step": 89
811
  },
812
  {
813
  "epoch": 0.3629032258064516,
814
+ "grad_norm": 6.394772529602051,
815
  "learning_rate": 3.5437724807126583e-06,
816
+ "loss": 1.5489,
817
+ "mean_token_accuracy": 0.597762405872345,
818
+ "num_tokens": 160688.0,
819
  "step": 90
820
  },
821
  {
822
  "epoch": 0.36693548387096775,
823
+ "grad_norm": 5.892634868621826,
824
  "learning_rate": 3.514912490137268e-06,
825
+ "loss": 1.3987,
826
+ "mean_token_accuracy": 0.634549617767334,
827
+ "num_tokens": 162444.0,
828
  "step": 91
829
  },
830
  {
831
  "epoch": 0.3709677419354839,
832
+ "grad_norm": 6.148153305053711,
833
  "learning_rate": 3.4858896377832966e-06,
834
+ "loss": 1.5568,
835
+ "mean_token_accuracy": 0.5997700095176697,
836
+ "num_tokens": 164185.0,
837
  "step": 92
838
  },
839
  {
840
  "epoch": 0.375,
841
+ "grad_norm": 5.827703952789307,
842
  "learning_rate": 3.4567085809127247e-06,
843
+ "loss": 1.6176,
844
+ "mean_token_accuracy": 0.6020779013633728,
845
+ "num_tokens": 166112.0,
846
  "step": 93
847
  },
848
  {
849
  "epoch": 0.3790322580645161,
850
+ "grad_norm": 5.979760646820068,
851
  "learning_rate": 3.42737400217442e-06,
852
+ "loss": 1.6762,
853
+ "mean_token_accuracy": 0.574064314365387,
854
+ "num_tokens": 168011.0,
855
  "step": 94
856
  },
857
  {
858
  "epoch": 0.38306451612903225,
859
+ "grad_norm": 6.755500793457031,
860
  "learning_rate": 3.397890608852718e-06,
861
+ "loss": 1.6051,
862
+ "mean_token_accuracy": 0.5879656076431274,
863
+ "num_tokens": 169758.0,
864
  "step": 95
865
  },
866
  {
867
  "epoch": 0.3870967741935484,
868
+ "grad_norm": 6.7287068367004395,
869
  "learning_rate": 3.3682631321120507e-06,
870
+ "loss": 1.5288,
871
+ "mean_token_accuracy": 0.6041095852851868,
872
+ "num_tokens": 171220.0,
873
  "step": 96
874
  },
875
  {
876
  "epoch": 0.3911290322580645,
877
+ "grad_norm": 6.124728679656982,
878
  "learning_rate": 3.3384963262377434e-06,
879
+ "loss": 1.5012,
880
+ "mean_token_accuracy": 0.5954991579055786,
881
+ "num_tokens": 172955.0,
882
  "step": 97
883
  },
884
  {
885
  "epoch": 0.3951612903225806,
886
+ "grad_norm": 5.964875221252441,
887
  "learning_rate": 3.3085949678730953e-06,
888
+ "loss": 1.4252,
889
+ "mean_token_accuracy": 0.6224677562713623,
890
+ "num_tokens": 174586.0,
891
  "step": 98
892
  },
893
  {
894
  "epoch": 0.39919354838709675,
895
+ "grad_norm": 6.314966678619385,
896
  "learning_rate": 3.278563855252885e-06,
897
+ "loss": 1.4519,
898
+ "mean_token_accuracy": 0.630832850933075,
899
+ "num_tokens": 176281.0,
900
  "step": 99
901
  },
902
  {
903
  "epoch": 0.4032258064516129,
904
+ "grad_norm": 6.121771812438965,
905
  "learning_rate": 3.248407807433396e-06,
906
+ "loss": 1.541,
907
+ "mean_token_accuracy": 0.5919907093048096,
908
+ "num_tokens": 178006.0,
909
  "step": 100
910
  },
911
  {
912
  "epoch": 0.40725806451612906,
913
+ "grad_norm": 6.246324062347412,
914
  "learning_rate": 3.2181316635191125e-06,
915
+ "loss": 1.4925,
916
+ "mean_token_accuracy": 0.6121912598609924,
917
+ "num_tokens": 179911.0,
918
  "step": 101
919
  },
920
  {
921
  "epoch": 0.4112903225806452,
922
+ "grad_norm": 5.7846221923828125,
923
  "learning_rate": 3.1877402818861954e-06,
924
+ "loss": 1.5096,
925
+ "mean_token_accuracy": 0.6097561120986938,
926
+ "num_tokens": 181799.0,
927
  "step": 102
928
  },
929
  {
930
  "epoch": 0.4153225806451613,
931
+ "grad_norm": 5.7426252365112305,
932
  "learning_rate": 3.157238539402862e-06,
933
+ "loss": 1.4982,
934
+ "mean_token_accuracy": 0.5930666923522949,
935
+ "num_tokens": 183676.0,
936
  "step": 103
937
  },
938
  {
939
  "epoch": 0.41935483870967744,
940
+ "grad_norm": 6.007601261138916,
941
  "learning_rate": 3.1266313306468018e-06,
942
+ "loss": 1.515,
943
+ "mean_token_accuracy": 0.6020166277885437,
944
+ "num_tokens": 185364.0,
945
  "step": 104
946
  },
947
  {
948
  "epoch": 0.42338709677419356,
949
+ "grad_norm": 6.339498519897461,
950
  "learning_rate": 3.095923567119748e-06,
951
+ "loss": 1.5713,
952
+ "mean_token_accuracy": 0.5824423432350159,
953
+ "num_tokens": 187143.0,
954
  "step": 105
955
  },
956
  {
957
  "epoch": 0.4274193548387097,
958
+ "grad_norm": 6.260862827301025,
959
  "learning_rate": 3.0651201764593375e-06,
960
+ "loss": 1.6099,
961
+ "mean_token_accuracy": 0.5965318083763123,
962
+ "num_tokens": 188875.0,
963
  "step": 106
964
  },
965
  {
966
  "epoch": 0.4314516129032258,
967
+ "grad_norm": 5.941509246826172,
968
  "learning_rate": 3.034226101648377e-06,
969
+ "loss": 1.6252,
970
+ "mean_token_accuracy": 0.586152195930481,
971
+ "num_tokens": 190769.0,
972
  "step": 107
973
  },
974
  {
975
  "epoch": 0.43548387096774194,
976
+ "grad_norm": 5.4027509689331055,
977
  "learning_rate": 3.0032463002216504e-06,
978
+ "loss": 1.3669,
979
+ "mean_token_accuracy": 0.6161772012710571,
980
+ "num_tokens": 192712.0,
981
  "step": 108
982
  },
983
  {
984
  "epoch": 0.43951612903225806,
985
+ "grad_norm": 5.507197856903076,
986
  "learning_rate": 2.972185743470386e-06,
987
+ "loss": 1.5409,
988
+ "mean_token_accuracy": 0.6028755307197571,
989
+ "num_tokens": 194731.0,
990
  "step": 109
991
  },
992
  {
993
  "epoch": 0.4435483870967742,
994
+ "grad_norm": 6.563897609710693,
995
  "learning_rate": 2.941049415644522e-06,
996
+ "loss": 1.5292,
997
+ "mean_token_accuracy": 0.6081171035766602,
998
+ "num_tokens": 196236.0,
999
  "step": 110
1000
  },
1001
  {
1002
  "epoch": 0.4475806451612903,
1003
+ "grad_norm": 6.108341693878174,
1004
  "learning_rate": 2.909842313152888e-06,
1005
+ "loss": 1.5694,
1006
+ "mean_token_accuracy": 0.5977421402931213,
1007
+ "num_tokens": 197921.0,
1008
  "step": 111
1009
  },
1010
  {
1011
  "epoch": 0.45161290322580644,
1012
+ "grad_norm": 5.4269256591796875,
1013
  "learning_rate": 2.878569443761442e-06,
1014
+ "loss": 1.4808,
1015
+ "mean_token_accuracy": 0.6034912467002869,
1016
+ "num_tokens": 199928.0,
1017
  "step": 112
1018
  },
1019
  {
1020
  "epoch": 0.45564516129032256,
1021
+ "grad_norm": 6.073188304901123,
1022
  "learning_rate": 2.847235825789673e-06,
1023
+ "loss": 1.5251,
1024
+ "mean_token_accuracy": 0.6071428656578064,
1025
+ "num_tokens": 201750.0,
1026
  "step": 113
1027
  },
1028
  {
1029
  "epoch": 0.4596774193548387,
1030
+ "grad_norm": 6.206169605255127,
1031
  "learning_rate": 2.8158464873053236e-06,
1032
+ "loss": 1.6275,
1033
+ "mean_token_accuracy": 0.5718509554862976,
1034
+ "num_tokens": 203443.0,
1035
  "step": 114
1036
  },
1037
  {
1038
  "epoch": 0.4637096774193548,
1039
+ "grad_norm": 6.5099616050720215,
1040
  "learning_rate": 2.784406465317538e-06,
1041
+ "loss": 1.4364,
1042
+ "mean_token_accuracy": 0.6256089210510254,
1043
+ "num_tokens": 204882.0,
1044
  "step": 115
1045
  },
1046
  {
1047
  "epoch": 0.46774193548387094,
1048
+ "grad_norm": 5.677121639251709,
1049
  "learning_rate": 2.752920804968581e-06,
1050
+ "loss": 1.515,
1051
+ "mean_token_accuracy": 0.5985877513885498,
1052
+ "num_tokens": 206725.0,
1053
  "step": 116
1054
  },
1055
  {
1056
  "epoch": 0.4717741935483871,
1057
+ "grad_norm": 5.99458646774292,
1058
  "learning_rate": 2.7213945587242507e-06,
1059
+ "loss": 1.6479,
1060
+ "mean_token_accuracy": 0.5737704634666443,
1061
+ "num_tokens": 208740.0,
1062
  "step": 117
1063
  },
1064
  {
1065
  "epoch": 0.47580645161290325,
1066
+ "grad_norm": 5.447024822235107,
1067
  "learning_rate": 2.689832785563116e-06,
1068
+ "loss": 1.5805,
1069
+ "mean_token_accuracy": 0.5884652733802795,
1070
+ "num_tokens": 210788.0,
1071
  "step": 118
1072
  },
1073
  {
1074
  "epoch": 0.4798387096774194,
1075
+ "grad_norm": 5.383421421051025,
1076
  "learning_rate": 2.658240550164704e-06,
1077
+ "loss": 1.5772,
1078
+ "mean_token_accuracy": 0.5977517366409302,
1079
+ "num_tokens": 212836.0,
1080
  "step": 119
1081
  },
1082
  {
1083
  "epoch": 0.4838709677419355,
1084
+ "grad_norm": 6.088338375091553,
1085
  "learning_rate": 2.626622922096782e-06,
1086
+ "loss": 1.565,
1087
+ "mean_token_accuracy": 0.5961331725120544,
1088
+ "num_tokens": 214700.0,
1089
  "step": 120
1090
  },
1091
  {
1092
  "epoch": 0.4879032258064516,
1093
+ "grad_norm": 5.788651466369629,
1094
  "learning_rate": 2.5949849750018486e-06,
1095
+ "loss": 1.3632,
1096
+ "mean_token_accuracy": 0.6446384191513062,
1097
+ "num_tokens": 216306.0,
1098
  "step": 121
1099
  },
1100
  {
1101
  "epoch": 0.49193548387096775,
1102
+ "grad_norm": 5.829148769378662,
1103
  "learning_rate": 2.56333178578297e-06,
1104
+ "loss": 1.6738,
1105
+ "mean_token_accuracy": 0.574331521987915,
1106
+ "num_tokens": 218178.0,
1107
  "step": 122
1108
  },
1109
  {
1110
  "epoch": 0.4959677419354839,
1111
+ "grad_norm": 5.958081245422363,
1112
  "learning_rate": 2.5316684337891005e-06,
1113
+ "loss": 1.3764,
1114
+ "mean_token_accuracy": 0.6442367434501648,
1115
+ "num_tokens": 219785.0,
1116
  "step": 123
1117
  },
1118
  {
1119
  "epoch": 0.5,
1120
+ "grad_norm": 6.186583995819092,
1121
  "learning_rate": 2.5e-06,
1122
+ "loss": 1.4931,
1123
+ "mean_token_accuracy": 0.6139564514160156,
1124
+ "num_tokens": 221349.0,
1125
  "step": 124
1126
  },
1127
  {
1128
  "epoch": 0.5040322580645161,
1129
+ "grad_norm": 5.803173542022705,
1130
  "learning_rate": 2.4683315662109003e-06,
1131
+ "loss": 1.5739,
1132
+ "mean_token_accuracy": 0.58859783411026,
1133
+ "num_tokens": 223298.0,
1134
  "step": 125
1135
  },
1136
  {
1137
  "epoch": 0.5080645161290323,
1138
+ "grad_norm": 5.776984214782715,
1139
  "learning_rate": 2.436668214217031e-06,
1140
+ "loss": 1.5603,
1141
+ "mean_token_accuracy": 0.5867098569869995,
1142
+ "num_tokens": 225151.0,
1143
  "step": 126
1144
  },
1145
  {
1146
  "epoch": 0.5120967741935484,
1147
+ "grad_norm": 5.5442094802856445,
1148
  "learning_rate": 2.4050150249981522e-06,
1149
+ "loss": 1.4753,
1150
+ "mean_token_accuracy": 0.6018254160881042,
1151
+ "num_tokens": 226906.0,
1152
  "step": 127
1153
  },
1154
  {
1155
  "epoch": 0.5161290322580645,
1156
+ "grad_norm": 5.492681503295898,
1157
  "learning_rate": 2.3733770779032185e-06,
1158
+ "loss": 1.6577,
1159
+ "mean_token_accuracy": 0.5733202695846558,
1160
+ "num_tokens": 228947.0,
1161
  "step": 128
1162
  },
1163
  {
1164
  "epoch": 0.5201612903225806,
1165
+ "grad_norm": 5.890382766723633,
1166
  "learning_rate": 2.341759449835297e-06,
1167
+ "loss": 1.5399,
1168
+ "mean_token_accuracy": 0.5982404947280884,
1169
+ "num_tokens": 230995.0,
1170
  "step": 129
1171
  },
1172
  {
1173
  "epoch": 0.5241935483870968,
1174
+ "grad_norm": 5.885677337646484,
1175
  "learning_rate": 2.310167214436885e-06,
1176
+ "loss": 1.5541,
1177
+ "mean_token_accuracy": 0.6021798253059387,
1178
+ "num_tokens": 232832.0,
1179
  "step": 130
1180
  },
1181
  {
1182
  "epoch": 0.5282258064516129,
1183
+ "grad_norm": 5.711495876312256,
1184
  "learning_rate": 2.27860544127575e-06,
1185
+ "loss": 1.5311,
1186
+ "mean_token_accuracy": 0.5873016119003296,
1187
+ "num_tokens": 234598.0,
1188
  "step": 131
1189
  },
1190
  {
1191
  "epoch": 0.532258064516129,
1192
+ "grad_norm": 5.489191055297852,
1193
  "learning_rate": 2.24707919503142e-06,
1194
+ "loss": 1.4668,
1195
+ "mean_token_accuracy": 0.6193058490753174,
1196
+ "num_tokens": 236444.0,
1197
  "step": 132
1198
  },
1199
  {
1200
  "epoch": 0.5362903225806451,
1201
+ "grad_norm": 5.6918110847473145,
1202
  "learning_rate": 2.2155935346824634e-06,
1203
+ "loss": 1.5273,
1204
+ "mean_token_accuracy": 0.6107238531112671,
1205
+ "num_tokens": 238311.0,
1206
  "step": 133
1207
  },
1208
  {
1209
  "epoch": 0.5403225806451613,
1210
+ "grad_norm": 6.0905938148498535,
1211
  "learning_rate": 2.1841535126946777e-06,
1212
+ "loss": 1.5814,
1213
+ "mean_token_accuracy": 0.6002208590507507,
1214
+ "num_tokens": 240124.0,
1215
  "step": 134
1216
  },
1217
  {
1218
  "epoch": 0.5443548387096774,
1219
+ "grad_norm": 5.952730178833008,
1220
  "learning_rate": 2.1527641742103282e-06,
1221
+ "loss": 1.4447,
1222
+ "mean_token_accuracy": 0.6165267825126648,
1223
+ "num_tokens": 241675.0,
1224
  "step": 135
1225
  },
1226
  {
1227
  "epoch": 0.5483870967741935,
1228
+ "grad_norm": 6.087653160095215,
1229
  "learning_rate": 2.1214305562385592e-06,
1230
+ "loss": 1.6968,
1231
+ "mean_token_accuracy": 0.5795266628265381,
1232
+ "num_tokens": 243494.0,
1233
  "step": 136
1234
  },
1235
  {
1236
  "epoch": 0.5524193548387096,
1237
+ "grad_norm": 5.839648246765137,
1238
  "learning_rate": 2.0901576868471125e-06,
1239
+ "loss": 1.7217,
1240
+ "mean_token_accuracy": 0.5630123019218445,
1241
+ "num_tokens": 245448.0,
1242
  "step": 137
1243
  },
1244
  {
1245
  "epoch": 0.5564516129032258,
1246
+ "grad_norm": 5.999613285064697,
1247
  "learning_rate": 2.05895058435548e-06,
1248
+ "loss": 1.3948,
1249
+ "mean_token_accuracy": 0.6161738038063049,
1250
+ "num_tokens": 247107.0,
1251
  "step": 138
1252
  },
1253
  {
1254
  "epoch": 0.5604838709677419,
1255
+ "grad_norm": 5.091090202331543,
1256
  "learning_rate": 2.0278142565296153e-06,
1257
+ "loss": 1.4922,
1258
+ "mean_token_accuracy": 0.607038140296936,
1259
+ "num_tokens": 249155.0,
1260
  "step": 139
1261
  },
1262
  {
1263
  "epoch": 0.5645161290322581,
1264
+ "grad_norm": 5.5148539543151855,
1265
  "learning_rate": 1.9967536997783495e-06,
1266
+ "loss": 1.5327,
1267
+ "mean_token_accuracy": 0.6058357954025269,
1268
+ "num_tokens": 251179.0,
1269
  "step": 140
1270
  },
1271
  {
1272
  "epoch": 0.5685483870967742,
1273
+ "grad_norm": 6.003233432769775,
1274
  "learning_rate": 1.9657738983516227e-06,
1275
+ "loss": 1.5269,
1276
+ "mean_token_accuracy": 0.592024564743042,
1277
+ "num_tokens": 253137.0,
1278
  "step": 141
1279
  },
1280
  {
1281
  "epoch": 0.5725806451612904,
1282
+ "grad_norm": 5.6866068840026855,
1283
  "learning_rate": 1.934879823540663e-06,
1284
+ "loss": 1.4626,
1285
+ "mean_token_accuracy": 0.6116965413093567,
1286
+ "num_tokens": 255037.0,
1287
  "step": 142
1288
  },
1289
  {
1290
  "epoch": 0.5766129032258065,
1291
+ "grad_norm": 5.541101455688477,
1292
  "learning_rate": 1.9040764328802523e-06,
1293
+ "loss": 1.3471,
1294
+ "mean_token_accuracy": 0.6364172101020813,
1295
+ "num_tokens": 256736.0,
1296
  "step": 143
1297
  },
1298
  {
1299
  "epoch": 0.5806451612903226,
1300
+ "grad_norm": 5.830052375793457,
1301
  "learning_rate": 1.8733686693531986e-06,
1302
+ "loss": 1.5507,
1303
+ "mean_token_accuracy": 0.5932571887969971,
1304
+ "num_tokens": 258577.0,
1305
  "step": 144
1306
  },
1307
  {
1308
  "epoch": 0.5846774193548387,
1309
+ "grad_norm": 6.131454944610596,
1310
  "learning_rate": 1.842761460597138e-06,
1311
+ "loss": 1.4502,
1312
+ "mean_token_accuracy": 0.633840024471283,
1313
+ "num_tokens": 260092.0,
1314
  "step": 145
1315
  },
1316
  {
1317
  "epoch": 0.5887096774193549,
1318
+ "grad_norm": 7.0200324058532715,
1319
  "learning_rate": 1.812259718113805e-06,
1320
+ "loss": 1.5706,
1321
+ "mean_token_accuracy": 0.5800653696060181,
1322
+ "num_tokens": 261930.0,
1323
  "step": 146
1324
  },
1325
  {
1326
  "epoch": 0.592741935483871,
1327
+ "grad_norm": 6.529550552368164,
1328
  "learning_rate": 1.7818683364808883e-06,
1329
+ "loss": 1.6779,
1330
+ "mean_token_accuracy": 0.5723379850387573,
1331
+ "num_tokens": 263660.0,
1332
  "step": 147
1333
  },
1334
  {
1335
  "epoch": 0.5967741935483871,
1336
+ "grad_norm": 6.201499938964844,
1337
  "learning_rate": 1.7515921925666053e-06,
1338
+ "loss": 1.5537,
1339
+ "mean_token_accuracy": 0.5979142785072327,
1340
+ "num_tokens": 265388.0,
1341
  "step": 148
1342
  },
1343
  {
1344
  "epoch": 0.6008064516129032,
1345
+ "grad_norm": 6.028550624847412,
1346
  "learning_rate": 1.7214361447471156e-06,
1347
+ "loss": 1.5809,
1348
+ "mean_token_accuracy": 0.5865436792373657,
1349
+ "num_tokens": 267233.0,
1350
  "step": 149
1351
  },
1352
  {
1353
  "epoch": 0.6048387096774194,
1354
+ "grad_norm": 5.58845853805542,
1355
  "learning_rate": 1.6914050321269049e-06,
1356
+ "loss": 1.4061,
1357
+ "mean_token_accuracy": 0.623369574546814,
1358
+ "num_tokens": 269075.0,
1359
  "step": 150
1360
  },
1361
  {
1362
  "epoch": 0.6088709677419355,
1363
+ "grad_norm": 6.210185527801514,
1364
  "learning_rate": 1.6615036737622574e-06,
1365
+ "loss": 1.6096,
1366
+ "mean_token_accuracy": 0.5861111283302307,
1367
+ "num_tokens": 270877.0,
1368
  "step": 151
1369
  },
1370
  {
1371
  "epoch": 0.6129032258064516,
1372
+ "grad_norm": 6.132157325744629,
1373
  "learning_rate": 1.6317368678879497e-06,
1374
+ "loss": 1.6248,
1375
+ "mean_token_accuracy": 0.5724177956581116,
1376
+ "num_tokens": 272612.0,
1377
  "step": 152
1378
  },
1379
  {
1380
  "epoch": 0.6169354838709677,
1381
+ "grad_norm": 6.025227069854736,
1382
  "learning_rate": 1.6021093911472825e-06,
1383
+ "loss": 1.5701,
1384
+ "mean_token_accuracy": 0.5928652286529541,
1385
+ "num_tokens": 274380.0,
1386
  "step": 153
1387
  },
1388
  {
1389
  "epoch": 0.6209677419354839,
1390
+ "grad_norm": 5.912363529205322,
1391
  "learning_rate": 1.572625997825581e-06,
1392
+ "loss": 1.4989,
1393
+ "mean_token_accuracy": 0.5981630086898804,
1394
+ "num_tokens": 276124.0,
1395
  "step": 154
1396
  },
1397
  {
1398
  "epoch": 0.625,
1399
+ "grad_norm": 5.779432773590088,
1400
  "learning_rate": 1.5432914190872757e-06,
1401
+ "loss": 1.4394,
1402
+ "mean_token_accuracy": 0.6316964030265808,
1403
+ "num_tokens": 277918.0,
1404
  "step": 155
1405
  },
1406
  {
1407
  "epoch": 0.6290322580645161,
1408
+ "grad_norm": 5.328639984130859,
1409
  "learning_rate": 1.5141103622167042e-06,
1410
+ "loss": 1.4466,
1411
+ "mean_token_accuracy": 0.6170212626457214,
1412
+ "num_tokens": 279800.0,
1413
  "step": 156
1414
  },
1415
  {
1416
  "epoch": 0.6330645161290323,
1417
+ "grad_norm": 6.32019567489624,
1418
  "learning_rate": 1.4850875098627326e-06,
1419
+ "loss": 1.5899,
1420
+ "mean_token_accuracy": 0.6091743111610413,
1421
+ "num_tokens": 281437.0,
1422
  "step": 157
1423
  },
1424
  {
1425
  "epoch": 0.6370967741935484,
1426
+ "grad_norm": 5.477099418640137,
1427
  "learning_rate": 1.456227519287343e-06,
1428
+ "loss": 1.5322,
1429
+ "mean_token_accuracy": 0.5986914038658142,
1430
+ "num_tokens": 283273.0,
1431
  "step": 158
1432
  },
1433
  {
1434
  "epoch": 0.6411290322580645,
1435
+ "grad_norm": 6.026065349578857,
1436
  "learning_rate": 1.4275350216182824e-06,
1437
+ "loss": 1.3874,
1438
+ "mean_token_accuracy": 0.6260971426963806,
1439
+ "num_tokens": 284984.0,
1440
  "step": 159
1441
  },
1442
  {
1443
  "epoch": 0.6451612903225806,
1444
+ "grad_norm": 5.370904922485352,
1445
  "learning_rate": 1.3990146211059141e-06,
1446
+ "loss": 1.5299,
1447
+ "mean_token_accuracy": 0.6030451059341431,
1448
+ "num_tokens": 286825.0,
1449
  "step": 160
1450
  },
1451
  {
1452
  "epoch": 0.6491935483870968,
1453
+ "grad_norm": 5.8816633224487305,
1454
  "learning_rate": 1.3706708943843822e-06,
1455
+ "loss": 1.4889,
1456
+ "mean_token_accuracy": 0.6019198298454285,
1457
+ "num_tokens": 288598.0,
1458
  "step": 161
1459
  },
1460
  {
1461
  "epoch": 0.6532258064516129,
1462
+ "grad_norm": 6.100244522094727,
1463
  "learning_rate": 1.3425083897371983e-06,
1464
+ "loss": 1.4759,
1465
+ "mean_token_accuracy": 0.6230722069740295,
1466
+ "num_tokens": 290221.0,
1467
  "step": 162
1468
  },
1469
  {
1470
  "epoch": 0.657258064516129,
1471
+ "grad_norm": 5.920684337615967,
1472
  "learning_rate": 1.3145316263673874e-06,
1473
+ "loss": 1.5585,
1474
+ "mean_token_accuracy": 0.5871710777282715,
1475
+ "num_tokens": 292047.0,
1476
  "step": 163
1477
  },
1478
  {
1479
  "epoch": 0.6612903225806451,
1480
+ "grad_norm": 6.059237003326416,
1481
  "learning_rate": 1.286745093672298e-06,
1482
+ "loss": 1.5276,
1483
+ "mean_token_accuracy": 0.6024376153945923,
1484
+ "num_tokens": 293772.0,
1485
  "step": 164
1486
  },
1487
  {
1488
  "epoch": 0.6653225806451613,
1489
+ "grad_norm": 5.7648701667785645,
1490
  "learning_rate": 1.2591532505231906e-06,
1491
+ "loss": 1.5058,
1492
+ "mean_token_accuracy": 0.5996025800704956,
1493
+ "num_tokens": 295787.0,
1494
  "step": 165
1495
  },
1496
  {
1497
  "epoch": 0.6693548387096774,
1498
+ "grad_norm": 5.578222274780273,
1499
  "learning_rate": 1.2317605245497324e-06,
1500
+ "loss": 1.5277,
1501
+ "mean_token_accuracy": 0.5919329524040222,
1502
+ "num_tokens": 297698.0,
1503
  "step": 166
1504
  },
1505
  {
1506
  "epoch": 0.6733870967741935,
1507
+ "grad_norm": 5.404395580291748,
1508
  "learning_rate": 1.204571311429496e-06,
1509
+ "loss": 1.4855,
1510
+ "mean_token_accuracy": 0.5967162847518921,
1511
+ "num_tokens": 299649.0,
1512
  "step": 167
1513
  },
1514
  {
1515
  "epoch": 0.6774193548387096,
1516
+ "grad_norm": 6.532902240753174,
1517
  "learning_rate": 1.1775899741825947e-06,
1518
+ "loss": 1.4942,
1519
+ "mean_token_accuracy": 0.5987738370895386,
1520
+ "num_tokens": 301119.0,
1521
  "step": 168
1522
  },
1523
  {
1524
  "epoch": 0.6814516129032258,
1525
+ "grad_norm": 6.508765697479248,
1526
  "learning_rate": 1.1508208424715511e-06,
1527
+ "loss": 1.5848,
1528
+ "mean_token_accuracy": 0.5865746736526489,
1529
+ "num_tokens": 302715.0,
1530
  "step": 169
1531
  },
1532
  {
1533
  "epoch": 0.6854838709677419,
1534
+ "grad_norm": 5.411791801452637,
1535
  "learning_rate": 1.1242682119065217e-06,
1536
+ "loss": 1.4479,
1537
+ "mean_token_accuracy": 0.6116002202033997,
1538
+ "num_tokens": 304648.0,
1539
  "step": 170
1540
  },
1541
  {
1542
  "epoch": 0.6895161290322581,
1543
+ "grad_norm": 6.0362548828125,
1544
  "learning_rate": 1.0979363433559892e-06,
1545
+ "loss": 1.5337,
1546
+ "mean_token_accuracy": 0.5946902632713318,
1547
+ "num_tokens": 306345.0,
1548
  "step": 171
1549
  },
1550
  {
1551
  "epoch": 0.6935483870967742,
1552
+ "grad_norm": 6.6355743408203125,
1553
  "learning_rate": 1.0718294622630188e-06,
1554
+ "loss": 1.5346,
1555
+ "mean_token_accuracy": 0.6140100359916687,
1556
+ "num_tokens": 307746.0,
1557
  "step": 172
1558
  },
1559
  {
1560
  "epoch": 0.6975806451612904,
1561
+ "grad_norm": 5.8390421867370605,
1562
  "learning_rate": 1.045951757967215e-06,
1563
+ "loss": 1.5117,
1564
+ "mean_token_accuracy": 0.6113694906234741,
1565
+ "num_tokens": 309683.0,
1566
  "step": 173
1567
  },
1568
  {
1569
  "epoch": 0.7016129032258065,
1570
+ "grad_norm": 6.209187030792236,
1571
  "learning_rate": 1.0203073830324566e-06,
1572
+ "loss": 1.4862,
1573
+ "mean_token_accuracy": 0.6215522885322571,
1574
+ "num_tokens": 311244.0,
1575
  "step": 174
1576
  },
1577
  {
1578
  "epoch": 0.7056451612903226,
1579
+ "grad_norm": 5.811971664428711,
1580
  "learning_rate": 9.949004525805423e-07,
1581
+ "loss": 1.4113,
1582
+ "mean_token_accuracy": 0.625806450843811,
1583
+ "num_tokens": 312796.0,
1584
  "step": 175
1585
  },
1586
  {
1587
  "epoch": 0.7096774193548387,
1588
+ "grad_norm": 6.348616600036621,
1589
  "learning_rate": 9.697350436308428e-07,
1590
+ "loss": 1.4407,
1591
+ "mean_token_accuracy": 0.614819347858429,
1592
+ "num_tokens": 314431.0,
1593
  "step": 176
1594
  },
1595
  {
1596
  "epoch": 0.7137096774193549,
1597
+ "grad_norm": 5.936633110046387,
1598
  "learning_rate": 9.448151944460657e-07,
1599
+ "loss": 1.6396,
1600
+ "mean_token_accuracy": 0.5756798386573792,
1601
+ "num_tokens": 316382.0,
1602
  "step": 177
1603
  },
1604
  {
1605
  "epoch": 0.717741935483871,
1606
+ "grad_norm": 5.856295585632324,
1607
  "learning_rate": 9.201449038842403e-07,
1608
+ "loss": 1.5693,
1609
+ "mean_token_accuracy": 0.6030020713806152,
1610
+ "num_tokens": 318316.0,
1611
  "step": 178
1612
  },
1613
  {
1614
  "epoch": 0.7217741935483871,
1615
+ "grad_norm": 6.206198215484619,
1616
  "learning_rate": 8.957281307570254e-07,
1617
+ "loss": 1.4272,
1618
+ "mean_token_accuracy": 0.6132633090019226,
1619
+ "num_tokens": 319841.0,
1620
  "step": 179
1621
  },
1622
  {
1623
  "epoch": 0.7258064516129032,
1624
+ "grad_norm": 5.620049476623535,
1625
  "learning_rate": 8.71568793194445e-07,
1626
+ "loss": 1.5291,
1627
+ "mean_token_accuracy": 0.6036797761917114,
1628
+ "num_tokens": 321854.0,
1629
  "step": 180
1630
  },
1631
  {
1632
  "epoch": 0.7298387096774194,
1633
+ "grad_norm": 5.8175177574157715,
1634
  "learning_rate": 8.476707680161486e-07,
1635
+ "loss": 1.4948,
1636
+ "mean_token_accuracy": 0.5953565239906311,
1637
+ "num_tokens": 323665.0,
1638
  "step": 181
1639
  },
1640
  {
1641
  "epoch": 0.7338709677419355,
1642
+ "grad_norm": 5.765270709991455,
1643
  "learning_rate": 8.240378901093035e-07,
1644
+ "loss": 1.5781,
1645
+ "mean_token_accuracy": 0.6022663116455078,
1646
+ "num_tokens": 325432.0,
1647
  "step": 182
1648
  },
1649
  {
1650
  "epoch": 0.7379032258064516,
1651
+ "grad_norm": 6.488076686859131,
1652
  "learning_rate": 8.006739518132179e-07,
1653
+ "loss": 1.5952,
1654
+ "mean_token_accuracy": 0.5865209698677063,
1655
+ "num_tokens": 327081.0,
1656
  "step": 183
1657
  },
1658
  {
1659
  "epoch": 0.7419354838709677,
1660
+ "grad_norm": 6.85787296295166,
1661
  "learning_rate": 7.775827023107835e-07,
1662
+ "loss": 1.5112,
1663
+ "mean_token_accuracy": 0.6159420013427734,
1664
+ "num_tokens": 328463.0,
1665
  "step": 184
1666
  },
1667
  {
1668
  "epoch": 0.7459677419354839,
1669
+ "grad_norm": 5.89924955368042,
1670
  "learning_rate": 7.547678470268526e-07,
1671
+ "loss": 1.4708,
1672
+ "mean_token_accuracy": 0.615646243095398,
1673
+ "num_tokens": 330229.0,
1674
  "step": 185
1675
  },
1676
  {
1677
  "epoch": 0.75,
1678
+ "grad_norm": 5.8790812492370605,
1679
  "learning_rate": 7.322330470336314e-07,
1680
+ "loss": 1.6306,
1681
+ "mean_token_accuracy": 0.5779411792755127,
1682
+ "num_tokens": 332271.0,
1683
  "step": 186
1684
  },
1685
  {
1686
  "epoch": 0.7540322580645161,
1687
+ "grad_norm": 5.292142391204834,
1688
  "learning_rate": 7.099819184631929e-07,
1689
+ "loss": 1.4507,
1690
+ "mean_token_accuracy": 0.6181613206863403,
1691
+ "num_tokens": 334046.0,
1692
  "step": 187
1693
  },
1694
  {
1695
  "epoch": 0.7580645161290323,
1696
+ "grad_norm": 5.520077228546143,
1697
  "learning_rate": 6.880180319272006e-07,
1698
+ "loss": 1.4124,
1699
+ "mean_token_accuracy": 0.6184003949165344,
1700
+ "num_tokens": 335961.0,
1701
  "step": 188
1702
  },
1703
  {
1704
  "epoch": 0.7620967741935484,
1705
+ "grad_norm": 5.447739124298096,
1706
  "learning_rate": 6.663449119439358e-07,
1707
+ "loss": 1.5101,
1708
+ "mean_token_accuracy": 0.6079999804496765,
1709
+ "num_tokens": 337838.0,
1710
  "step": 189
1711
  },
1712
  {
1713
  "epoch": 0.7661290322580645,
1714
+ "grad_norm": 5.440929889678955,
1715
  "learning_rate": 6.449660363727236e-07,
1716
+ "loss": 1.5131,
1717
+ "mean_token_accuracy": 0.5955055952072144,
1718
+ "num_tokens": 339709.0,
1719
  "step": 190
1720
  },
1721
  {
1722
  "epoch": 0.7701612903225806,
1723
+ "grad_norm": 6.182394504547119,
1724
  "learning_rate": 6.238848358558439e-07,
1725
+ "loss": 1.485,
1726
+ "mean_token_accuracy": 0.6035979986190796,
1727
+ "num_tokens": 341323.0,
1728
  "step": 191
1729
  },
1730
  {
1731
  "epoch": 0.7741935483870968,
1732
+ "grad_norm": 6.693000793457031,
1733
  "learning_rate": 6.031046932680229e-07,
1734
+ "loss": 1.4743,
1735
+ "mean_token_accuracy": 0.6190476417541504,
1736
+ "num_tokens": 342627.0,
1737
  "step": 192
1738
  },
1739
  {
1740
  "epoch": 0.7782258064516129,
1741
+ "grad_norm": 5.218486309051514,
1742
  "learning_rate": 5.826289431735832e-07,
1743
+ "loss": 1.4456,
1744
+ "mean_token_accuracy": 0.6149131655693054,
1745
+ "num_tokens": 344587.0,
1746
  "step": 193
1747
  },
1748
  {
1749
  "epoch": 0.782258064516129,
1750
+ "grad_norm": 6.235224723815918,
1751
  "learning_rate": 5.624608712913531e-07,
1752
+ "loss": 1.3816,
1753
+ "mean_token_accuracy": 0.6143410801887512,
1754
+ "num_tokens": 346137.0,
1755
  "step": 194
1756
  },
1757
  {
1758
  "epoch": 0.7862903225806451,
1759
+ "grad_norm": 5.2699761390686035,
1760
  "learning_rate": 5.426037139674117e-07,
1761
+ "loss": 1.4791,
1762
+ "mean_token_accuracy": 0.6150712966918945,
1763
+ "num_tokens": 348103.0,
1764
  "step": 195
1765
  },
1766
  {
1767
  "epoch": 0.7903225806451613,
1768
+ "grad_norm": 5.843770980834961,
1769
  "learning_rate": 5.23060657655754e-07,
1770
+ "loss": 1.5063,
1771
+ "mean_token_accuracy": 0.6077738404273987,
1772
+ "num_tokens": 350086.0,
1773
  "step": 196
1774
  },
1775
  {
1776
  "epoch": 0.7943548387096774,
1777
+ "grad_norm": 5.2697248458862305,
1778
  "learning_rate": 5.038348384069663e-07,
1779
+ "loss": 1.4611,
1780
+ "mean_token_accuracy": 0.6173697113990784,
1781
+ "num_tokens": 352103.0,
1782
  "step": 197
1783
  },
1784
  {
1785
  "epoch": 0.7983870967741935,
1786
+ "grad_norm": 5.984487056732178,
1787
  "learning_rate": 4.84929341364988e-07,
1788
+ "loss": 1.5012,
1789
+ "mean_token_accuracy": 0.6050000190734863,
1790
+ "num_tokens": 353905.0,
1791
  "step": 198
1792
  },
1793
  {
1794
  "epoch": 0.8024193548387096,
1795
+ "grad_norm": 5.947335243225098,
1796
  "learning_rate": 4.6634720027204093e-07,
1797
+ "loss": 1.4547,
1798
+ "mean_token_accuracy": 0.6180400848388672,
1799
+ "num_tokens": 355703.0,
1800
  "step": 199
1801
  },
1802
  {
1803
  "epoch": 0.8064516129032258,
1804
+ "grad_norm": 5.749611854553223,
1805
  "learning_rate": 4.480913969818099e-07,
1806
+ "loss": 1.5199,
1807
+ "mean_token_accuracy": 0.5952813029289246,
1808
+ "num_tokens": 357358.0,
1809
  "step": 200
1810
  },
1811
  {
1812
  "epoch": 0.8104838709677419,
1813
+ "grad_norm": 5.199901580810547,
1814
  "learning_rate": 4.3016486098094667e-07,
1815
+ "loss": 1.47,
1816
+ "mean_token_accuracy": 0.6163245439529419,
1817
+ "num_tokens": 359406.0,
1818
  "step": 201
1819
  },
1820
  {
1821
  "epoch": 0.8145161290322581,
1822
+ "grad_norm": 5.465948581695557,
1823
  "learning_rate": 4.125704689189819e-07,
1824
+ "loss": 1.5307,
1825
+ "mean_token_accuracy": 0.6006842851638794,
1826
+ "num_tokens": 361454.0,
1827
  "step": 202
1828
  },
1829
  {
1830
  "epoch": 0.8185483870967742,
1831
+ "grad_norm": 5.9000749588012695,
1832
  "learning_rate": 3.953110441467073e-07,
1833
+ "loss": 1.4202,
1834
+ "mean_token_accuracy": 0.6333949565887451,
1835
+ "num_tokens": 363079.0,
1836
  "step": 203
1837
  },
1838
  {
1839
  "epoch": 0.8225806451612904,
1840
+ "grad_norm": 5.953113555908203,
1841
  "learning_rate": 3.7838935626312246e-07,
1842
+ "loss": 1.4819,
1843
+ "mean_token_accuracy": 0.6164458990097046,
1844
+ "num_tokens": 364893.0,
1845
  "step": 204
1846
  },
1847
  {
1848
  "epoch": 0.8266129032258065,
1849
+ "grad_norm": 5.6844892501831055,
1850
  "learning_rate": 3.6180812067099477e-07,
1851
+ "loss": 1.4199,
1852
+ "mean_token_accuracy": 0.6286211013793945,
1853
+ "num_tokens": 366621.0,
1854
  "step": 205
1855
  },
1856
  {
1857
  "epoch": 0.8306451612903226,
1858
+ "grad_norm": 5.685678482055664,
1859
  "learning_rate": 3.455699981411259e-07,
1860
+ "loss": 1.5947,
1861
+ "mean_token_accuracy": 0.5794117450714111,
1862
+ "num_tokens": 368663.0,
1863
  "step": 206
1864
  },
1865
  {
1866
  "epoch": 0.8346774193548387,
1867
+ "grad_norm": 5.832515716552734,
1868
  "learning_rate": 3.296775943853789e-07,
1869
+ "loss": 1.419,
1870
+ "mean_token_accuracy": 0.6132450103759766,
1871
+ "num_tokens": 370175.0,
1872
  "step": 207
1873
  },
1874
  {
1875
  "epoch": 0.8387096774193549,
1876
+ "grad_norm": 5.6208720207214355,
1877
  "learning_rate": 3.141334596385448e-07,
1878
+ "loss": 1.4981,
1879
+ "mean_token_accuracy": 0.6034953594207764,
1880
+ "num_tokens": 372008.0,
1881
  "step": 208
1882
  },
1883
  {
1884
  "epoch": 0.842741935483871,
1885
+ "grad_norm": 5.669355869293213,
1886
  "learning_rate": 2.9894008824910726e-07,
1887
+ "loss": 1.5807,
1888
+ "mean_token_accuracy": 0.591269850730896,
1889
+ "num_tokens": 374026.0,
1890
  "step": 209
1891
  },
1892
  {
1893
  "epoch": 0.8467741935483871,
1894
+ "grad_norm": 5.443574905395508,
1895
  "learning_rate": 2.840999182789797e-07,
1896
+ "loss": 1.4487,
1897
+ "mean_token_accuracy": 0.6099706888198853,
1898
+ "num_tokens": 376074.0,
1899
  "step": 210
1900
  },
1901
  {
1902
  "epoch": 0.8508064516129032,
1903
+ "grad_norm": 5.724096298217773,
1904
  "learning_rate": 2.696153311122704e-07,
1905
+ "loss": 1.6509,
1906
+ "mean_token_accuracy": 0.5762162208557129,
1907
+ "num_tokens": 377926.0,
1908
  "step": 211
1909
  },
1910
  {
1911
  "epoch": 0.8548387096774194,
1912
+ "grad_norm": 6.077378749847412,
1913
  "learning_rate": 2.5548865107314606e-07,
1914
+ "loss": 1.461,
1915
+ "mean_token_accuracy": 0.6061684489250183,
1916
+ "num_tokens": 379614.0,
1917
  "step": 212
1918
  },
1919
  {
1920
  "epoch": 0.8588709677419355,
1921
+ "grad_norm": 5.7304229736328125,
1922
  "learning_rate": 2.4172214505285006e-07,
1923
+ "loss": 1.4963,
1924
+ "mean_token_accuracy": 0.6054667234420776,
1925
+ "num_tokens": 381555.0,
1926
  "step": 213
1927
  },
1928
  {
1929
  "epoch": 0.8629032258064516,
1930
+ "grad_norm": 5.604650020599365,
1931
  "learning_rate": 2.2831802214593774e-07,
1932
+ "loss": 1.6199,
1933
+ "mean_token_accuracy": 0.5726895332336426,
1934
+ "num_tokens": 383483.0,
1935
  "step": 214
1936
  },
1937
  {
1938
  "epoch": 0.8669354838709677,
1939
+ "grad_norm": 5.319303512573242,
1940
  "learning_rate": 2.1527843329578328e-07,
1941
+ "loss": 1.5539,
1942
+ "mean_token_accuracy": 0.597201406955719,
1943
+ "num_tokens": 385486.0,
1944
  "step": 215
1945
  },
1946
  {
1947
  "epoch": 0.8709677419354839,
1948
+ "grad_norm": 5.75916862487793,
1949
  "learning_rate": 2.026054709494235e-07,
1950
+ "loss": 1.498,
1951
+ "mean_token_accuracy": 0.6148062348365784,
1952
+ "num_tokens": 387217.0,
1953
  "step": 216
1954
  },
1955
  {
1956
  "epoch": 0.875,
1957
+ "grad_norm": 6.274486541748047,
1958
  "learning_rate": 1.9030116872178317e-07,
1959
+ "loss": 1.4691,
1960
+ "mean_token_accuracy": 0.6093660593032837,
1961
+ "num_tokens": 388970.0,
1962
  "step": 217
1963
  },
1964
  {
1965
  "epoch": 0.8790322580645161,
1966
+ "grad_norm": 5.682790756225586,
1967
  "learning_rate": 1.7836750106934475e-07,
1968
+ "loss": 1.4487,
1969
+ "mean_token_accuracy": 0.6198156476020813,
1970
+ "num_tokens": 390708.0,
1971
  "step": 218
1972
  },
1973
  {
1974
  "epoch": 0.8830645161290323,
1975
+ "grad_norm": 6.133249282836914,
1976
  "learning_rate": 1.6680638297330854e-07,
1977
+ "loss": 1.4905,
1978
+ "mean_token_accuracy": 0.5962441563606262,
1979
+ "num_tokens": 392414.0,
1980
  "step": 219
1981
  },
1982
  {
1983
  "epoch": 0.8870967741935484,
1984
+ "grad_norm": 6.2251386642456055,
1985
  "learning_rate": 1.5561966963229925e-07,
1986
+ "loss": 1.4176,
1987
+ "mean_token_accuracy": 0.6197279095649719,
1988
+ "num_tokens": 393886.0,
1989
  "step": 220
1990
  },
1991
  {
1992
  "epoch": 0.8911290322580645,
1993
+ "grad_norm": 5.665816307067871,
1994
  "learning_rate": 1.448091561646628e-07,
1995
+ "loss": 1.5031,
1996
+ "mean_token_accuracy": 0.6201764345169067,
1997
+ "num_tokens": 395702.0,
1998
  "step": 221
1999
  },
2000
  {
2001
  "epoch": 0.8951612903225806,
2002
+ "grad_norm": 6.105023384094238,
2003
  "learning_rate": 1.3437657732040783e-07,
2004
+ "loss": 1.3976,
2005
+ "mean_token_accuracy": 0.618686854839325,
2006
+ "num_tokens": 397288.0,
2007
  "step": 222
2008
  },
2009
  {
2010
  "epoch": 0.8991935483870968,
2011
+ "grad_norm": 5.846229553222656,
2012
  "learning_rate": 1.243236072028317e-07,
2013
+ "loss": 1.4722,
2014
+ "mean_token_accuracy": 0.6098484992980957,
2015
+ "num_tokens": 399138.0,
2016
  "step": 223
2017
  },
2018
  {
2019
  "epoch": 0.9032258064516129,
2020
+ "grad_norm": 5.863070964813232,
2021
  "learning_rate": 1.1465185899987797e-07,
2022
+ "loss": 1.6153,
2023
+ "mean_token_accuracy": 0.5908849835395813,
2024
+ "num_tokens": 401027.0,
2025
  "step": 224
2026
  },
2027
  {
2028
  "epoch": 0.907258064516129,
2029
+ "grad_norm": 5.625185966491699,
2030
  "learning_rate": 1.0536288472527162e-07,
2031
+ "loss": 1.6113,
2032
+ "mean_token_accuracy": 0.5997909903526306,
2033
+ "num_tokens": 402943.0,
2034
  "step": 225
2035
  },
2036
  {
2037
  "epoch": 0.9112903225806451,
2038
+ "grad_norm": 5.376096725463867,
2039
  "learning_rate": 9.645817496946902e-08,
2040
+ "loss": 1.4866,
2041
+ "mean_token_accuracy": 0.6109374761581421,
2042
+ "num_tokens": 404865.0,
2043
  "step": 226
2044
  },
2045
  {
2046
  "epoch": 0.9153225806451613,
2047
+ "grad_norm": 6.09489631652832,
2048
  "learning_rate": 8.79391586604636e-08,
2049
+ "loss": 1.6137,
2050
+ "mean_token_accuracy": 0.5798553228378296,
2051
+ "num_tokens": 406664.0,
2052
  "step": 227
2053
  },
2054
  {
2055
  "epoch": 0.9193548387096774,
2056
+ "grad_norm": 6.106083393096924,
2057
  "learning_rate": 7.980720283448957e-08,
2058
+ "loss": 1.7006,
2059
+ "mean_token_accuracy": 0.5664711594581604,
2060
+ "num_tokens": 408712.0,
2061
  "step": 228
2062
  },
2063
  {
2064
  "epoch": 0.9233870967741935,
2065
+ "grad_norm": 6.476028919219971,
2066
  "learning_rate": 7.206361241665266e-08,
2067
+ "loss": 1.5162,
2068
+ "mean_token_accuracy": 0.608540952205658,
2069
+ "num_tokens": 410119.0,
2070
  "step": 229
2071
  },
2072
  {
2073
  "epoch": 0.9274193548387096,
2074
+ "grad_norm": 5.52684211730957,
2075
  "learning_rate": 6.470963001153268e-08,
2076
+ "loss": 1.5155,
2077
+ "mean_token_accuracy": 0.6171218752861023,
2078
+ "num_tokens": 412025.0,
2079
  "step": 230
2080
  },
2081
  {
2082
  "epoch": 0.9314516129032258,
2083
+ "grad_norm": 5.9519782066345215,
2084
  "learning_rate": 5.774643570378296e-08,
2085
+ "loss": 1.3252,
2086
+ "mean_token_accuracy": 0.6445659399032593,
2087
+ "num_tokens": 413490.0,
2088
  "step": 231
2089
  },
2090
  {
2091
  "epoch": 0.9354838709677419,
2092
+ "grad_norm": 5.545828342437744,
2093
  "learning_rate": 5.117514686876379e-08,
2094
+ "loss": 1.4661,
2095
+ "mean_token_accuracy": 0.6093666553497314,
2096
+ "num_tokens": 415371.0,
2097
  "step": 232
2098
  },
2099
  {
2100
  "epoch": 0.9395161290322581,
2101
+ "grad_norm": 5.3752617835998535,
2102
  "learning_rate": 4.4996817993239464e-08,
2103
+ "loss": 1.5721,
2104
+ "mean_token_accuracy": 0.589442789554596,
2105
+ "num_tokens": 417419.0,
2106
  "step": 233
2107
  },
2108
  {
2109
  "epoch": 0.9435483870967742,
2110
+ "grad_norm": 6.181939601898193,
2111
  "learning_rate": 3.9212440506164465e-08,
2112
+ "loss": 1.4851,
2113
+ "mean_token_accuracy": 0.6140567064285278,
2114
+ "num_tokens": 419043.0,
2115
  "step": 234
2116
  },
2117
  {
2118
  "epoch": 0.9475806451612904,
2119
+ "grad_norm": 6.1111836433410645,
2120
  "learning_rate": 3.382294261959157e-08,
2121
+ "loss": 1.5793,
2122
+ "mean_token_accuracy": 0.6070796251296997,
2123
+ "num_tokens": 420740.0,
2124
  "step": 235
2125
  },
2126
  {
2127
  "epoch": 0.9516129032258065,
2128
+ "grad_norm": 6.486725330352783,
2129
  "learning_rate": 2.8829189179721552e-08,
2130
+ "loss": 1.4531,
2131
+ "mean_token_accuracy": 0.6059664487838745,
2132
+ "num_tokens": 422351.0,
2133
  "step": 236
2134
  },
2135
  {
2136
  "epoch": 0.9556451612903226,
2137
+ "grad_norm": 5.367029190063477,
2138
  "learning_rate": 2.423198152812306e-08,
2139
+ "loss": 1.412,
2140
+ "mean_token_accuracy": 0.6007625460624695,
2141
+ "num_tokens": 424189.0,
2142
  "step": 237
2143
  },
2144
  {
2145
  "epoch": 0.9596774193548387,
2146
+ "grad_norm": 5.885536193847656,
2147
  "learning_rate": 2.0032057373142453e-08,
2148
+ "loss": 1.4751,
2149
+ "mean_token_accuracy": 0.6121867895126343,
2150
+ "num_tokens": 425947.0,
2151
  "step": 238
2152
  },
2153
  {
2154
  "epoch": 0.9637096774193549,
2155
+ "grad_norm": 6.31535005569458,
2156
  "learning_rate": 1.6230090671524312e-08,
2157
+ "loss": 1.5074,
2158
+ "mean_token_accuracy": 0.60240238904953,
2159
+ "num_tokens": 427614.0,
2160
  "step": 239
2161
  },
2162
  {
2163
  "epoch": 0.967741935483871,
2164
+ "grad_norm": 6.376657485961914,
2165
  "learning_rate": 1.2826691520262114e-08,
2166
+ "loss": 1.6623,
2167
+ "mean_token_accuracy": 0.5947854518890381,
2168
+ "num_tokens": 429457.0,
2169
  "step": 240
2170
  },
2171
  {
2172
  "epoch": 0.9717741935483871,
2173
+ "grad_norm": 5.587056636810303,
2174
  "learning_rate": 9.822406058697665e-09,
2175
+ "loss": 1.5953,
2176
+ "mean_token_accuracy": 0.5949429869651794,
2177
+ "num_tokens": 431476.0,
2178
  "step": 241
2179
  },
2180
  {
2181
  "epoch": 0.9758064516129032,
2182
+ "grad_norm": 5.475006580352783,
2183
  "learning_rate": 7.217716380881479e-09,
2184
+ "loss": 1.476,
2185
+ "mean_token_accuracy": 0.6025062799453735,
2186
+ "num_tokens": 433473.0,
2187
  "step": 242
2188
  },
2189
  {
2190
  "epoch": 0.9798387096774194,
2191
+ "grad_norm": 6.063528537750244,
2192
  "learning_rate": 5.0130404582127144e-09,
2193
+ "loss": 1.3995,
2194
+ "mean_token_accuracy": 0.6313099265098572,
2195
+ "num_tokens": 435040.0,
2196
  "step": 243
2197
  },
2198
  {
2199
  "epoch": 0.9838709677419355,
2200
+ "grad_norm": 5.9405035972595215,
2201
  "learning_rate": 3.208732072368104e-09,
2202
+ "loss": 1.5641,
2203
+ "mean_token_accuracy": 0.5938547253608704,
2204
+ "num_tokens": 436832.0,
2205
  "step": 244
2206
  },
2207
  {
2208
  "epoch": 0.9879032258064516,
2209
+ "grad_norm": 5.634851932525635,
2210
  "learning_rate": 1.8050807585293095e-09,
2211
+ "loss": 1.4805,
2212
+ "mean_token_accuracy": 0.5975820422172546,
2213
+ "num_tokens": 438571.0,
2214
  "step": 245
2215
  },
2216
  {
2217
  "epoch": 0.9919354838709677,
2218
+ "grad_norm": 5.381149768829346,
2219
  "learning_rate": 8.023117589237017e-10,
2220
+ "loss": 1.513,
2221
+ "mean_token_accuracy": 0.6109482049942017,
2222
+ "num_tokens": 440619.0,
2223
  "step": 246
2224
  },
2225
  {
2226
  "epoch": 0.9959677419354839,
2227
+ "grad_norm": 6.20208215713501,
2228
  "learning_rate": 2.0058598667854755e-10,
2229
+ "loss": 1.4039,
2230
+ "mean_token_accuracy": 0.6231250166893005,
2231
+ "num_tokens": 442221.0,
2232
  "step": 247
2233
  },
2234
  {
2235
  "epoch": 1.0,
2236
+ "grad_norm": 6.332696914672852,
2237
  "learning_rate": 0.0,
2238
+ "loss": 1.4206,
2239
+ "mean_token_accuracy": 0.5873016119003296,
2240
+ "num_tokens": 442852.0,
2241
  "step": 248
2242
  }
2243
  ],
 
2258
  "attributes": {}
2259
  }
2260
  },
2261
+ "total_flos": 9984447243878400.0,
2262
  "train_batch_size": 1,
2263
  "trial_name": null,
2264
  "trial_params": null