amuvarma committed
Commit 2e21557 · verified · 1 Parent(s): f989e9f

Update model

Files changed (1)
  1. trainer_state.json +1149 -0
trainer_state.json ADDED
@@ -0,0 +1,1149 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 124,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008064516129032258,
+      "grad_norm": 82.1250991821289,
+      "learning_rate": 4.999197688241076e-07,
+      "loss": 3.3188,
+      "mean_token_accuracy": 0.4619899392127991,
+      "num_tokens": 3582.0,
+      "step": 1
+    },
+    {
+      "epoch": 0.016129032258064516,
+      "grad_norm": 85.84162139892578,
+      "learning_rate": 4.996791267927632e-07,
+      "loss": 3.1019,
+      "mean_token_accuracy": 0.48747536540031433,
+      "num_tokens": 7139.0,
+      "step": 2
+    },
+    {
+      "epoch": 0.024193548387096774,
+      "grad_norm": 77.11351776123047,
+      "learning_rate": 4.992782283619118e-07,
+      "loss": 3.0994,
+      "mean_token_accuracy": 0.46798279881477356,
+      "num_tokens": 11094.0,
+      "step": 3
+    },
+    {
+      "epoch": 0.03225806451612903,
+      "grad_norm": 74.11935424804688,
+      "learning_rate": 4.987173308479737e-07,
+      "loss": 3.0893,
+      "mean_token_accuracy": 0.4673512279987335,
+      "num_tokens": 14896.0,
+      "step": 4
+    },
+    {
+      "epoch": 0.04032258064516129,
+      "grad_norm": 58.4622917175293,
+      "learning_rate": 4.979967942626857e-07,
+      "loss": 2.9759,
+      "mean_token_accuracy": 0.4749932289123535,
+      "num_tokens": 18599.0,
+      "step": 5
+    },
+    {
+      "epoch": 0.04838709677419355,
+      "grad_norm": 71.54608154296875,
+      "learning_rate": 4.971170810820278e-07,
+      "loss": 2.8627,
+      "mean_token_accuracy": 0.489130437374115,
+      "num_tokens": 22099.0,
+      "step": 6
+    },
+    {
+      "epoch": 0.056451612903225805,
+      "grad_norm": 67.31790924072266,
+      "learning_rate": 4.960787559493836e-07,
+      "loss": 2.9878,
+      "mean_token_accuracy": 0.49747729301452637,
+      "num_tokens": 25076.0,
+      "step": 7
+    },
+    {
+      "epoch": 0.06451612903225806,
+      "grad_norm": 51.87714767456055,
+      "learning_rate": 4.948824853131236e-07,
+      "loss": 2.8431,
+      "mean_token_accuracy": 0.4804079532623291,
+      "num_tokens": 28806.0,
+      "step": 8
+    },
+    {
+      "epoch": 0.07258064516129033,
+      "grad_norm": 50.492496490478516,
+      "learning_rate": 4.935290369988468e-07,
+      "loss": 2.7265,
+      "mean_token_accuracy": 0.4897351861000061,
+      "num_tokens": 32171.0,
+      "step": 9
+    },
+    {
+      "epoch": 0.08064516129032258,
+      "grad_norm": 50.19187927246094,
+      "learning_rate": 4.920192797165511e-07,
+      "loss": 2.5757,
+      "mean_token_accuracy": 0.5013927817344666,
+      "num_tokens": 35765.0,
+      "step": 10
+    },
+    {
+      "epoch": 0.08870967741935484,
+      "grad_norm": 53.92109298706055,
+      "learning_rate": 4.903541825030531e-07,
+      "loss": 2.5328,
+      "mean_token_accuracy": 0.5096573233604431,
+      "num_tokens": 38979.0,
+      "step": 11
+    },
+    {
+      "epoch": 0.0967741935483871,
+      "grad_norm": 131.06643676757812,
+      "learning_rate": 4.885348141000122e-07,
+      "loss": 2.4167,
+      "mean_token_accuracy": 0.507402777671814,
+      "num_tokens": 42968.0,
+      "step": 12
+    },
+    {
+      "epoch": 0.10483870967741936,
+      "grad_norm": 42.45437240600586,
+      "learning_rate": 4.865623422679592e-07,
+      "loss": 2.3694,
+      "mean_token_accuracy": 0.5128635168075562,
+      "num_tokens": 46548.0,
+      "step": 13
+    },
+    {
+      "epoch": 0.11290322580645161,
+      "grad_norm": 40.65642547607422,
+      "learning_rate": 4.844380330367701e-07,
+      "loss": 2.2659,
+      "mean_token_accuracy": 0.5277366042137146,
+      "num_tokens": 49923.0,
+      "step": 14
+    },
+    {
+      "epoch": 0.12096774193548387,
+      "grad_norm": 40.563663482666016,
+      "learning_rate": 4.821632498930656e-07,
+      "loss": 2.325,
+      "mean_token_accuracy": 0.5243242979049683,
+      "num_tokens": 53257.0,
+      "step": 15
+    },
+    {
+      "epoch": 0.12903225806451613,
+      "grad_norm": 29.71602439880371,
+      "learning_rate": 4.797394529050577e-07,
+      "loss": 2.2635,
+      "mean_token_accuracy": 0.5110940933227539,
+      "num_tokens": 57182.0,
+      "step": 16
+    },
+    {
+      "epoch": 0.13709677419354838,
+      "grad_norm": 36.85178756713867,
+      "learning_rate": 4.771681977854062e-07,
+      "loss": 2.2947,
+      "mean_token_accuracy": 0.49719932675361633,
+      "num_tokens": 60221.0,
+      "step": 17
+    },
+    {
+      "epoch": 0.14516129032258066,
+      "grad_norm": 40.669857025146484,
+      "learning_rate": 4.744511348926854e-07,
+      "loss": 2.1618,
+      "mean_token_accuracy": 0.5076530575752258,
+      "num_tokens": 63753.0,
+      "step": 18
+    },
+    {
+      "epoch": 0.1532258064516129,
+      "grad_norm": 31.07614517211914,
+      "learning_rate": 4.7159000817210204e-07,
+      "loss": 2.2338,
+      "mean_token_accuracy": 0.5,
+      "num_tokens": 67491.0,
+      "step": 19
+    },
+    {
+      "epoch": 0.16129032258064516,
+      "grad_norm": 39.705753326416016,
+      "learning_rate": 4.685866540361455e-07,
+      "loss": 2.1938,
+      "mean_token_accuracy": 0.5107913613319397,
+      "num_tokens": 70831.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.1693548387096774,
+      "grad_norm": 40.17426681518555,
+      "learning_rate": 4.654430001858874e-07,
+      "loss": 2.1139,
+      "mean_token_accuracy": 0.5218411087989807,
+      "num_tokens": 74246.0,
+      "step": 21
+    },
+    {
+      "epoch": 0.1774193548387097,
+      "grad_norm": 34.13058853149414,
+      "learning_rate": 4.6216106437368775e-07,
+      "loss": 2.0303,
+      "mean_token_accuracy": 0.5338891744613647,
+      "num_tokens": 77968.0,
+      "step": 22
+    },
+    {
+      "epoch": 0.18548387096774194,
+      "grad_norm": 25.600557327270508,
+      "learning_rate": 4.5874295310810185e-07,
+      "loss": 2.0356,
+      "mean_token_accuracy": 0.515510618686676,
+      "num_tokens": 80841.0,
+      "step": 23
+    },
+    {
+      "epoch": 0.1935483870967742,
+      "grad_norm": 14.962992668151855,
+      "learning_rate": 4.551908603018191e-07,
+      "loss": 1.9918,
+      "mean_token_accuracy": 0.5420023202896118,
+      "num_tokens": 84321.0,
+      "step": 24
+    },
+    {
+      "epoch": 0.20161290322580644,
+      "grad_norm": 10.336100578308105,
+      "learning_rate": 4.5150706586350127e-07,
+      "loss": 2.0338,
+      "mean_token_accuracy": 0.5391054153442383,
+      "num_tokens": 88148.0,
+      "step": 25
+    },
+    {
+      "epoch": 0.20967741935483872,
+      "grad_norm": 11.232644081115723,
+      "learning_rate": 4.476939342344246e-07,
+      "loss": 2.0344,
+      "mean_token_accuracy": 0.5330302119255066,
+      "num_tokens": 91891.0,
+      "step": 26
+    },
+    {
+      "epoch": 0.21774193548387097,
+      "grad_norm": 8.730783462524414,
+      "learning_rate": 4.437539128708647e-07,
+      "loss": 2.0291,
+      "mean_token_accuracy": 0.5340464115142822,
+      "num_tokens": 95728.0,
+      "step": 27
+    },
+    {
+      "epoch": 0.22580645161290322,
+      "grad_norm": 9.901666641235352,
+      "learning_rate": 4.396895306731977e-07,
+      "loss": 1.9593,
+      "mean_token_accuracy": 0.5452925562858582,
+      "num_tokens": 99099.0,
+      "step": 28
+    },
+    {
+      "epoch": 0.23387096774193547,
+      "grad_norm": 8.69468879699707,
+      "learning_rate": 4.355033963627277e-07,
+      "loss": 1.9023,
+      "mean_token_accuracy": 0.5690703988075256,
+      "num_tokens": 102556.0,
+      "step": 29
+    },
+    {
+      "epoch": 0.24193548387096775,
+      "grad_norm": 8.984487533569336,
+      "learning_rate": 4.3119819680727996e-07,
+      "loss": 1.938,
+      "mean_token_accuracy": 0.5465425252914429,
+      "num_tokens": 106320.0,
+      "step": 30
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 9.061673164367676,
+      "learning_rate": 4.2677669529663686e-07,
+      "loss": 1.9321,
+      "mean_token_accuracy": 0.5477131605148315,
+      "num_tokens": 109866.0,
+      "step": 31
+    },
+    {
+      "epoch": 0.25806451612903225,
+      "grad_norm": 8.572089195251465,
+      "learning_rate": 4.2224172976892166e-07,
+      "loss": 1.9352,
+      "mean_token_accuracy": 0.5485040545463562,
+      "num_tokens": 113179.0,
+      "step": 32
+    },
+    {
+      "epoch": 0.2661290322580645,
+      "grad_norm": 8.716224670410156,
+      "learning_rate": 4.175962109890696e-07,
+      "loss": 1.8432,
+      "mean_token_accuracy": 0.550000011920929,
+      "num_tokens": 116683.0,
+      "step": 33
+    },
+    {
+      "epoch": 0.27419354838709675,
+      "grad_norm": 11.871490478515625,
+      "learning_rate": 4.128431206805556e-07,
+      "loss": 1.8008,
+      "mean_token_accuracy": 0.5759973526000977,
+      "num_tokens": 119720.0,
+      "step": 34
+    },
+    {
+      "epoch": 0.28225806451612906,
+      "grad_norm": 9.3665132522583,
+      "learning_rate": 4.0798550961157595e-07,
+      "loss": 1.8901,
+      "mean_token_accuracy": 0.552654504776001,
+      "num_tokens": 123171.0,
+      "step": 35
+    },
+    {
+      "epoch": 0.2903225806451613,
+      "grad_norm": 9.704719543457031,
+      "learning_rate": 4.030264956369157e-07,
+      "loss": 1.8121,
+      "mean_token_accuracy": 0.5599347352981567,
+      "num_tokens": 126854.0,
+      "step": 36
+    },
+    {
+      "epoch": 0.29838709677419356,
+      "grad_norm": 10.195917129516602,
+      "learning_rate": 3.9796926169675424e-07,
+      "loss": 1.885,
+      "mean_token_accuracy": 0.5533230304718018,
+      "num_tokens": 130740.0,
+      "step": 37
+    },
+    {
+      "epoch": 0.3064516129032258,
+      "grad_norm": 9.518072128295898,
+      "learning_rate": 3.9281705377369805e-07,
+      "loss": 1.9059,
+      "mean_token_accuracy": 0.5531600713729858,
+      "num_tokens": 134130.0,
+      "step": 38
+    },
+    {
+      "epoch": 0.31451612903225806,
+      "grad_norm": 7.841033458709717,
+      "learning_rate": 3.875731788093478e-07,
+      "loss": 1.8547,
+      "mean_token_accuracy": 0.5586956739425659,
+      "num_tokens": 137814.0,
+      "step": 39
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 10.610803604125977,
+      "learning_rate": 3.822410025817406e-07,
+      "loss": 1.8147,
+      "mean_token_accuracy": 0.5637563467025757,
+      "num_tokens": 141151.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.33064516129032256,
+      "grad_norm": 10.157751083374023,
+      "learning_rate": 3.768239475450268e-07,
+      "loss": 1.8701,
+      "mean_token_accuracy": 0.549227237701416,
+      "num_tokens": 144649.0,
+      "step": 41
+    },
+    {
+      "epoch": 0.3387096774193548,
+      "grad_norm": 8.627851486206055,
+      "learning_rate": 3.713254906327703e-07,
+      "loss": 1.8508,
+      "mean_token_accuracy": 0.557908833026886,
+      "num_tokens": 148383.0,
+      "step": 42
+    },
+    {
+      "epoch": 0.3467741935483871,
+      "grad_norm": 7.615920543670654,
+      "learning_rate": 3.657491610262802e-07,
+      "loss": 1.8584,
+      "mean_token_accuracy": 0.5659451484680176,
+      "num_tokens": 151852.0,
+      "step": 43
+    },
+    {
+      "epoch": 0.3548387096774194,
+      "grad_norm": 8.200079917907715,
+      "learning_rate": 3.6009853788940856e-07,
+      "loss": 1.7812,
+      "mean_token_accuracy": 0.5586913228034973,
+      "num_tokens": 155860.0,
+      "step": 44
+    },
+    {
+      "epoch": 0.3629032258064516,
+      "grad_norm": 7.837497234344482,
+      "learning_rate": 3.543772480712658e-07,
+      "loss": 1.7635,
+      "mean_token_accuracy": 0.5686478018760681,
+      "num_tokens": 159717.0,
+      "step": 45
+    },
+    {
+      "epoch": 0.3709677419354839,
+      "grad_norm": 8.181253433227539,
+      "learning_rate": 3.4858896377832965e-07,
+      "loss": 1.7004,
+      "mean_token_accuracy": 0.5794797539710999,
+      "num_tokens": 163181.0,
+      "step": 46
+    },
+    {
+      "epoch": 0.3790322580645161,
+      "grad_norm": 6.781255722045898,
+      "learning_rate": 3.42737400217442e-07,
+      "loss": 1.8779,
+      "mean_token_accuracy": 0.5520778298377991,
+      "num_tokens": 166987.0,
+      "step": 47
+    },
+    {
+      "epoch": 0.3870967741935484,
+      "grad_norm": 7.666635036468506,
+      "learning_rate": 3.36826313211205e-07,
+      "loss": 1.8343,
+      "mean_token_accuracy": 0.5685471296310425,
+      "num_tokens": 170164.0,
+      "step": 48
+    },
+    {
+      "epoch": 0.3951612903225806,
+      "grad_norm": 8.650765419006348,
+      "learning_rate": 3.308594967873095e-07,
+      "loss": 1.7022,
+      "mean_token_accuracy": 0.5841107368469238,
+      "num_tokens": 173491.0,
+      "step": 49
+    },
+    {
+      "epoch": 0.4032258064516129,
+      "grad_norm": 7.792832851409912,
+      "learning_rate": 3.2484078074333956e-07,
+      "loss": 1.7076,
+      "mean_token_accuracy": 0.5851721167564392,
+      "num_tokens": 176894.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.4112903225806452,
+      "grad_norm": 9.63578987121582,
+      "learning_rate": 3.1877402818861946e-07,
+      "loss": 1.7369,
+      "mean_token_accuracy": 0.576729416847229,
+      "num_tokens": 180671.0,
+      "step": 51
+    },
+    {
+      "epoch": 0.41935483870967744,
+      "grad_norm": 7.068120002746582,
+      "learning_rate": 3.126631330646801e-07,
+      "loss": 1.7723,
+      "mean_token_accuracy": 0.5763046741485596,
+      "num_tokens": 184220.0,
+      "step": 52
+    },
+    {
+      "epoch": 0.4274193548387097,
+      "grad_norm": 7.844783782958984,
+      "learning_rate": 3.065120176459337e-07,
+      "loss": 1.7888,
+      "mean_token_accuracy": 0.5663133859634399,
+      "num_tokens": 187715.0,
+      "step": 53
+    },
+    {
+      "epoch": 0.43548387096774194,
+      "grad_norm": 6.963573932647705,
+      "learning_rate": 3.00324630022165e-07,
+      "loss": 1.6962,
+      "mean_token_accuracy": 0.5830934047698975,
+      "num_tokens": 191540.0,
+      "step": 54
+    },
+    {
+      "epoch": 0.4435483870967742,
+      "grad_norm": 7.507209777832031,
+      "learning_rate": 2.9410494156445216e-07,
+      "loss": 1.7422,
+      "mean_token_accuracy": 0.5805253982543945,
+      "num_tokens": 195046.0,
+      "step": 55
+    },
+    {
+      "epoch": 0.45161290322580644,
+      "grad_norm": 7.959713459014893,
+      "learning_rate": 2.8785694437614416e-07,
+      "loss": 1.7368,
+      "mean_token_accuracy": 0.5760632753372192,
+      "num_tokens": 198718.0,
+      "step": 56
+    },
+    {
+      "epoch": 0.4596774193548387,
+      "grad_norm": 7.953831195831299,
+      "learning_rate": 2.8158464873053234e-07,
+      "loss": 1.7707,
+      "mean_token_accuracy": 0.5705274939537048,
+      "num_tokens": 202210.0,
+      "step": 57
+    },
+    {
+      "epoch": 0.46774193548387094,
+      "grad_norm": 8.072004318237305,
+      "learning_rate": 2.7529208049685804e-07,
+      "loss": 1.7197,
+      "mean_token_accuracy": 0.5858024954795837,
+      "num_tokens": 205454.0,
+      "step": 58
+    },
+    {
+      "epoch": 0.47580645161290325,
+      "grad_norm": 8.691699028015137,
+      "learning_rate": 2.6898327855631154e-07,
+      "loss": 1.807,
+      "mean_token_accuracy": 0.560819149017334,
+      "num_tokens": 209511.0,
+      "step": 59
+    },
+    {
+      "epoch": 0.4838709677419355,
+      "grad_norm": 6.996568202972412,
+      "learning_rate": 2.626622922096782e-07,
+      "loss": 1.7697,
+      "mean_token_accuracy": 0.5675398111343384,
+      "num_tokens": 213409.0,
+      "step": 60
+    },
+    {
+      "epoch": 0.49193548387096775,
+      "grad_norm": 8.156956672668457,
+      "learning_rate": 2.5633317857829693e-07,
+      "loss": 1.6853,
+      "mean_token_accuracy": 0.5745296478271484,
+      "num_tokens": 216868.0,
+      "step": 61
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 8.733380317687988,
+      "learning_rate": 2.5e-07,
+      "loss": 1.6566,
+      "mean_token_accuracy": 0.5956632494926453,
+      "num_tokens": 220008.0,
+      "step": 62
+    },
+    {
+      "epoch": 0.5080645161290323,
+      "grad_norm": 6.773242473602295,
+      "learning_rate": 2.4366682142170305e-07,
+      "loss": 1.7601,
+      "mean_token_accuracy": 0.5683262944221497,
+      "num_tokens": 223788.0,
+      "step": 63
+    },
+    {
+      "epoch": 0.5161290322580645,
+      "grad_norm": 6.594358444213867,
+      "learning_rate": 2.3733770779032184e-07,
+      "loss": 1.754,
+      "mean_token_accuracy": 0.5651482939720154,
+      "num_tokens": 227568.0,
+      "step": 64
+    },
+    {
+      "epoch": 0.5241935483870968,
+      "grad_norm": 7.253048419952393,
+      "learning_rate": 2.3101672144368846e-07,
+      "loss": 1.7411,
+      "mean_token_accuracy": 0.5777088403701782,
+      "num_tokens": 231439.0,
+      "step": 65
+    },
+    {
+      "epoch": 0.532258064516129,
+      "grad_norm": 6.758819580078125,
+      "learning_rate": 2.2470791950314196e-07,
+      "loss": 1.6841,
+      "mean_token_accuracy": 0.583798885345459,
+      "num_tokens": 235023.0,
+      "step": 66
+    },
+    {
+      "epoch": 0.5403225806451613,
+      "grad_norm": 7.620910167694092,
+      "learning_rate": 2.1841535126946775e-07,
+      "loss": 1.7435,
+      "mean_token_accuracy": 0.5806363224983215,
+      "num_tokens": 238673.0,
+      "step": 67
+    },
+    {
+      "epoch": 0.5483870967741935,
+      "grad_norm": 7.808475017547607,
+      "learning_rate": 2.121430556238559e-07,
+      "loss": 1.7878,
+      "mean_token_accuracy": 0.5660945773124695,
+      "num_tokens": 241998.0,
+      "step": 68
+    },
+    {
+      "epoch": 0.5564516129032258,
+      "grad_norm": 10.025287628173828,
+      "learning_rate": 2.0589505843554795e-07,
+      "loss": 1.8058,
+      "mean_token_accuracy": 0.566918134689331,
+      "num_tokens": 245581.0,
+      "step": 69
+    },
+    {
+      "epoch": 0.5645161290322581,
+      "grad_norm": 7.763560771942139,
+      "learning_rate": 1.9967536997783493e-07,
+      "loss": 1.6668,
+      "mean_token_accuracy": 0.5830458402633667,
+      "num_tokens": 249643.0,
+      "step": 70
+    },
+    {
+      "epoch": 0.5725806451612904,
+      "grad_norm": 7.218297958374023,
+      "learning_rate": 1.9348798235406626e-07,
+      "loss": 1.6796,
+      "mean_token_accuracy": 0.5924479365348816,
+      "num_tokens": 253487.0,
+      "step": 71
+    },
+    {
+      "epoch": 0.5806451612903226,
+      "grad_norm": 7.534745693206787,
+      "learning_rate": 1.8733686693531982e-07,
+      "loss": 1.6667,
+      "mean_token_accuracy": 0.5781828761100769,
+      "num_tokens": 257002.0,
+      "step": 72
+    },
+    {
+      "epoch": 0.5887096774193549,
+      "grad_norm": 9.321051597595215,
+      "learning_rate": 1.8122597181138047e-07,
+      "loss": 1.6971,
+      "mean_token_accuracy": 0.5793435573577881,
+      "num_tokens": 260327.0,
+      "step": 73
+    },
+    {
+      "epoch": 0.5967741935483871,
+      "grad_norm": 7.620558261871338,
+      "learning_rate": 1.751592192566605e-07,
+      "loss": 1.8091,
+      "mean_token_accuracy": 0.5647743940353394,
+      "num_tokens": 263766.0,
+      "step": 74
+    },
+    {
+      "epoch": 0.6048387096774194,
+      "grad_norm": 6.960766315460205,
+      "learning_rate": 1.6914050321269047e-07,
+      "loss": 1.7232,
+      "mean_token_accuracy": 0.5714285969734192,
+      "num_tokens": 267431.0,
+      "step": 75
+    },
+    {
+      "epoch": 0.6129032258064516,
+      "grad_norm": 8.637341499328613,
+      "learning_rate": 1.6317368678879496e-07,
+      "loss": 1.8332,
+      "mean_token_accuracy": 0.5513805747032166,
+      "num_tokens": 270948.0,
+      "step": 76
+    },
+    {
+      "epoch": 0.6209677419354839,
+      "grad_norm": 6.999222278594971,
+      "learning_rate": 1.5726259978255807e-07,
+      "loss": 1.737,
+      "mean_token_accuracy": 0.578661322593689,
+      "num_tokens": 274448.0,
+      "step": 77
+    },
+    {
+      "epoch": 0.6290322580645161,
+      "grad_norm": 6.716799736022949,
+      "learning_rate": 1.514110362216704e-07,
+      "loss": 1.5999,
+      "mean_token_accuracy": 0.6048541069030762,
+      "num_tokens": 278119.0,
+      "step": 78
+    },
+    {
+      "epoch": 0.6370967741935484,
+      "grad_norm": 7.0507659912109375,
+      "learning_rate": 1.4562275192873428e-07,
+      "loss": 1.7363,
+      "mean_token_accuracy": 0.5828737020492554,
+      "num_tokens": 281568.0,
+      "step": 79
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 8.561986923217773,
+      "learning_rate": 1.3990146211059139e-07,
+      "loss": 1.6661,
+      "mean_token_accuracy": 0.591465175151825,
+      "num_tokens": 285087.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.6532258064516129,
+      "grad_norm": 7.29276704788208,
+      "learning_rate": 1.342508389737198e-07,
+      "loss": 1.6752,
+      "mean_token_accuracy": 0.5835070013999939,
+      "num_tokens": 288450.0,
+      "step": 81
+    },
+    {
+      "epoch": 0.6612903225806451,
+      "grad_norm": 7.706023216247559,
+      "learning_rate": 1.2867450936722978e-07,
+      "loss": 1.7451,
+      "mean_token_accuracy": 0.5716311931610107,
+      "num_tokens": 291979.0,
+      "step": 82
+    },
+    {
+      "epoch": 0.6693548387096774,
+      "grad_norm": 7.056520462036133,
+      "learning_rate": 1.2317605245497323e-07,
+      "loss": 1.7154,
+      "mean_token_accuracy": 0.5758506059646606,
+      "num_tokens": 295892.0,
+      "step": 83
+    },
+    {
+      "epoch": 0.6774193548387096,
+      "grad_norm": 7.503317356109619,
+      "learning_rate": 1.1775899741825945e-07,
+      "loss": 1.6899,
+      "mean_token_accuracy": 0.5809495449066162,
+      "num_tokens": 299287.0,
+      "step": 84
+    },
+    {
+      "epoch": 0.6854838709677419,
+      "grad_norm": 6.785901069641113,
+      "learning_rate": 1.1242682119065216e-07,
+      "loss": 1.7004,
+      "mean_token_accuracy": 0.5904300808906555,
+      "num_tokens": 302802.0,
+      "step": 85
+    },
+    {
+      "epoch": 0.6935483870967742,
+      "grad_norm": 8.526169776916504,
+      "learning_rate": 1.0718294622630186e-07,
+      "loss": 1.764,
+      "mean_token_accuracy": 0.5716612339019775,
+      "num_tokens": 305876.0,
+      "step": 86
+    },
+    {
+      "epoch": 0.7016129032258065,
+      "grad_norm": 8.141705513000488,
+      "learning_rate": 1.0203073830324565e-07,
+      "loss": 1.6696,
+      "mean_token_accuracy": 0.5959654450416565,
+      "num_tokens": 309350.0,
+      "step": 87
+    },
+    {
+      "epoch": 0.7096774193548387,
+      "grad_norm": 7.417829513549805,
+      "learning_rate": 9.697350436308427e-08,
+      "loss": 1.6542,
+      "mean_token_accuracy": 0.5915982127189636,
+      "num_tokens": 312520.0,
+      "step": 88
+    },
+    {
+      "epoch": 0.717741935483871,
+      "grad_norm": 7.390657424926758,
+      "learning_rate": 9.201449038842401e-08,
+      "loss": 1.7805,
+      "mean_token_accuracy": 0.5598756074905396,
+      "num_tokens": 316382.0,
+      "step": 89
+    },
+    {
+      "epoch": 0.7258064516129032,
+      "grad_norm": 7.71380615234375,
+      "learning_rate": 8.715687931944449e-08,
+      "loss": 1.6886,
+      "mean_token_accuracy": 0.578542947769165,
+      "num_tokens": 319900.0,
+      "step": 90
+    },
+    {
+      "epoch": 0.7338709677419355,
+      "grad_norm": 7.551061153411865,
+      "learning_rate": 8.240378901093034e-08,
+      "loss": 1.7469,
+      "mean_token_accuracy": 0.570781409740448,
+      "num_tokens": 323436.0,
+      "step": 91
+    },
+    {
+      "epoch": 0.7419354838709677,
+      "grad_norm": 9.389298439025879,
+      "learning_rate": 7.775827023107834e-08,
+      "loss": 1.7162,
+      "mean_token_accuracy": 0.571618914604187,
+      "num_tokens": 326442.0,
+      "step": 92
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 7.125860691070557,
+      "learning_rate": 7.322330470336313e-08,
+      "loss": 1.7112,
+      "mean_token_accuracy": 0.568901777267456,
+      "num_tokens": 330234.0,
+      "step": 93
+    },
+    {
+      "epoch": 0.7580645161290323,
+      "grad_norm": 7.604709148406982,
+      "learning_rate": 6.880180319272006e-08,
+      "loss": 1.653,
+      "mean_token_accuracy": 0.5836288928985596,
+      "num_tokens": 333903.0,
+      "step": 94
+    },
+    {
+      "epoch": 0.7661290322580645,
+      "grad_norm": 6.952591896057129,
+      "learning_rate": 6.449660363727236e-08,
+      "loss": 1.6965,
+      "mean_token_accuracy": 0.5798975825309753,
+      "num_tokens": 337618.0,
+      "step": 95
+    },
+    {
+      "epoch": 0.7741935483870968,
+      "grad_norm": 10.35627555847168,
+      "learning_rate": 6.031046932680229e-08,
+      "loss": 1.6945,
+      "mean_token_accuracy": 0.5744901299476624,
+      "num_tokens": 340515.0,
+      "step": 96
+    },
+    {
+      "epoch": 0.782258064516129,
+      "grad_norm": 7.425570487976074,
+      "learning_rate": 5.624608712913531e-08,
+      "loss": 1.6154,
+      "mean_token_accuracy": 0.5917431116104126,
+      "num_tokens": 344007.0,
+      "step": 97
+    },
+    {
+      "epoch": 0.7903225806451613,
+      "grad_norm": 10.059042930603027,
+      "learning_rate": 5.230606576557539e-08,
+      "loss": 1.6855,
+      "mean_token_accuracy": 0.5853906869888306,
+      "num_tokens": 347940.0,
+      "step": 98
+    },
+    {
+      "epoch": 0.7983870967741935,
+      "grad_norm": 6.582756042480469,
+      "learning_rate": 4.84929341364988e-08,
+      "loss": 1.6939,
+      "mean_token_accuracy": 0.5809850096702576,
+      "num_tokens": 351741.0,
+      "step": 99
+    },
+    {
+      "epoch": 0.8064516129032258,
+      "grad_norm": 6.744507312774658,
+      "learning_rate": 4.480913969818098e-08,
+      "loss": 1.6739,
+      "mean_token_accuracy": 0.585814356803894,
+      "num_tokens": 355171.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.8145161290322581,
+      "grad_norm": 6.796554088592529,
+      "learning_rate": 4.125704689189818e-08,
+      "loss": 1.7039,
+      "mean_token_accuracy": 0.5784457325935364,
+      "num_tokens": 359267.0,
+      "step": 101
+    },
+    {
+      "epoch": 0.8225806451612904,
+      "grad_norm": 6.474706649780273,
+      "learning_rate": 3.783893562631224e-08,
+      "loss": 1.6828,
+      "mean_token_accuracy": 0.5838965773582458,
+      "num_tokens": 362674.0,
+      "step": 102
+    },
+    {
+      "epoch": 0.8306451612903226,
+      "grad_norm": 7.683028221130371,
+      "learning_rate": 3.455699981411259e-08,
+      "loss": 1.7309,
+      "mean_token_accuracy": 0.5660528540611267,
+      "num_tokens": 366425.0,
+      "step": 103
+    },
+    {
+      "epoch": 0.8387096774193549,
+      "grad_norm": 7.123412609100342,
+      "learning_rate": 3.141334596385447e-08,
+      "loss": 1.6526,
+      "mean_token_accuracy": 0.5894578099250793,
+      "num_tokens": 369749.0,
+      "step": 104
+    },
+    {
+      "epoch": 0.8467741935483871,
+      "grad_norm": 6.300046920776367,
+      "learning_rate": 2.8409991827897968e-08,
+      "loss": 1.6927,
+      "mean_token_accuracy": 0.5794023275375366,
+      "num_tokens": 373802.0,
+      "step": 105
+    },
+    {
+      "epoch": 0.8548387096774194,
+      "grad_norm": 6.947690486907959,
+      "learning_rate": 2.5548865107314604e-08,
+      "loss": 1.7412,
+      "mean_token_accuracy": 0.5656623244285583,
+      "num_tokens": 377324.0,
+      "step": 106
+    },
+    {
+      "epoch": 0.8629032258064516,
+      "grad_norm": 7.4605231285095215,
+      "learning_rate": 2.283180221459377e-08,
+      "loss": 1.7679,
+      "mean_token_accuracy": 0.5644617676734924,
+      "num_tokens": 381183.0,
+      "step": 107
+    },
+    {
+      "epoch": 0.8709677419354839,
+      "grad_norm": 6.687449932098389,
+      "learning_rate": 2.0260547094942348e-08,
+      "loss": 1.7144,
+      "mean_token_accuracy": 0.5790040493011475,
+      "num_tokens": 384902.0,
+      "step": 108
+    },
+    {
+      "epoch": 0.8790322580645161,
+      "grad_norm": 9.971823692321777,
+      "learning_rate": 1.7836750106934474e-08,
+      "loss": 1.6716,
+      "mean_token_accuracy": 0.5851125121116638,
+      "num_tokens": 388372.0,
+      "step": 109
+    },
+    {
+      "epoch": 0.8870967741935484,
+      "grad_norm": 8.792091369628906,
+      "learning_rate": 1.5561966963229923e-08,
+      "loss": 1.66,
+      "mean_token_accuracy": 0.5835981965065002,
+      "num_tokens": 391522.0,
+      "step": 110
+    },
+    {
+      "epoch": 0.8951612903225806,
+      "grad_norm": 7.2787981033325195,
+      "learning_rate": 1.3437657732040781e-08,
+      "loss": 1.6463,
+      "mean_token_accuracy": 0.5921754837036133,
+      "num_tokens": 394900.0,
+      "step": 111
+    },
+    {
+      "epoch": 0.9032258064516129,
+      "grad_norm": 7.135777473449707,
+      "learning_rate": 1.1465185899987794e-08,
+      "loss": 1.7416,
+      "mean_token_accuracy": 0.5735452771186829,
+      "num_tokens": 398616.0,
+      "step": 112
+    },
+    {
+      "epoch": 0.9112903225806451,
+      "grad_norm": 8.646471977233887,
+      "learning_rate": 9.6458174969469e-09,
+      "loss": 1.7218,
+      "mean_token_accuracy": 0.5801047086715698,
+      "num_tokens": 402440.0,
+      "step": 113
+    },
+    {
+      "epoch": 0.9193548387096774,
+      "grad_norm": 6.620112895965576,
+      "learning_rate": 7.980720283448955e-09,
+      "loss": 1.8393,
+      "mean_token_accuracy": 0.5539906024932861,
+      "num_tokens": 406278.0,
+      "step": 114
+    },
+    {
+      "epoch": 0.9274193548387096,
+      "grad_norm": 8.354401588439941,
+      "learning_rate": 6.470963001153268e-09,
+      "loss": 1.7239,
+      "mean_token_accuracy": 0.5888278484344482,
+      "num_tokens": 409558.0,
+      "step": 115
+    },
+    {
+      "epoch": 0.9354838709677419,
+      "grad_norm": 7.131601810455322,
+      "learning_rate": 5.117514686876378e-09,
+      "loss": 1.6136,
+      "mean_token_accuracy": 0.5943509340286255,
+      "num_tokens": 412890.0,
+      "step": 116
+    },
+    {
+      "epoch": 0.9435483870967742,
+      "grad_norm": 7.938460826873779,
+      "learning_rate": 3.921244050616446e-09,
+      "loss": 1.7333,
+      "mean_token_accuracy": 0.5793216824531555,
+      "num_tokens": 416550.0,
+      "step": 117
+    },
+    {
+      "epoch": 0.9516129032258065,
+      "grad_norm": 7.657175540924072,
+      "learning_rate": 2.8829189179721547e-09,
+      "loss": 1.7309,
+      "mean_token_accuracy": 0.5766178369522095,
+      "num_tokens": 419830.0,
+      "step": 118
+    },
+    {
+      "epoch": 0.9596774193548387,
+      "grad_norm": 7.384486198425293,
+      "learning_rate": 2.0032057373142453e-09,
+      "loss": 1.6211,
+      "mean_token_accuracy": 0.5845938324928284,
+      "num_tokens": 423404.0,
+      "step": 119
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 8.56894588470459,
+      "learning_rate": 1.2826691520262112e-09,
+      "loss": 1.7963,
+      "mean_token_accuracy": 0.57413250207901,
+      "num_tokens": 426895.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.9758064516129032,
+      "grad_norm": 6.813057899475098,
+      "learning_rate": 7.217716380881477e-10,
+      "loss": 1.726,
+      "mean_token_accuracy": 0.5653586387634277,
+      "num_tokens": 430900.0,
+      "step": 121
+    },
+    {
+      "epoch": 0.9838709677419355,
+      "grad_norm": 6.8685431480407715,
+      "learning_rate": 3.2087320723681033e-10,
+      "loss": 1.6799,
+      "mean_token_accuracy": 0.5889589190483093,
+      "num_tokens": 434237.0,
+      "step": 122
+    },
+    {
+      "epoch": 0.9919354838709677,
+      "grad_norm": 7.024580955505371,
+      "learning_rate": 8.023117589237016e-11,
+      "loss": 1.7037,
+      "mean_token_accuracy": 0.5847502946853638,
+      "num_tokens": 438005.0,
+      "step": 123
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 8.294002532958984,
+      "learning_rate": 0.0,
+      "loss": 1.6386,
+      "mean_token_accuracy": 0.5888568758964539,
+      "num_tokens": 441132.0,
+      "step": 124
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 124,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 3000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0678265542868992e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
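The added file is the standard `trainer_state.json` written by the Hugging Face `Trainer`: `log_history` holds one record per logged step (124 steps here, one per step over a single epoch), plus run-level fields such as `global_step` and `total_flos`. A minimal sketch of how the log could be inspected locally, assuming the file has been downloaded to the working directory as `trainer_state.json`:

```python
import json

# Load the trainer state committed in this change.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
first, last = history[0], history[-1]

# Quick summary of the run: step count and start/end loss and token accuracy.
print(f"steps: {state['global_step']}, epochs: {state['num_train_epochs']}")
print(f"loss: {first['loss']:.4f} -> {last['loss']:.4f}")
print(f"mean_token_accuracy: {first['mean_token_accuracy']:.3f} -> {last['mean_token_accuracy']:.3f}")
```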