yihanwang617 committed on
Commit 6fc75e2 · verified · 1 Parent(s): 7903fa7

Model save
README.md CHANGED
@@ -1,11 +1,8 @@
1
  ---
2
  base_model: meta-llama/Meta-Llama-3-8B
3
- datasets:
4
- - yihanwang617/WizardLM_70k_processed_indicator_unfiltered_4k
5
  library_name: peft
6
  license: llama3
7
  tags:
8
- - alignment-handbook
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -19,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # llama-3-qlora-wizard-processed-indicator-0.6
21
 
22
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the yihanwang617/WizardLM_70k_processed_indicator_unfiltered_4k dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.6753
25
 
26
  ## Model description
27
 
@@ -52,16 +49,20 @@ The following hyperparameters were used during training:
52
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
53
  - lr_scheduler_type: cosine
54
  - lr_scheduler_warmup_ratio: 0.1
55
- - num_epochs: 1
56
 
57
  ### Training results
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:------:|:----:|:---------------:|
61
- | 0.7113 | 0.2225 | 200 | 0.7031 |
62
- | 0.7181 | 0.4450 | 400 | 0.6872 |
63
- | 0.7171 | 0.6675 | 600 | 0.6786 |
64
- | 0.7031 | 0.8900 | 800 | 0.6755 |
65
 
66
 
67
  ### Framework versions
 
1
  ---
2
  base_model: meta-llama/Meta-Llama-3-8B
3
  library_name: peft
4
  license: llama3
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
16
 
17
  # llama-3-qlora-wizard-processed-indicator-0.6
18
 
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.6655
22
 
23
  ## Model description
24
 
 
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
  - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 2
53
 
54
  ### Training results
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:------:|:----:|:---------------:|
58
+ | 0.718 | 0.2225 | 200 | 0.7090 |
59
+ | 0.7205 | 0.4450 | 400 | 0.6897 |
60
+ | 0.7203 | 0.6675 | 600 | 0.6808 |
61
+ | 0.703 | 0.8900 | 800 | 0.6756 |
62
+ | 0.6759 | 1.1125 | 1000 | 0.6748 |
63
+ | 0.6533 | 1.3350 | 1200 | 0.6695 |
64
+ | 0.6458 | 1.5575 | 1400 | 0.6669 |
65
+ | 0.632 | 1.7800 | 1600 | 0.6655 |
66
 
67
 
68
  ### Framework versions
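
The updated card describes a PEFT (QLoRA) adapter saved on top of meta-llama/Meta-Llama-3-8B, with the adapter weights stored in adapter_model.safetensors below. A minimal sketch of attaching such an adapter for inference is shown here; the adapter repo id and the 4-bit bitsandbytes loading are assumptions for illustration, not details stated in this commit.

```python
# Sketch: load the Llama-3-8B base model and attach the saved QLoRA adapter with peft.
# The adapter repo id below is hypothetical; 4-bit loading is an assumption.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "meta-llama/Meta-Llama-3-8B"
adapter_id = "yihanwang617/llama-3-qlora-wizard-processed-indicator-0.6"  # hypothetical

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # QLoRA-style 4-bit inference (assumption)
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb_config, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)  # applies the LoRA weights

prompt = "Explain what a LoRA adapter is in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```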
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c68ab42de0cf1c7faf6fad6a5bd72f0a4d89064a10fbb588272f2957de1c23db
3
  size 2185327392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f420978ff31f01a60159d82cd103535799080cef13efa5f4803a4ab17719f4f
3
  size 2185327392
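
The adapter_model.safetensors entry above is a Git LFS pointer: the repository tracks only the object's sha256 oid and byte size, while the tensor data itself lives in LFS storage. A small sketch of checking a downloaded file against the new oid, assuming only Python's standard hashlib; the local file path is a placeholder.

```python
# Sketch: verify a downloaded LFS object against the sha256 oid in the pointer above.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

expected_oid = "2f420978ff31f01a60159d82cd103535799080cef13efa5f4803a4ab17719f4f"
assert sha256_of("adapter_model.safetensors") == expected_oid
```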
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 0.9990265609790016,
3
- "eval_loss": 0.6752772331237793,
4
- "eval_runtime": 38.0852,
5
- "eval_samples": 407,
6
- "eval_samples_per_second": 10.687,
7
- "eval_steps_per_second": 0.683,
8
- "total_flos": 3899966689378304.0,
9
- "train_loss": 0.7223505903458542,
10
- "train_runtime": 18859.9671,
11
  "train_samples": 57523,
12
- "train_samples_per_second": 3.05,
13
  "train_steps_per_second": 0.048
14
  }
 
1
  {
2
+ "epoch": 1.998053121958003,
3
+ "total_flos": 7795310658584576.0,
4
+ "train_loss": 0.6907267017260957,
5
+ "train_runtime": 37673.7111,
6
  "train_samples": 57523,
7
+ "train_samples_per_second": 3.054,
8
  "train_steps_per_second": 0.048
9
  }
runs/Sep13_22-50-54_nova.cs.ucla.edu/events.out.tfevents.1726293207.nova.cs.ucla.edu.4131510.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc746f786a5027e0ff81c1de2887e82566bd9d7832faa0d345746d4688b2c43a
3
- size 78963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0979ae6ac7f46f0222cda9d297363032b21d5d709b864fd3a583447ce49cbc4b
3
+ size 83326
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9990265609790016,
3
- "total_flos": 3899966689378304.0,
4
- "train_loss": 0.7223505903458542,
5
- "train_runtime": 18859.9671,
6
  "train_samples": 57523,
7
- "train_samples_per_second": 3.05,
8
  "train_steps_per_second": 0.048
9
  }
 
1
  {
2
+ "epoch": 1.998053121958003,
3
+ "total_flos": 7795310658584576.0,
4
+ "train_loss": 0.6907267017260957,
5
+ "train_runtime": 37673.7111,
6
  "train_samples": 57523,
7
+ "train_samples_per_second": 3.054,
8
  "train_steps_per_second": 0.048
9
  }
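
The updated results reflect the jump from 1 to 2 training epochs: global_step roughly doubles (898 to 1796), total_flos roughly doubles, and the final eval loss improves from 0.6753 to 0.6655. Combining the hyperparameters listed in the card (Adam betas (0.9, 0.999), epsilon 1e-8, cosine schedule, 10% warmup, 2 epochs) with the intervals recorded in trainer_state.json below (logging every 5 steps, eval every 200, save every 100, per-device train batch size 2, peak learning rate 2e-4 from the log history) gives a plausible TrainingArguments reconstruction; this is a sketch under those assumptions, not the author's actual training script.

```python
# Sketch: TrainingArguments consistent with the values recorded in this commit.
# Fields not present in the diff (output_dir, bf16) are assumptions.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="llama-3-qlora-wizard-processed-indicator-0.6",  # hypothetical
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    logging_steps=5,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=100,
    bf16=True,  # assumption
)
```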
trainer_state.json CHANGED
@@ -1,1321 +1,2613 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9990265609790016,
5
  "eval_steps": 200,
6
- "global_step": 898,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.001112501738283966,
13
- "grad_norm": 0.6684414708991117,
14
- "learning_rate": 2.2222222222222225e-06,
15
  "loss": 1.1751,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.00556250869141983,
20
- "grad_norm": 0.7639609960034875,
21
- "learning_rate": 1.1111111111111112e-05,
22
- "loss": 1.282,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01112501738283966,
27
- "grad_norm": 0.8970088039778137,
28
- "learning_rate": 2.2222222222222223e-05,
29
- "loss": 1.276,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.016687526074259492,
34
- "grad_norm": 0.5983155959810567,
35
- "learning_rate": 3.3333333333333335e-05,
36
- "loss": 1.2166,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.02225003476567932,
41
- "grad_norm": 0.9387996319771594,
42
- "learning_rate": 4.4444444444444447e-05,
43
- "loss": 1.0526,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.027812543457099152,
48
- "grad_norm": 0.3387344315023908,
49
- "learning_rate": 5.555555555555556e-05,
50
- "loss": 0.9608,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.033375052148518984,
55
- "grad_norm": 0.23890923775202408,
56
- "learning_rate": 6.666666666666667e-05,
57
- "loss": 0.911,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03893756083993881,
62
- "grad_norm": 0.22798914076978022,
63
- "learning_rate": 7.777777777777778e-05,
64
- "loss": 0.8445,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.04450006953135864,
69
- "grad_norm": 0.31587333267646156,
70
- "learning_rate": 8.888888888888889e-05,
71
- "loss": 0.8105,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.05006257822277847,
76
- "grad_norm": 0.19505366669382362,
77
- "learning_rate": 0.0001,
78
- "loss": 0.8265,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.055625086914198305,
83
- "grad_norm": 0.20253125415926881,
84
- "learning_rate": 0.00011111111111111112,
85
- "loss": 0.7973,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.061187595605618136,
90
- "grad_norm": 0.19874800607368992,
91
- "learning_rate": 0.00012222222222222224,
92
- "loss": 0.7554,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.06675010429703797,
97
- "grad_norm": 0.1603617233155014,
98
- "learning_rate": 0.00013333333333333334,
99
- "loss": 0.7737,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.0723126129884578,
104
- "grad_norm": 0.182313382422268,
105
- "learning_rate": 0.00014444444444444444,
106
- "loss": 0.7744,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.07787512167987762,
111
- "grad_norm": 0.1404552378108098,
112
- "learning_rate": 0.00015555555555555556,
113
- "loss": 0.7995,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.08343763037129745,
118
- "grad_norm": 0.15330163912523168,
119
- "learning_rate": 0.0001666666666666667,
120
- "loss": 0.7854,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.08900013906271728,
125
- "grad_norm": 0.15880144560375917,
126
- "learning_rate": 0.00017777777777777779,
127
- "loss": 0.7572,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.09456264775413711,
132
- "grad_norm": 0.17526658243677723,
133
- "learning_rate": 0.00018888888888888888,
134
- "loss": 0.7421,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.10012515644555695,
139
- "grad_norm": 0.1756854246552801,
140
- "learning_rate": 0.0002,
141
- "loss": 0.7569,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.10568766513697678,
146
- "grad_norm": 0.13988463897936598,
147
- "learning_rate": 0.00019998110384864614,
148
- "loss": 0.7271,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.11125017382839661,
153
- "grad_norm": 0.14983210481706255,
154
- "learning_rate": 0.0001999244225358753,
155
- "loss": 0.7444,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.11681268251981644,
160
- "grad_norm": 0.14861727909266345,
161
- "learning_rate": 0.00019982997748286082,
162
- "loss": 0.7783,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.12237519121123627,
167
- "grad_norm": 0.17979168387360328,
168
- "learning_rate": 0.00019969780438256293,
169
- "loss": 0.7651,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1279376999026561,
174
- "grad_norm": 0.16202080618298298,
175
- "learning_rate": 0.00019952795318623986,
176
- "loss": 0.719,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.13350020859407594,
181
- "grad_norm": 0.13085394814942794,
182
- "learning_rate": 0.0001993204880845699,
183
- "loss": 0.7044,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.13906271728549577,
188
- "grad_norm": 0.1369700622873931,
189
- "learning_rate": 0.00019907548748339222,
190
- "loss": 0.7166,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.1446252259769156,
195
- "grad_norm": 0.12798471944493164,
196
- "learning_rate": 0.0001987930439740757,
197
- "loss": 0.7302,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.15018773466833543,
202
- "grad_norm": 0.16062970178106098,
203
- "learning_rate": 0.0001984732642985263,
204
- "loss": 0.7477,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.15575024335975524,
209
- "grad_norm": 0.12776472354551027,
210
- "learning_rate": 0.0001981162693088471,
211
- "loss": 0.7445,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.16131275205117507,
216
- "grad_norm": 0.13554426042598045,
217
- "learning_rate": 0.00019772219392166519,
218
- "loss": 0.7252,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.1668752607425949,
223
- "grad_norm": 0.13347482010697612,
224
- "learning_rate": 0.00019729118706714375,
225
- "loss": 0.7254,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.17243776943401473,
230
- "grad_norm": 0.134631392605464,
231
- "learning_rate": 0.000196823411632698,
232
- "loss": 0.7184,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.17800027812543456,
237
- "grad_norm": 0.1504769169328635,
238
- "learning_rate": 0.00019631904440143612,
239
- "loss": 0.7259,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1835627868168544,
244
- "grad_norm": 0.13782774235003004,
245
- "learning_rate": 0.00019577827598534885,
246
- "loss": 0.7367,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.18912529550827423,
251
- "grad_norm": 0.1363344338061366,
252
- "learning_rate": 0.00019520131075327298,
253
- "loss": 0.722,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.19468780419969406,
258
- "grad_norm": 0.14621350415121373,
259
- "learning_rate": 0.00019458836675365556,
260
- "loss": 0.73,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.2002503128911139,
265
- "grad_norm": 0.15278741364826062,
266
- "learning_rate": 0.00019393967563214833,
267
- "loss": 0.6995,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.20581282158253372,
272
- "grad_norm": 0.13331150727297583,
273
- "learning_rate": 0.00019325548254406352,
274
- "loss": 0.7487,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.21137533027395355,
279
- "grad_norm": 0.1412877661682615,
280
- "learning_rate": 0.00019253604606172417,
281
- "loss": 0.7359,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.2169378389653734,
286
- "grad_norm": 0.15015640302548008,
287
- "learning_rate": 0.0001917816380767434,
288
- "loss": 0.7014,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.22250034765679322,
293
- "grad_norm": 0.13131810843452316,
294
- "learning_rate": 0.0001909925436972706,
295
- "loss": 0.7113,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.22250034765679322,
300
- "eval_loss": 0.7031363844871521,
301
- "eval_runtime": 38.8569,
302
- "eval_samples_per_second": 10.474,
303
- "eval_steps_per_second": 0.669,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.22806285634821305,
308
- "grad_norm": 0.14737211911767406,
309
- "learning_rate": 0.0001901690611402423,
310
- "loss": 0.7264,
311
  "step": 205
312
  },
313
  {
314
  "epoch": 0.23362536503963288,
315
- "grad_norm": 0.1357323208908195,
316
- "learning_rate": 0.00018931150161867916,
317
- "loss": 0.7366,
318
  "step": 210
319
  },
320
  {
321
  "epoch": 0.2391878737310527,
322
- "grad_norm": 0.13993345760253112,
323
- "learning_rate": 0.0001884201892240715,
324
- "loss": 0.7117,
325
  "step": 215
326
  },
327
  {
328
  "epoch": 0.24475038242247255,
329
- "grad_norm": 0.1369219623281007,
330
- "learning_rate": 0.00018749546080389757,
331
- "loss": 0.7197,
332
  "step": 220
333
  },
334
  {
335
  "epoch": 0.2503128911138924,
336
- "grad_norm": 0.15142549765891067,
337
- "learning_rate": 0.00018653766583432113,
338
- "loss": 0.7402,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 0.2558753998053122,
343
- "grad_norm": 0.1268630478080886,
344
- "learning_rate": 0.0001855471662881164,
345
- "loss": 0.7061,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 0.26143790849673204,
350
- "grad_norm": 0.15250219181797606,
351
- "learning_rate": 0.0001845243364978702,
352
- "loss": 0.6856,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 0.2670004171881519,
357
- "grad_norm": 0.12578511590375505,
358
- "learning_rate": 0.00018346956301451304,
359
- "loss": 0.6933,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 0.2725629258795717,
364
- "grad_norm": 0.12595811486114722,
365
- "learning_rate": 0.00018238324446123266,
366
- "loss": 0.7216,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 0.27812543457099154,
371
- "grad_norm": 0.1372248567648991,
372
- "learning_rate": 0.00018126579138282503,
373
- "loss": 0.7092,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 0.28368794326241137,
378
- "grad_norm": 0.1268675904703008,
379
- "learning_rate": 0.0001801176260905402,
380
- "loss": 0.6885,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 0.2892504519538312,
385
- "grad_norm": 0.15050289967108632,
386
- "learning_rate": 0.00017893918250248104,
387
- "loss": 0.7095,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 0.29481296064525103,
392
- "grad_norm": 0.13336034584149736,
393
- "learning_rate": 0.00017773090597961554,
394
- "loss": 0.6966,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 0.30037546933667086,
399
- "grad_norm": 0.158284438999716,
400
- "learning_rate": 0.00017649325315746478,
401
- "loss": 0.7275,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 0.3059379780280907,
406
- "grad_norm": 0.14579684984990443,
407
- "learning_rate": 0.00017522669177352977,
408
- "loss": 0.7206,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 0.31150048671951047,
413
- "grad_norm": 0.12269074197800435,
414
- "learning_rate": 0.0001739317004905227,
415
- "loss": 0.697,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 0.3170629954109303,
420
- "grad_norm": 0.13835017103100894,
421
- "learning_rate": 0.00017260876871546936,
422
- "loss": 0.7174,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 0.32262550410235014,
427
- "grad_norm": 0.14300200863185467,
428
- "learning_rate": 0.00017125839641475072,
429
- "loss": 0.726,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 0.32818801279376997,
434
- "grad_norm": 0.12914092926857734,
435
- "learning_rate": 0.0001698810939251543,
436
- "loss": 0.6991,
437
  "step": 295
438
  },
439
  {
440
  "epoch": 0.3337505214851898,
441
- "grad_norm": 0.1293105805311886,
442
- "learning_rate": 0.00016847738176100632,
443
- "loss": 0.7264,
444
  "step": 300
445
  },
446
  {
447
  "epoch": 0.33931303017660963,
448
- "grad_norm": 0.1263743857475521,
449
- "learning_rate": 0.00016704779041745686,
450
- "loss": 0.6938,
451
  "step": 305
452
  },
453
  {
454
  "epoch": 0.34487553886802946,
455
- "grad_norm": 0.16556508393117347,
456
- "learning_rate": 0.000165592860169994,
457
- "loss": 0.7251,
458
  "step": 310
459
  },
460
  {
461
  "epoch": 0.3504380475594493,
462
- "grad_norm": 0.13892185415083144,
463
- "learning_rate": 0.00016411314087026106,
464
- "loss": 0.7439,
465
  "step": 315
466
  },
467
  {
468
  "epoch": 0.3560005562508691,
469
- "grad_norm": 0.1427616131100252,
470
- "learning_rate": 0.00016260919173825508,
471
- "loss": 0.6905,
472
  "step": 320
473
  },
474
  {
475
  "epoch": 0.36156306494228896,
476
- "grad_norm": 0.1187614399483276,
477
- "learning_rate": 0.00016108158115098444,
478
- "loss": 0.681,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 0.3671255736337088,
483
- "grad_norm": 0.1428657673910464,
484
- "learning_rate": 0.0001595308864276666,
485
- "loss": 0.705,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 0.3726880823251286,
490
- "grad_norm": 0.14224088322242012,
491
- "learning_rate": 0.00015795769361154547,
492
- "loss": 0.7139,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 0.37825059101654845,
497
- "grad_norm": 0.1456871215491553,
498
- "learning_rate": 0.00015636259724841222,
499
- "loss": 0.7292,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 0.3838130997079683,
504
- "grad_norm": 0.13983468342540284,
505
- "learning_rate": 0.00015474620016191294,
506
- "loss": 0.7222,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 0.3893756083993881,
511
- "grad_norm": 0.14315721096194434,
512
- "learning_rate": 0.00015310911322572753,
513
- "loss": 0.7112,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 0.39493811709080795,
518
- "grad_norm": 0.12717703522604246,
519
- "learning_rate": 0.00015145195513270644,
520
- "loss": 0.7018,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 0.4005006257822278,
525
- "grad_norm": 0.1405738893906113,
526
- "learning_rate": 0.0001497753521610526,
527
- "loss": 0.708,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 0.4060631344736476,
532
- "grad_norm": 0.12649904699680478,
533
- "learning_rate": 0.00014807993793763619,
534
- "loss": 0.6728,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 0.41162564316506745,
539
- "grad_norm": 0.1366397886559032,
540
- "learning_rate": 0.00014636635319853275,
541
- "loss": 0.6855,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 0.4171881518564873,
546
- "grad_norm": 0.1348575434185503,
547
- "learning_rate": 0.00014463524554687399,
548
- "loss": 0.6969,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 0.4227506605479071,
553
- "grad_norm": 0.13467406560845768,
554
- "learning_rate": 0.0001428872692081038,
555
- "loss": 0.7179,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 0.42831316923932694,
560
- "grad_norm": 0.13474764353512392,
561
- "learning_rate": 0.00014112308478273145,
562
- "loss": 0.6853,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 0.4338756779307468,
567
- "grad_norm": 0.12908327147450624,
568
- "learning_rate": 0.00013934335899667527,
569
- "loss": 0.7131,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 0.4394381866221666,
574
- "grad_norm": 0.12215743992138174,
575
- "learning_rate": 0.00013754876444929166,
576
- "loss": 0.6748,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 0.44500069531358644,
581
- "grad_norm": 0.12646747341185763,
582
- "learning_rate": 0.0001357399793591844,
583
- "loss": 0.7181,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 0.44500069531358644,
588
- "eval_loss": 0.6872018575668335,
589
- "eval_runtime": 38.1286,
590
- "eval_samples_per_second": 10.674,
591
- "eval_steps_per_second": 0.682,
592
  "step": 400
593
  },
594
  {
595
  "epoch": 0.45056320400500627,
596
- "grad_norm": 0.13122165542663003,
597
- "learning_rate": 0.00013391768730789002,
598
- "loss": 0.7121,
599
  "step": 405
600
  },
601
  {
602
  "epoch": 0.4561257126964261,
603
- "grad_norm": 0.12302514176584399,
604
- "learning_rate": 0.00013208257698153677,
605
- "loss": 0.7122,
606
  "step": 410
607
  },
608
  {
609
  "epoch": 0.46168822138784593,
610
- "grad_norm": 0.14440204630512954,
611
- "learning_rate": 0.00013023534191057426,
612
- "loss": 0.7239,
613
  "step": 415
614
  },
615
  {
616
  "epoch": 0.46725073007926576,
617
- "grad_norm": 0.13547988803582495,
618
- "learning_rate": 0.0001283766802076722,
619
- "loss": 0.7206,
620
  "step": 420
621
  },
622
  {
623
  "epoch": 0.4728132387706856,
624
- "grad_norm": 0.13999737919216407,
625
- "learning_rate": 0.00012650729430388764,
626
- "loss": 0.6826,
627
  "step": 425
628
  },
629
  {
630
  "epoch": 0.4783757474621054,
631
- "grad_norm": 0.13048583251685064,
632
- "learning_rate": 0.00012462789068320017,
633
- "loss": 0.6926,
634
  "step": 430
635
  },
636
  {
637
  "epoch": 0.48393825615352526,
638
- "grad_norm": 0.1496684308590174,
639
- "learning_rate": 0.00012273917961551513,
640
- "loss": 0.7342,
641
  "step": 435
642
  },
643
  {
644
  "epoch": 0.4895007648449451,
645
- "grad_norm": 0.14327740803897993,
646
- "learning_rate": 0.00012084187488823657,
647
- "loss": 0.6966,
648
  "step": 440
649
  },
650
  {
651
  "epoch": 0.4950632735363649,
652
- "grad_norm": 0.14722640475177795,
653
- "learning_rate": 0.00011893669353651031,
654
- "loss": 0.7045,
655
  "step": 445
656
  },
657
  {
658
  "epoch": 0.5006257822277848,
659
- "grad_norm": 0.13182498566362877,
660
- "learning_rate": 0.00011702435557223987,
661
- "loss": 0.7205,
662
  "step": 450
663
  },
664
  {
665
  "epoch": 0.5061882909192046,
666
- "grad_norm": 0.14649854622775277,
667
- "learning_rate": 0.00011510558371197753,
668
- "loss": 0.6972,
669
  "step": 455
670
  },
671
  {
672
  "epoch": 0.5117507996106244,
673
- "grad_norm": 0.14603995632853023,
674
- "learning_rate": 0.00011318110310379301,
675
- "loss": 0.7455,
676
  "step": 460
677
  },
678
  {
679
  "epoch": 0.5173133083020443,
680
- "grad_norm": 0.1413104318676706,
681
- "learning_rate": 0.0001112516410532233,
682
- "loss": 0.6917,
683
  "step": 465
684
  },
685
  {
686
  "epoch": 0.5228758169934641,
687
- "grad_norm": 0.1319527122221744,
688
- "learning_rate": 0.00010931792674840718,
689
- "loss": 0.6855,
690
  "step": 470
691
  },
692
  {
693
  "epoch": 0.5284383256848839,
694
- "grad_norm": 0.1245675456499167,
695
- "learning_rate": 0.0001073806909845082,
696
- "loss": 0.7028,
697
  "step": 475
698
  },
699
  {
700
  "epoch": 0.5340008343763037,
701
- "grad_norm": 0.11698601668007995,
702
- "learning_rate": 0.00010544066588753044,
703
- "loss": 0.6933,
704
  "step": 480
705
  },
706
  {
707
  "epoch": 0.5395633430677236,
708
- "grad_norm": 0.12677292396669446,
709
- "learning_rate": 0.00010349858463763113,
710
- "loss": 0.683,
711
  "step": 485
712
  },
713
  {
714
  "epoch": 0.5451258517591434,
715
- "grad_norm": 0.13456238683420088,
716
- "learning_rate": 0.0001015551811920351,
717
- "loss": 0.7006,
718
  "step": 490
719
  },
720
  {
721
  "epoch": 0.5506883604505632,
722
- "grad_norm": 0.15227986749806005,
723
- "learning_rate": 9.961119000765531e-05,
724
- "loss": 0.6894,
725
  "step": 495
726
  },
727
  {
728
  "epoch": 0.5562508691419831,
729
- "grad_norm": 0.12847199230412348,
730
- "learning_rate": 9.766734576352478e-05,
731
- "loss": 0.6964,
732
  "step": 500
733
  },
734
  {
735
  "epoch": 0.5618133778334029,
736
- "grad_norm": 0.13064877508080042,
737
- "learning_rate": 9.572438308314446e-05,
738
- "loss": 0.6885,
739
  "step": 505
740
  },
741
  {
742
  "epoch": 0.5673758865248227,
743
- "grad_norm": 0.1372527364951275,
744
- "learning_rate": 9.378303625685195e-05,
745
- "loss": 0.7056,
746
  "step": 510
747
  },
748
  {
749
  "epoch": 0.5729383952162426,
750
- "grad_norm": 0.13318320240144188,
751
- "learning_rate": 9.18440389643165e-05,
752
- "loss": 0.6792,
753
  "step": 515
754
  },
755
  {
756
  "epoch": 0.5785009039076624,
757
- "grad_norm": 0.1389287540275015,
758
- "learning_rate": 8.990812399726435e-05,
759
- "loss": 0.6955,
760
  "step": 520
761
  },
762
  {
763
  "epoch": 0.5840634125990822,
764
- "grad_norm": 0.15589038738911026,
765
- "learning_rate": 8.797602298254004e-05,
766
- "loss": 0.6995,
767
  "step": 525
768
  },
769
  {
770
  "epoch": 0.5896259212905021,
771
- "grad_norm": 0.13296983699316012,
772
- "learning_rate": 8.604846610560771e-05,
773
- "loss": 0.6707,
774
  "step": 530
775
  },
776
  {
777
  "epoch": 0.5951884299819219,
778
- "grad_norm": 0.12114356841208347,
779
- "learning_rate": 8.412618183459708e-05,
780
- "loss": 0.7015,
781
  "step": 535
782
  },
783
  {
784
  "epoch": 0.6007509386733417,
785
- "grad_norm": 0.12351709388346521,
786
- "learning_rate": 8.220989664499878e-05,
787
- "loss": 0.7006,
788
  "step": 540
789
  },
790
  {
791
  "epoch": 0.6063134473647616,
792
- "grad_norm": 0.1233117181776912,
793
- "learning_rate": 8.030033474511249e-05,
794
- "loss": 0.7061,
795
  "step": 545
796
  },
797
  {
798
  "epoch": 0.6118759560561814,
799
- "grad_norm": 0.14966743246815337,
800
- "learning_rate": 7.839821780235168e-05,
801
- "loss": 0.7383,
802
  "step": 550
803
  },
804
  {
805
  "epoch": 0.6174384647476012,
806
- "grad_norm": 0.13689976425741557,
807
- "learning_rate": 7.650426467050926e-05,
808
- "loss": 0.7083,
809
  "step": 555
810
  },
811
  {
812
  "epoch": 0.6230009734390209,
813
- "grad_norm": 0.1540376322278511,
814
- "learning_rate": 7.461919111808595e-05,
815
- "loss": 0.6991,
816
  "step": 560
817
  },
818
  {
819
  "epoch": 0.6285634821304408,
820
- "grad_norm": 0.15818277915085305,
821
- "learning_rate": 7.274370955778498e-05,
822
- "loss": 0.7147,
823
  "step": 565
824
  },
825
  {
826
  "epoch": 0.6341259908218606,
827
- "grad_norm": 0.13733946492240742,
828
- "learning_rate": 7.087852877727481e-05,
829
- "loss": 0.7043,
830
  "step": 570
831
  },
832
  {
833
  "epoch": 0.6396884995132804,
834
- "grad_norm": 0.12905771101166952,
835
- "learning_rate": 6.902435367132208e-05,
836
- "loss": 0.6952,
837
  "step": 575
838
  },
839
  {
840
  "epoch": 0.6452510082047003,
841
- "grad_norm": 0.14917864740623707,
842
- "learning_rate": 6.718188497539554e-05,
843
- "loss": 0.6894,
844
  "step": 580
845
  },
846
  {
847
  "epoch": 0.6508135168961201,
848
- "grad_norm": 0.17034610159423622,
849
- "learning_rate": 6.535181900084206e-05,
850
- "loss": 0.6906,
851
  "step": 585
852
  },
853
  {
854
  "epoch": 0.6563760255875399,
855
- "grad_norm": 0.14577692793179953,
856
- "learning_rate": 6.35348473717345e-05,
857
- "loss": 0.6822,
858
  "step": 590
859
  },
860
  {
861
  "epoch": 0.6619385342789598,
862
- "grad_norm": 0.1400663188779511,
863
- "learning_rate": 6.173165676349103e-05,
864
- "loss": 0.702,
865
  "step": 595
866
  },
867
  {
868
  "epoch": 0.6675010429703796,
869
- "grad_norm": 0.13477519960817833,
870
- "learning_rate": 5.9942928643364724e-05,
871
- "loss": 0.7171,
872
  "step": 600
873
  },
874
  {
875
  "epoch": 0.6675010429703796,
876
- "eval_loss": 0.6785927414894104,
877
- "eval_runtime": 38.1279,
878
- "eval_samples_per_second": 10.675,
879
- "eval_steps_per_second": 0.682,
880
  "step": 600
881
  },
882
  {
883
  "epoch": 0.6730635516617994,
884
- "grad_norm": 0.13058646321017944,
885
- "learning_rate": 5.816933901290136e-05,
886
- "loss": 0.6959,
887
  "step": 605
888
  },
889
  {
890
  "epoch": 0.6786260603532193,
891
- "grad_norm": 0.14039443258781562,
892
- "learning_rate": 5.6411558152462894e-05,
893
- "loss": 0.6991,
894
  "step": 610
895
  },
896
  {
897
  "epoch": 0.6841885690446391,
898
- "grad_norm": 0.1435099198472092,
899
- "learning_rate": 5.4670250367913023e-05,
900
- "loss": 0.7008,
901
  "step": 615
902
  },
903
  {
904
  "epoch": 0.6897510777360589,
905
- "grad_norm": 0.1251912516827599,
906
- "learning_rate": 5.2946073739560706e-05,
907
- "loss": 0.6828,
908
  "step": 620
909
  },
910
  {
911
  "epoch": 0.6953135864274788,
912
- "grad_norm": 0.12907022711465035,
913
- "learning_rate": 5.1239679873456634e-05,
914
- "loss": 0.6811,
915
  "step": 625
916
  },
917
  {
918
  "epoch": 0.7008760951188986,
919
- "grad_norm": 0.13237387347735238,
920
- "learning_rate": 4.955171365513603e-05,
921
- "loss": 0.7235,
922
  "step": 630
923
  },
924
  {
925
  "epoch": 0.7064386038103184,
926
- "grad_norm": 0.12376618467656812,
927
- "learning_rate": 4.7882813005901696e-05,
928
- "loss": 0.687,
929
  "step": 635
930
  },
931
  {
932
  "epoch": 0.7120011125017383,
933
- "grad_norm": 0.1456941018180003,
934
- "learning_rate": 4.623360864173893e-05,
935
- "loss": 0.6813,
936
  "step": 640
937
  },
938
  {
939
  "epoch": 0.7175636211931581,
940
- "grad_norm": 0.17032008806579718,
941
- "learning_rate": 4.460472383495331e-05,
942
- "loss": 0.6795,
943
  "step": 645
944
  },
945
  {
946
  "epoch": 0.7231261298845779,
947
- "grad_norm": 0.12579166198761085,
948
- "learning_rate": 4.2996774178621736e-05,
949
- "loss": 0.6835,
950
  "step": 650
951
  },
952
  {
953
  "epoch": 0.7286886385759977,
954
- "grad_norm": 0.14450683290791613,
955
- "learning_rate": 4.141036735394574e-05,
956
- "loss": 0.6862,
957
  "step": 655
958
  },
959
  {
960
  "epoch": 0.7342511472674176,
961
- "grad_norm": 0.17119446089706108,
962
- "learning_rate": 3.984610290059467e-05,
963
- "loss": 0.7112,
964
  "step": 660
965
  },
966
  {
967
  "epoch": 0.7398136559588374,
968
- "grad_norm": 0.14950992219628134,
969
- "learning_rate": 3.830457199012585e-05,
970
- "loss": 0.7259,
971
  "step": 665
972
  },
973
  {
974
  "epoch": 0.7453761646502572,
975
- "grad_norm": 0.14485285236277412,
976
- "learning_rate": 3.678635720256737e-05,
977
- "loss": 0.6944,
978
  "step": 670
979
  },
980
  {
981
  "epoch": 0.7509386733416771,
982
- "grad_norm": 0.13749155498971605,
983
- "learning_rate": 3.529203230624747e-05,
984
- "loss": 0.6758,
985
  "step": 675
986
  },
987
  {
988
  "epoch": 0.7565011820330969,
989
- "grad_norm": 0.12869094791923574,
990
- "learning_rate": 3.3822162040954354e-05,
991
- "loss": 0.701,
992
  "step": 680
993
  },
994
  {
995
  "epoch": 0.7620636907245167,
996
- "grad_norm": 0.12906896257797407,
997
- "learning_rate": 3.237730190450816e-05,
998
- "loss": 0.7059,
999
  "step": 685
1000
  },
1001
  {
1002
  "epoch": 0.7676261994159366,
1003
- "grad_norm": 0.15164612968237504,
1004
- "learning_rate": 3.0957997942825336e-05,
1005
- "loss": 0.6925,
1006
  "step": 690
1007
  },
1008
  {
1009
  "epoch": 0.7731887081073564,
1010
- "grad_norm": 0.15092417362430902,
1011
- "learning_rate": 2.9564786543555388e-05,
1012
- "loss": 0.704,
1013
  "step": 695
1014
  },
1015
  {
1016
  "epoch": 0.7787512167987762,
1017
- "grad_norm": 0.12028807778273494,
1018
- "learning_rate": 2.819819423336775e-05,
1019
- "loss": 0.6615,
1020
  "step": 700
1021
  },
1022
  {
1023
  "epoch": 0.7843137254901961,
1024
- "grad_norm": 0.11668516511159772,
1025
- "learning_rate": 2.6858737478965035e-05,
1026
- "loss": 0.6761,
1027
  "step": 705
1028
  },
1029
  {
1030
  "epoch": 0.7898762341816159,
1031
- "grad_norm": 0.12013736025525155,
1032
- "learning_rate": 2.5546922491898495e-05,
1033
- "loss": 0.6812,
1034
  "step": 710
1035
  },
1036
  {
1037
  "epoch": 0.7954387428730357,
1038
- "grad_norm": 0.11406481330250699,
1039
- "learning_rate": 2.4263245037258995e-05,
1040
- "loss": 0.6653,
1041
  "step": 715
1042
  },
1043
  {
1044
  "epoch": 0.8010012515644556,
1045
- "grad_norm": 0.14003374126540685,
1046
- "learning_rate": 2.300819024631603e-05,
1047
- "loss": 0.7079,
1048
  "step": 720
1049
  },
1050
  {
1051
  "epoch": 0.8065637602558754,
1052
- "grad_norm": 0.14609209804734563,
1053
- "learning_rate": 2.178223243317532e-05,
1054
- "loss": 0.7126,
1055
  "step": 725
1056
  },
1057
  {
1058
  "epoch": 0.8121262689472952,
1059
- "grad_norm": 0.1412723606655406,
1060
- "learning_rate": 2.058583491552465e-05,
1061
- "loss": 0.6805,
1062
  "step": 730
1063
  },
1064
  {
1065
  "epoch": 0.8176887776387151,
1066
- "grad_norm": 0.13559476815952162,
1067
- "learning_rate": 1.941944983953552e-05,
1068
- "loss": 0.6687,
1069
  "step": 735
1070
  },
1071
  {
1072
  "epoch": 0.8232512863301349,
1073
- "grad_norm": 0.1212687591445283,
1074
- "learning_rate": 1.8283518008986567e-05,
1075
- "loss": 0.6949,
1076
  "step": 740
1077
  },
1078
  {
1079
  "epoch": 0.8288137950215547,
1080
- "grad_norm": 0.13418537722622348,
1081
- "learning_rate": 1.7178468718673714e-05,
1082
- "loss": 0.6947,
1083
  "step": 745
1084
  },
1085
  {
1086
  "epoch": 0.8343763037129746,
1087
- "grad_norm": 0.14626468131094617,
1088
- "learning_rate": 1.6104719592169902e-05,
1089
- "loss": 0.6974,
1090
  "step": 750
1091
  },
1092
  {
1093
  "epoch": 0.8399388124043944,
1094
- "grad_norm": 0.1505968834590085,
1095
- "learning_rate": 1.5062676423995247e-05,
1096
- "loss": 0.6699,
1097
  "step": 755
1098
  },
1099
  {
1100
  "epoch": 0.8455013210958142,
1101
- "grad_norm": 0.13840692663705956,
1102
- "learning_rate": 1.4052733026258281e-05,
1103
- "loss": 0.6775,
1104
  "step": 760
1105
  },
1106
  {
1107
  "epoch": 0.851063829787234,
1108
- "grad_norm": 0.15338747206444608,
1109
- "learning_rate": 1.3075271079825036e-05,
1110
- "loss": 0.6855,
1111
  "step": 765
1112
  },
1113
  {
1114
  "epoch": 0.8566263384786539,
1115
- "grad_norm": 0.15087298168068788,
1116
- "learning_rate": 1.2130659990073146e-05,
1117
- "loss": 0.7074,
1118
  "step": 770
1119
  },
1120
  {
1121
  "epoch": 0.8621888471700737,
1122
- "grad_norm": 0.15260125382082407,
1123
- "learning_rate": 1.1219256747285045e-05,
1124
- "loss": 0.6559,
1125
  "step": 775
1126
  },
1127
  {
1128
  "epoch": 0.8677513558614935,
1129
- "grad_norm": 0.13676984708600298,
1130
- "learning_rate": 1.0341405791733183e-05,
1131
- "loss": 0.7018,
1132
  "step": 780
1133
  },
1134
  {
1135
  "epoch": 0.8733138645529134,
1136
- "grad_norm": 0.12792881953620902,
1137
- "learning_rate": 9.49743888350798e-06,
1138
- "loss": 0.6758,
1139
  "step": 785
1140
  },
1141
  {
1142
  "epoch": 0.8788763732443332,
1143
- "grad_norm": 0.13024278184248392,
1144
- "learning_rate": 8.687674977138116e-06,
1145
- "loss": 0.703,
1146
  "step": 790
1147
  },
1148
  {
1149
  "epoch": 0.884438881935753,
1150
- "grad_norm": 0.15819074719833112,
1151
- "learning_rate": 7.912420101050367e-06,
1152
- "loss": 0.7026,
1153
  "step": 795
1154
  },
1155
  {
1156
  "epoch": 0.8900013906271729,
1157
- "grad_norm": 0.1363306447853743,
1158
- "learning_rate": 7.171967241914224e-06,
1159
- "loss": 0.7031,
1160
  "step": 800
1161
  },
1162
  {
1163
  "epoch": 0.8900013906271729,
1164
- "eval_loss": 0.6755391955375671,
1165
- "eval_runtime": 38.1439,
1166
- "eval_samples_per_second": 10.67,
1167
- "eval_steps_per_second": 0.682,
1168
  "step": 800
1169
  },
1170
  {
1171
  "epoch": 0.8955638993185927,
1172
- "grad_norm": 0.11599580684218645,
1173
- "learning_rate": 6.4665962339156005e-06,
1174
- "loss": 0.6559,
1175
  "step": 805
1176
  },
1177
  {
1178
  "epoch": 0.9011264080100125,
1179
- "grad_norm": 0.14301150354946243,
1180
- "learning_rate": 5.7965736530010916e-06,
1181
- "loss": 0.7436,
1182
  "step": 810
1183
  },
1184
  {
1185
  "epoch": 0.9066889167014324,
1186
- "grad_norm": 0.1454357862013425,
1187
- "learning_rate": 5.162152716132662e-06,
1188
- "loss": 0.6851,
1189
  "step": 815
1190
  },
1191
  {
1192
  "epoch": 0.9122514253928522,
1193
- "grad_norm": 0.1296885702439703,
1194
- "learning_rate": 4.563573185591219e-06,
1195
- "loss": 0.6799,
1196
  "step": 820
1197
  },
1198
  {
1199
  "epoch": 0.917813934084272,
1200
- "grad_norm": 0.13649350003605498,
1201
- "learning_rate": 4.0010612783648925e-06,
1202
- "loss": 0.6954,
1203
  "step": 825
1204
  },
1205
  {
1206
  "epoch": 0.9233764427756919,
1207
- "grad_norm": 0.11566680964002608,
1208
- "learning_rate": 3.4748295806564356e-06,
1209
- "loss": 0.6931,
1210
  "step": 830
1211
  },
1212
  {
1213
  "epoch": 0.9289389514671117,
1214
- "grad_norm": 0.1376951533333156,
1215
- "learning_rate": 2.9850769675419774e-06,
1216
- "loss": 0.7094,
1217
  "step": 835
1218
  },
1219
  {
1220
  "epoch": 0.9345014601585315,
1221
- "grad_norm": 0.13815883208860893,
1222
- "learning_rate": 2.5319885278115906e-06,
1223
- "loss": 0.6814,
1224
  "step": 840
1225
  },
1226
  {
1227
  "epoch": 0.9400639688499514,
1228
- "grad_norm": 0.12898446369151903,
1229
- "learning_rate": 2.115735494019966e-06,
1230
- "loss": 0.7252,
1231
  "step": 845
1232
  },
1233
  {
1234
  "epoch": 0.9456264775413712,
1235
- "grad_norm": 0.14801072441410335,
1236
- "learning_rate": 1.7364751777736332e-06,
1237
- "loss": 0.6701,
1238
  "step": 850
1239
  },
1240
  {
1241
  "epoch": 0.951188986232791,
1242
- "grad_norm": 0.1489167549426156,
1243
- "learning_rate": 1.394350910279385e-06,
1244
- "loss": 0.6852,
1245
  "step": 855
1246
  },
1247
  {
1248
  "epoch": 0.9567514949242109,
1249
- "grad_norm": 0.150107558255733,
1250
- "learning_rate": 1.089491988176017e-06,
1251
- "loss": 0.7177,
1252
  "step": 860
1253
  },
1254
  {
1255
  "epoch": 0.9623140036156307,
1256
- "grad_norm": 0.1462660934588179,
1257
- "learning_rate": 8.220136246701926e-07,
1258
- "loss": 0.6665,
1259
  "step": 865
1260
  },
1261
  {
1262
  "epoch": 0.9678765123070505,
1263
- "grad_norm": 0.13216262862731915,
1264
- "learning_rate": 5.920169059947411e-07,
1265
- "loss": 0.666,
1266
  "step": 870
1267
  },
1268
  {
1269
  "epoch": 0.9734390209984704,
1270
- "grad_norm": 0.1497443816075182,
1271
- "learning_rate": 3.9958875320580404e-07,
1272
- "loss": 0.6902,
1273
  "step": 875
1274
  },
1275
  {
1276
  "epoch": 0.9790015296898902,
1277
- "grad_norm": 0.1329859440152703,
1278
- "learning_rate": 2.448018893333681e-07,
1279
- "loss": 0.6959,
1280
  "step": 880
1281
  },
1282
  {
1283
  "epoch": 0.98456403838131,
1284
- "grad_norm": 0.14839381526763562,
1285
- "learning_rate": 1.277148118975835e-07,
1286
- "loss": 0.6884,
1287
  "step": 885
1288
  },
1289
  {
1290
  "epoch": 0.9901265470727298,
1291
- "grad_norm": 0.14357565386768453,
1292
- "learning_rate": 4.837177080119215e-08,
1293
- "loss": 0.6961,
1294
  "step": 890
1295
  },
1296
  {
1297
  "epoch": 0.9956890557641497,
1298
- "grad_norm": 0.12423764687193309,
1299
- "learning_rate": 6.8027516064606e-09,
1300
- "loss": 0.6802,
1301
  "step": 895
1302
  },
1303
  {
1304
- "epoch": 0.9990265609790016,
1305
- "step": 898,
1306
- "total_flos": 3899966689378304.0,
1307
- "train_loss": 0.7223505903458542,
1308
- "train_runtime": 18859.9671,
1309
- "train_samples_per_second": 3.05,
1310
  "train_steps_per_second": 0.048
1311
  }
1312
  ],
1313
  "logging_steps": 5,
1314
- "max_steps": 898,
1315
  "num_input_tokens_seen": 0,
1316
- "num_train_epochs": 1,
1317
  "save_steps": 100,
1318
- "total_flos": 3899966689378304.0,
1319
  "train_batch_size": 2,
1320
  "trial_name": null,
1321
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.998053121958003,
5
  "eval_steps": 200,
6
+ "global_step": 1796,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.001112501738283966,
13
+ "grad_norm": 0.6684235958622939,
14
+ "learning_rate": 1.1111111111111112e-06,
15
  "loss": 1.1751,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.00556250869141983,
20
+ "grad_norm": 0.7610951352005912,
21
+ "learning_rate": 5.555555555555556e-06,
22
+ "loss": 1.2823,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01112501738283966,
27
+ "grad_norm": 0.8690361671075456,
28
+ "learning_rate": 1.1111111111111112e-05,
29
+ "loss": 1.2894,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.016687526074259492,
34
+ "grad_norm": 1.059736284119422,
35
+ "learning_rate": 1.6666666666666667e-05,
36
+ "loss": 1.3166,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.02225003476567932,
41
+ "grad_norm": 0.4906355740047051,
42
+ "learning_rate": 2.2222222222222223e-05,
43
+ "loss": 1.1771,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.027812543457099152,
48
+ "grad_norm": 1.04376108340149,
49
+ "learning_rate": 2.777777777777778e-05,
50
+ "loss": 1.1062,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.033375052148518984,
55
+ "grad_norm": 0.47593208759368216,
56
+ "learning_rate": 3.3333333333333335e-05,
57
+ "loss": 1.014,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.03893756083993881,
62
+ "grad_norm": 0.2526378868894225,
63
+ "learning_rate": 3.888888888888889e-05,
64
+ "loss": 0.9027,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.04450006953135864,
69
+ "grad_norm": 0.2750620823042748,
70
+ "learning_rate": 4.4444444444444447e-05,
71
+ "loss": 0.8556,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.05006257822277847,
76
+ "grad_norm": 0.22806716350963602,
77
+ "learning_rate": 5e-05,
78
+ "loss": 0.8714,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.055625086914198305,
83
+ "grad_norm": 0.31305577393443185,
84
+ "learning_rate": 5.555555555555556e-05,
85
+ "loss": 0.8431,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.061187595605618136,
90
+ "grad_norm": 0.3231781586973102,
91
+ "learning_rate": 6.111111111111112e-05,
92
+ "loss": 0.7896,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.06675010429703797,
97
+ "grad_norm": 0.2652381898767101,
98
+ "learning_rate": 6.666666666666667e-05,
99
+ "loss": 0.8031,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.0723126129884578,
104
+ "grad_norm": 0.16353190746623922,
105
+ "learning_rate": 7.222222222222222e-05,
106
+ "loss": 0.798,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.07787512167987762,
111
+ "grad_norm": 0.14957803129107605,
112
+ "learning_rate": 7.777777777777778e-05,
113
+ "loss": 0.8214,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.08343763037129745,
118
+ "grad_norm": 0.1558832457817548,
119
+ "learning_rate": 8.333333333333334e-05,
120
+ "loss": 0.8046,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.08900013906271728,
125
+ "grad_norm": 0.15611943332427167,
126
+ "learning_rate": 8.888888888888889e-05,
127
+ "loss": 0.7725,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.09456264775413711,
132
+ "grad_norm": 0.17878911552246932,
133
+ "learning_rate": 9.444444444444444e-05,
134
+ "loss": 0.758,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.10012515644555695,
139
+ "grad_norm": 0.21152631925896626,
140
+ "learning_rate": 0.0001,
141
+ "loss": 0.7741,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.10568766513697678,
146
+ "grad_norm": 0.15798064484261687,
147
+ "learning_rate": 0.00010555555555555557,
148
+ "loss": 0.7411,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.11125017382839661,
153
+ "grad_norm": 0.1770136407809846,
154
+ "learning_rate": 0.00011111111111111112,
155
+ "loss": 0.7582,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.11681268251981644,
160
+ "grad_norm": 0.1721697060786133,
161
+ "learning_rate": 0.00011666666666666668,
162
+ "loss": 0.7908,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.12237519121123627,
167
+ "grad_norm": 0.19163940220247466,
168
+ "learning_rate": 0.00012222222222222224,
169
+ "loss": 0.7769,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.1279376999026561,
174
+ "grad_norm": 0.17258655274801674,
175
+ "learning_rate": 0.00012777777777777776,
176
+ "loss": 0.7291,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.13350020859407594,
181
+ "grad_norm": 0.14647192105198578,
182
+ "learning_rate": 0.00013333333333333334,
183
+ "loss": 0.7147,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.13906271728549577,
188
+ "grad_norm": 0.15483066376892327,
189
+ "learning_rate": 0.0001388888888888889,
190
+ "loss": 0.7253,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.1446252259769156,
195
+ "grad_norm": 0.15985722819525555,
196
+ "learning_rate": 0.00014444444444444444,
197
+ "loss": 0.74,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.15018773466833543,
202
+ "grad_norm": 0.18677759796975052,
203
+ "learning_rate": 0.00015000000000000001,
204
+ "loss": 0.7578,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.15575024335975524,
209
+ "grad_norm": 0.1502765821288928,
210
+ "learning_rate": 0.00015555555555555556,
211
+ "loss": 0.7541,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.16131275205117507,
216
+ "grad_norm": 0.16754721310239581,
217
+ "learning_rate": 0.0001611111111111111,
218
+ "loss": 0.7338,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.1668752607425949,
223
+ "grad_norm": 0.15603842447844732,
224
+ "learning_rate": 0.0001666666666666667,
225
+ "loss": 0.7347,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.17243776943401473,
230
+ "grad_norm": 0.16036041756367805,
231
+ "learning_rate": 0.00017222222222222224,
232
+ "loss": 0.7275,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.17800027812543456,
237
+ "grad_norm": 0.17275116566192827,
238
+ "learning_rate": 0.00017777777777777779,
239
+ "loss": 0.7357,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.1835627868168544,
244
+ "grad_norm": 0.16362661400179987,
245
+ "learning_rate": 0.00018333333333333334,
246
+ "loss": 0.7454,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.18912529550827423,
251
+ "grad_norm": 0.1529061378986909,
252
+ "learning_rate": 0.00018888888888888888,
253
+ "loss": 0.7305,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.19468780419969406,
258
+ "grad_norm": 0.16390278757569096,
259
+ "learning_rate": 0.00019444444444444446,
260
+ "loss": 0.7383,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.2002503128911139,
265
+ "grad_norm": 0.16834812585150158,
266
+ "learning_rate": 0.0002,
267
+ "loss": 0.7067,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.20581282158253372,
272
+ "grad_norm": 0.15601116057262598,
273
+ "learning_rate": 0.0001999952758505736,
274
+ "loss": 0.7567,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.21137533027395355,
279
+ "grad_norm": 0.15788111366123353,
280
+ "learning_rate": 0.00019998110384864614,
281
+ "loss": 0.7434,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.2169378389653734,
286
+ "grad_norm": 0.16327699685803637,
287
+ "learning_rate": 0.00019995748533323075,
288
+ "loss": 0.7087,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.22250034765679322,
293
+ "grad_norm": 0.14758830274997248,
294
+ "learning_rate": 0.0001999244225358753,
295
+ "loss": 0.718,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.22250034765679322,
300
+ "eval_loss": 0.7089951634407043,
301
+ "eval_runtime": 38.6391,
302
+ "eval_samples_per_second": 10.533,
303
+ "eval_steps_per_second": 0.673,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.22806285634821305,
308
+ "grad_norm": 0.16285906061792327,
309
+ "learning_rate": 0.00019988191858045178,
310
+ "loss": 0.7323,
311
  "step": 205
312
  },
313
  {
314
  "epoch": 0.23362536503963288,
315
+ "grad_norm": 0.15445314974456065,
316
+ "learning_rate": 0.00019982997748286082,
317
+ "loss": 0.7427,
318
  "step": 210
319
  },
320
  {
321
  "epoch": 0.2391878737310527,
322
+ "grad_norm": 0.14969344173380988,
323
+ "learning_rate": 0.00019976860415065256,
324
+ "loss": 0.7171,
325
  "step": 215
326
  },
327
  {
328
  "epoch": 0.24475038242247255,
329
+ "grad_norm": 0.15378571057430904,
330
+ "learning_rate": 0.00019969780438256293,
331
+ "loss": 0.726,
332
  "step": 220
333
  },
334
  {
335
  "epoch": 0.2503128911138924,
336
+ "grad_norm": 0.1655361908944567,
337
+ "learning_rate": 0.00019961758486796555,
338
+ "loss": 0.7463,
339
  "step": 225
340
  },
341
  {
342
  "epoch": 0.2558753998053122,
343
+ "grad_norm": 0.1343012669489156,
344
+ "learning_rate": 0.00019952795318623986,
345
+ "loss": 0.7115,
346
  "step": 230
347
  },
348
  {
349
  "epoch": 0.26143790849673204,
350
+ "grad_norm": 0.1597015078714079,
351
+ "learning_rate": 0.00019942891780605512,
352
+ "loss": 0.6901,
353
  "step": 235
354
  },
355
  {
356
  "epoch": 0.2670004171881519,
357
+ "grad_norm": 0.13156659502510382,
358
+ "learning_rate": 0.0001993204880845699,
359
+ "loss": 0.6974,
360
  "step": 240
361
  },
362
  {
363
  "epoch": 0.2725629258795717,
364
+ "grad_norm": 0.13487041292813268,
365
+ "learning_rate": 0.0001992026742665484,
366
+ "loss": 0.7264,
367
  "step": 245
368
  },
369
  {
370
  "epoch": 0.27812543457099154,
371
+ "grad_norm": 0.1452928301770259,
372
+ "learning_rate": 0.00019907548748339222,
373
+ "loss": 0.7132,
374
  "step": 250
375
  },
376
  {
377
  "epoch": 0.28368794326241137,
378
+ "grad_norm": 0.13005536436409637,
379
+ "learning_rate": 0.00019893893975208874,
380
+ "loss": 0.6924,
381
  "step": 255
382
  },
383
  {
384
  "epoch": 0.2892504519538312,
385
+ "grad_norm": 0.15551390193086698,
386
+ "learning_rate": 0.0001987930439740757,
387
+ "loss": 0.7137,
388
  "step": 260
389
  },
390
  {
391
  "epoch": 0.29481296064525103,
392
+ "grad_norm": 0.1411196790735591,
393
+ "learning_rate": 0.0001986378139340222,
394
+ "loss": 0.7003,
395
  "step": 265
396
  },
397
  {
398
  "epoch": 0.30037546933667086,
399
+ "grad_norm": 0.16701635405751747,
400
+ "learning_rate": 0.0001984732642985263,
401
+ "loss": 0.7318,
402
  "step": 270
403
  },
404
  {
405
  "epoch": 0.3059379780280907,
406
+ "grad_norm": 0.15781390142645427,
407
+ "learning_rate": 0.00019829941061472943,
408
+ "loss": 0.725,
409
  "step": 275
410
  },
411
  {
412
  "epoch": 0.31150048671951047,
413
+ "grad_norm": 0.1276451211299801,
414
+ "learning_rate": 0.0001981162693088471,
415
+ "loss": 0.7008,
416
  "step": 280
417
  },
418
  {
419
  "epoch": 0.3170629954109303,
420
+ "grad_norm": 0.14226714916323183,
421
+ "learning_rate": 0.00019792385768461723,
422
+ "loss": 0.7215,
423
  "step": 285
424
  },
425
  {
426
  "epoch": 0.32262550410235014,
427
+ "grad_norm": 0.15169879239239556,
428
+ "learning_rate": 0.00019772219392166519,
429
+ "loss": 0.7298,
430
  "step": 290
431
  },
432
  {
433
  "epoch": 0.32818801279376997,
434
+ "grad_norm": 0.13790872011113864,
435
+ "learning_rate": 0.00019751129707378583,
436
+ "loss": 0.7024,
437
  "step": 295
438
  },
439
  {
440
  "epoch": 0.3337505214851898,
441
+ "grad_norm": 0.13510983391572584,
442
+ "learning_rate": 0.00019729118706714375,
443
+ "loss": 0.7301,
444
  "step": 300
445
  },
446
  {
447
  "epoch": 0.33931303017660963,
448
+ "grad_norm": 0.13421290102386768,
449
+ "learning_rate": 0.00019706188469839012,
450
+ "loss": 0.6973,
451
  "step": 305
452
  },
453
  {
454
  "epoch": 0.34487553886802946,
455
+ "grad_norm": 0.17987364390951446,
456
+ "learning_rate": 0.000196823411632698,
457
+ "loss": 0.7285,
458
  "step": 310
459
  },
460
  {
461
  "epoch": 0.3504380475594493,
462
+ "grad_norm": 0.1463794049975603,
463
+ "learning_rate": 0.00019657579040171537,
464
+ "loss": 0.7474,
465
  "step": 315
466
  },
467
  {
468
  "epoch": 0.3560005562508691,
469
+ "grad_norm": 0.14423415874709103,
470
+ "learning_rate": 0.00019631904440143612,
471
+ "loss": 0.6938,
472
  "step": 320
473
  },
474
  {
475
  "epoch": 0.36156306494228896,
476
+ "grad_norm": 0.11858366900169025,
477
+ "learning_rate": 0.00019605319788998965,
478
+ "loss": 0.6845,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 0.3671255736337088,
483
+ "grad_norm": 0.14664958369445155,
484
+ "learning_rate": 0.00019577827598534885,
485
+ "loss": 0.7085,
486
  "step": 330
487
  },
488
  {
489
  "epoch": 0.3726880823251286,
490
+ "grad_norm": 0.16873527257647247,
491
+ "learning_rate": 0.0001954943046629569,
492
+ "loss": 0.717,
493
  "step": 335
494
  },
495
  {
496
  "epoch": 0.37825059101654845,
497
+ "grad_norm": 0.1486177299286815,
498
+ "learning_rate": 0.00019520131075327298,
499
+ "loss": 0.7321,
500
  "step": 340
501
  },
502
  {
503
  "epoch": 0.3838130997079683,
504
+ "grad_norm": 0.13978486223349953,
505
+ "learning_rate": 0.00019489932193923735,
506
+ "loss": 0.7258,
507
  "step": 345
508
  },
509
  {
510
  "epoch": 0.3893756083993881,
511
+ "grad_norm": 0.15478013993645653,
512
+ "learning_rate": 0.00019458836675365556,
513
+ "loss": 0.7147,
514
  "step": 350
515
  },
516
  {
517
  "epoch": 0.39493811709080795,
518
+ "grad_norm": 0.1335712601381255,
519
+ "learning_rate": 0.00019426847457650292,
520
+ "loss": 0.705,
521
  "step": 355
522
  },
523
  {
524
  "epoch": 0.4005006257822278,
525
+ "grad_norm": 0.14215220233504217,
526
+ "learning_rate": 0.00019393967563214833,
527
+ "loss": 0.7109,
528
  "step": 360
529
  },
530
  {
531
  "epoch": 0.4060631344736476,
532
+ "grad_norm": 0.12737803269805706,
533
+ "learning_rate": 0.00019360200098649864,
534
+ "loss": 0.6759,
535
  "step": 365
536
  },
537
  {
538
  "epoch": 0.41162564316506745,
539
+ "grad_norm": 0.14212696306551803,
540
+ "learning_rate": 0.00019325548254406352,
541
+ "loss": 0.6884,
542
  "step": 370
543
  },
544
  {
545
  "epoch": 0.4171881518564873,
546
+ "grad_norm": 0.13427514345248226,
547
+ "learning_rate": 0.00019290015304494103,
548
+ "loss": 0.6997,
549
  "step": 375
550
  },
551
  {
552
  "epoch": 0.4227506605479071,
553
+ "grad_norm": 0.13648211126654036,
554
+ "learning_rate": 0.00019253604606172417,
555
+ "loss": 0.7211,
556
  "step": 380
557
  },
558
  {
559
  "epoch": 0.42831316923932694,
560
+ "grad_norm": 0.1328171684670597,
561
+ "learning_rate": 0.0001921631959963288,
562
+ "loss": 0.6879,
563
  "step": 385
564
  },
565
  {
566
  "epoch": 0.4338756779307468,
567
+ "grad_norm": 0.13003415855983963,
568
+ "learning_rate": 0.0001917816380767434,
569
+ "loss": 0.7164,
570
  "step": 390
571
  },
572
  {
573
  "epoch": 0.4394381866221666,
574
+ "grad_norm": 0.12331669293330129,
575
+ "learning_rate": 0.00019139140835370053,
576
+ "loss": 0.6782,
577
  "step": 395
578
  },
579
  {
580
  "epoch": 0.44500069531358644,
581
+ "grad_norm": 0.12354925404653357,
582
+ "learning_rate": 0.0001909925436972706,
583
+ "loss": 0.7205,
584
  "step": 400
585
  },
586
  {
587
  "epoch": 0.44500069531358644,
588
+ "eval_loss": 0.6896927952766418,
589
+ "eval_runtime": 38.0771,
590
+ "eval_samples_per_second": 10.689,
591
+ "eval_steps_per_second": 0.683,
592
  "step": 400
593
  },
594
  {
595
  "epoch": 0.45056320400500627,
596
+ "grad_norm": 0.12749730782556348,
597
+ "learning_rate": 0.0001905850817933784,
598
+ "loss": 0.7147,
599
  "step": 405
600
  },
601
  {
602
  "epoch": 0.4561257126964261,
603
+ "grad_norm": 0.1290830353607145,
604
+ "learning_rate": 0.0001901690611402423,
605
+ "loss": 0.7147,
606
  "step": 410
607
  },
608
  {
609
  "epoch": 0.46168822138784593,
610
+ "grad_norm": 0.14656956459115003,
611
+ "learning_rate": 0.0001897445210447369,
612
+ "loss": 0.7273,
613
  "step": 415
614
  },
615
  {
616
  "epoch": 0.46725073007926576,
617
+ "grad_norm": 0.13591979089604955,
618
+ "learning_rate": 0.00018931150161867916,
619
+ "loss": 0.7233,
620
  "step": 420
621
  },
622
  {
623
  "epoch": 0.4728132387706856,
624
+ "grad_norm": 0.14148246469362974,
625
+ "learning_rate": 0.00018887004377503858,
626
+ "loss": 0.6857,
627
  "step": 425
628
  },
629
  {
630
  "epoch": 0.4783757474621054,
631
+ "grad_norm": 0.13172632177206697,
632
+ "learning_rate": 0.0001884201892240715,
633
+ "loss": 0.6954,
634
  "step": 430
635
  },
636
  {
637
  "epoch": 0.48393825615352526,
638
+ "grad_norm": 0.15251049696040847,
639
+ "learning_rate": 0.00018796198046938033,
640
+ "loss": 0.7369,
641
  "step": 435
642
  },
643
  {
644
  "epoch": 0.4895007648449451,
645
+ "grad_norm": 0.14157664597377279,
646
+ "learning_rate": 0.00018749546080389757,
647
+ "loss": 0.6988,
648
  "step": 440
649
  },
650
  {
651
  "epoch": 0.4950632735363649,
652
+ "grad_norm": 0.14790889100169335,
653
+ "learning_rate": 0.00018702067430579543,
654
+ "loss": 0.7068,
655
  "step": 445
656
  },
657
  {
658
  "epoch": 0.5006257822277848,
659
+ "grad_norm": 0.13754915966044653,
660
+ "learning_rate": 0.00018653766583432113,
661
+ "loss": 0.7226,
662
  "step": 450
663
  },
664
  {
665
  "epoch": 0.5061882909192046,
666
+ "grad_norm": 0.1459944140173946,
667
+ "learning_rate": 0.00018604648102555856,
668
+ "loss": 0.6998,
669
  "step": 455
670
  },
671
  {
672
  "epoch": 0.5117507996106244,
673
+ "grad_norm": 0.14444314196036126,
674
+ "learning_rate": 0.0001855471662881164,
675
+ "loss": 0.7482,
676
  "step": 460
677
  },
678
  {
679
  "epoch": 0.5173133083020443,
680
+ "grad_norm": 0.1434929187274412,
681
+ "learning_rate": 0.00018503976879874322,
682
+ "loss": 0.695,
683
  "step": 465
684
  },
685
  {
686
  "epoch": 0.5228758169934641,
687
+ "grad_norm": 0.13128793518588147,
688
+ "learning_rate": 0.0001845243364978702,
689
+ "loss": 0.6895,
690
  "step": 470
691
  },
692
  {
693
  "epoch": 0.5284383256848839,
694
+ "grad_norm": 0.12701476810931256,
695
+ "learning_rate": 0.0001840009180850815,
696
+ "loss": 0.7061,
697
  "step": 475
698
  },
699
  {
700
  "epoch": 0.5340008343763037,
701
+ "grad_norm": 0.1264914632820831,
702
+ "learning_rate": 0.00018346956301451304,
703
+ "loss": 0.6962,
704
  "step": 480
705
  },
706
  {
707
  "epoch": 0.5395633430677236,
708
+ "grad_norm": 0.13208739238621547,
709
+ "learning_rate": 0.00018293032149017984,
710
+ "loss": 0.6863,
711
  "step": 485
712
  },
713
  {
714
  "epoch": 0.5451258517591434,
715
+ "grad_norm": 0.13323142479850197,
716
+ "learning_rate": 0.00018238324446123266,
717
+ "loss": 0.7039,
718
  "step": 490
719
  },
720
  {
721
  "epoch": 0.5506883604505632,
722
+ "grad_norm": 0.1588926427932463,
723
+ "learning_rate": 0.0001818283836171441,
724
+ "loss": 0.6924,
725
  "step": 495
726
  },
727
  {
728
  "epoch": 0.5562508691419831,
729
+ "grad_norm": 0.12667050310382044,
730
+ "learning_rate": 0.00018126579138282503,
731
+ "loss": 0.6991,
732
  "step": 500
733
  },
734
  {
735
  "epoch": 0.5618133778334029,
736
+ "grad_norm": 0.13081757063531854,
737
+ "learning_rate": 0.000180695520913671,
738
+ "loss": 0.6909,
739
  "step": 505
740
  },
741
  {
742
  "epoch": 0.5673758865248227,
743
+ "grad_norm": 0.13598420562667687,
744
+ "learning_rate": 0.0001801176260905402,
745
+ "loss": 0.7088,
746
  "step": 510
747
  },
748
  {
749
  "epoch": 0.5729383952162426,
750
+ "grad_norm": 0.1300574822823029,
751
+ "learning_rate": 0.00017953216151466263,
752
+ "loss": 0.6821,
753
  "step": 515
754
  },
755
  {
756
  "epoch": 0.5785009039076624,
757
+ "grad_norm": 0.14206907328811608,
758
+ "learning_rate": 0.00017893918250248104,
759
+ "loss": 0.6981,
760
  "step": 520
761
  },
762
  {
763
  "epoch": 0.5840634125990822,
764
+ "grad_norm": 0.1557643228084498,
765
+ "learning_rate": 0.00017833874508042466,
766
+ "loss": 0.7018,
767
  "step": 525
768
  },
769
  {
770
  "epoch": 0.5896259212905021,
771
+ "grad_norm": 0.13942706653578768,
772
+ "learning_rate": 0.00017773090597961554,
773
+ "loss": 0.6738,
774
  "step": 530
775
  },
776
  {
777
  "epoch": 0.5951884299819219,
778
+ "grad_norm": 0.12004314100744409,
779
+ "learning_rate": 0.00017711572263050845,
780
+ "loss": 0.7039,
781
  "step": 535
782
  },
783
  {
784
  "epoch": 0.6007509386733417,
785
+ "grad_norm": 0.12451500019905493,
786
+ "learning_rate": 0.00017649325315746478,
787
+ "loss": 0.7034,
788
  "step": 540
789
  },
790
  {
791
  "epoch": 0.6063134473647616,
792
+ "grad_norm": 0.1225444976108732,
793
+ "learning_rate": 0.00017586355637326054,
794
+ "loss": 0.709,
795
  "step": 545
796
  },
797
  {
798
  "epoch": 0.6118759560561814,
799
+ "grad_norm": 0.14795401636462246,
800
+ "learning_rate": 0.00017522669177352977,
801
+ "loss": 0.7415,
802
  "step": 550
803
  },
804
  {
805
  "epoch": 0.6174384647476012,
806
+ "grad_norm": 0.13482563035315856,
807
+ "learning_rate": 0.00017458271953114317,
808
+ "loss": 0.7117,
809
  "step": 555
810
  },
811
  {
812
  "epoch": 0.6230009734390209,
813
+ "grad_norm": 0.15422494211156157,
814
+ "learning_rate": 0.0001739317004905227,
815
+ "loss": 0.7027,
816
  "step": 560
817
  },
818
  {
819
  "epoch": 0.6285634821304408,
820
+ "grad_norm": 0.15479270007272883,
821
+ "learning_rate": 0.000173273696161893,
822
+ "loss": 0.7174,
823
  "step": 565
824
  },
825
  {
826
  "epoch": 0.6341259908218606,
827
+ "grad_norm": 0.13736882345033063,
828
+ "learning_rate": 0.00017260876871546936,
829
+ "loss": 0.7072,
830
  "step": 570
831
  },
832
  {
833
  "epoch": 0.6396884995132804,
834
+ "grad_norm": 0.130113338971742,
835
+ "learning_rate": 0.00017193698097558416,
836
+ "loss": 0.6987,
837
  "step": 575
838
  },
839
  {
840
  "epoch": 0.6452510082047003,
841
+ "grad_norm": 0.15120739345519632,
842
+ "learning_rate": 0.00017125839641475072,
843
+ "loss": 0.693,
844
  "step": 580
845
  },
846
  {
847
  "epoch": 0.6508135168961201,
848
+ "grad_norm": 0.17345133663648482,
849
+ "learning_rate": 0.00017057307914766624,
850
+ "loss": 0.6933,
851
  "step": 585
852
  },
853
  {
854
  "epoch": 0.6563760255875399,
855
+ "grad_norm": 0.14577660905768358,
856
+ "learning_rate": 0.0001698810939251543,
857
+ "loss": 0.6847,
858
  "step": 590
859
  },
860
  {
861
  "epoch": 0.6619385342789598,
862
+ "grad_norm": 0.13729541550408036,
863
+ "learning_rate": 0.00016918250612804673,
864
+ "loss": 0.7058,
865
  "step": 595
866
  },
867
  {
868
  "epoch": 0.6675010429703796,
869
+ "grad_norm": 0.12792609813747083,
870
+ "learning_rate": 0.00016847738176100632,
871
+ "loss": 0.7203,
872
  "step": 600
873
  },
874
  {
875
  "epoch": 0.6675010429703796,
876
+ "eval_loss": 0.6808404326438904,
877
+ "eval_runtime": 38.0896,
878
+ "eval_samples_per_second": 10.685,
879
+ "eval_steps_per_second": 0.683,
880
  "step": 600
881
  },
882
  {
883
  "epoch": 0.6730635516617994,
884
+ "grad_norm": 0.12614147228685735,
885
+ "learning_rate": 0.00016776578744629052,
886
+ "loss": 0.6984,
887
  "step": 605
888
  },
889
  {
890
  "epoch": 0.6786260603532193,
891
+ "grad_norm": 0.14053504602609868,
892
+ "learning_rate": 0.00016704779041745686,
893
+ "loss": 0.7017,
894
  "step": 610
895
  },
896
  {
897
  "epoch": 0.6841885690446391,
898
+ "grad_norm": 0.144524525939437,
899
+ "learning_rate": 0.00016632345851301031,
900
+ "loss": 0.7031,
901
  "step": 615
902
  },
903
  {
904
  "epoch": 0.6897510777360589,
905
+ "grad_norm": 0.1240886307637927,
906
+ "learning_rate": 0.000165592860169994,
907
+ "loss": 0.6856,
908
  "step": 620
909
  },
910
  {
911
  "epoch": 0.6953135864274788,
912
+ "grad_norm": 0.12592281039225006,
913
+ "learning_rate": 0.0001648560644175227,
914
+ "loss": 0.6832,
915
  "step": 625
916
  },
917
  {
918
  "epoch": 0.7008760951188986,
919
+ "grad_norm": 0.1312108925603738,
920
+ "learning_rate": 0.00016411314087026106,
921
+ "loss": 0.7266,
922
  "step": 630
923
  },
924
  {
925
  "epoch": 0.7064386038103184,
926
+ "grad_norm": 0.12871371439878188,
927
+ "learning_rate": 0.00016336415972184612,
928
+ "loss": 0.6892,
929
  "step": 635
930
  },
931
  {
932
  "epoch": 0.7120011125017383,
933
+ "grad_norm": 0.1423393128313328,
934
+ "learning_rate": 0.00016260919173825508,
935
+ "loss": 0.684,
936
  "step": 640
937
  },
938
  {
939
  "epoch": 0.7175636211931581,
940
+ "grad_norm": 0.1452224910144209,
941
+ "learning_rate": 0.00016184830825111924,
942
+ "loss": 0.6824,
943
  "step": 645
944
  },
945
  {
946
  "epoch": 0.7231261298845779,
947
+ "grad_norm": 0.12119255762333672,
948
+ "learning_rate": 0.00016108158115098444,
949
+ "loss": 0.686,
950
  "step": 650
951
  },
952
  {
953
  "epoch": 0.7286886385759977,
954
+ "grad_norm": 0.1411776440916737,
955
+ "learning_rate": 0.0001603090828805185,
956
+ "loss": 0.6878,
957
  "step": 655
958
  },
959
  {
960
  "epoch": 0.7342511472674176,
961
+ "grad_norm": 0.17300629248103022,
962
+ "learning_rate": 0.0001595308864276666,
963
+ "loss": 0.7139,
964
  "step": 660
965
  },
966
  {
967
  "epoch": 0.7398136559588374,
968
+ "grad_norm": 0.1458091025518703,
969
+ "learning_rate": 0.0001587470653187553,
970
+ "loss": 0.729,
971
  "step": 665
972
  },
973
  {
974
  "epoch": 0.7453761646502572,
975
+ "grad_norm": 0.14056535678032148,
976
+ "learning_rate": 0.00015795769361154547,
977
+ "loss": 0.6972,
978
  "step": 670
979
  },
980
  {
981
  "epoch": 0.7509386733416771,
982
+ "grad_norm": 0.13734797531202647,
983
+ "learning_rate": 0.000157162845888235,
984
+ "loss": 0.6779,
985
  "step": 675
986
  },
987
  {
988
  "epoch": 0.7565011820330969,
989
+ "grad_norm": 0.12703010084902983,
990
+ "learning_rate": 0.00015636259724841222,
991
+ "loss": 0.7027,
992
  "step": 680
993
  },
994
  {
995
  "epoch": 0.7620636907245167,
996
+ "grad_norm": 0.12697872232577906,
997
+ "learning_rate": 0.00015555702330196023,
998
+ "loss": 0.7079,
999
  "step": 685
1000
  },
1001
  {
1002
  "epoch": 0.7676261994159366,
1003
+ "grad_norm": 0.15015643114183339,
1004
+ "learning_rate": 0.00015474620016191294,
1005
+ "loss": 0.6938,
1006
  "step": 690
1007
  },
1008
  {
1009
  "epoch": 0.7731887081073564,
1010
+ "grad_norm": 0.1468310767380953,
1011
+ "learning_rate": 0.00015393020443726381,
1012
+ "loss": 0.7061,
1013
  "step": 695
1014
  },
1015
  {
1016
  "epoch": 0.7787512167987762,
1017
+ "grad_norm": 0.1178258952392868,
1018
+ "learning_rate": 0.00015310911322572753,
1019
+ "loss": 0.6635,
1020
  "step": 700
1021
  },
1022
  {
1023
  "epoch": 0.7843137254901961,
1024
+ "grad_norm": 0.1182521343472938,
1025
+ "learning_rate": 0.00015228300410645556,
1026
+ "loss": 0.6783,
1027
  "step": 705
1028
  },
1029
  {
1030
  "epoch": 0.7898762341816159,
1031
+ "grad_norm": 0.11865217322316654,
1032
+ "learning_rate": 0.00015145195513270644,
1033
+ "loss": 0.6822,
1034
  "step": 710
1035
  },
1036
  {
1037
  "epoch": 0.7954387428730357,
1038
+ "grad_norm": 0.11500353884529863,
1039
+ "learning_rate": 0.00015061604482447075,
1040
+ "loss": 0.6668,
1041
  "step": 715
1042
  },
1043
  {
1044
  "epoch": 0.8010012515644556,
1045
+ "grad_norm": 0.1314805700111582,
1046
+ "learning_rate": 0.0001497753521610526,
1047
+ "loss": 0.7105,
1048
  "step": 720
1049
  },
1050
  {
1051
  "epoch": 0.8065637602558754,
1052
+ "grad_norm": 0.14124896793550357,
1053
+ "learning_rate": 0.00014892995657360717,
1054
+ "loss": 0.7142,
1055
  "step": 725
1056
  },
1057
  {
1058
  "epoch": 0.8121262689472952,
1059
+ "grad_norm": 0.13739934885223082,
1060
+ "learning_rate": 0.00014807993793763619,
1061
+ "loss": 0.6815,
1062
  "step": 730
1063
  },
1064
  {
1065
  "epoch": 0.8176887776387151,
1066
+ "grad_norm": 0.13281564930523101,
1067
+ "learning_rate": 0.0001472253765654406,
1068
+ "loss": 0.67,
1069
  "step": 735
1070
  },
1071
  {
1072
  "epoch": 0.8232512863301349,
1073
+ "grad_norm": 0.11821804301835571,
1074
+ "learning_rate": 0.00014636635319853275,
1075
+ "loss": 0.6969,
1076
  "step": 740
1077
  },
1078
  {
1079
  "epoch": 0.8288137950215547,
1080
+ "grad_norm": 0.13929966861379298,
1081
+ "learning_rate": 0.00014550294900000753,
1082
+ "loss": 0.696,
1083
  "step": 745
1084
  },
1085
  {
1086
  "epoch": 0.8343763037129746,
1087
+ "grad_norm": 0.1358532247107506,
1088
+ "learning_rate": 0.00014463524554687399,
1089
+ "loss": 0.6987,
1090
  "step": 750
1091
  },
1092
  {
1093
  "epoch": 0.8399388124043944,
1094
+ "grad_norm": 0.14607145415562992,
1095
+ "learning_rate": 0.00014376332482234747,
1096
+ "loss": 0.6713,
1097
  "step": 755
1098
  },
1099
  {
1100
  "epoch": 0.8455013210958142,
1101
+ "grad_norm": 0.13045660818329083,
1102
+ "learning_rate": 0.0001428872692081038,
1103
+ "loss": 0.6782,
1104
  "step": 760
1105
  },
1106
  {
1107
  "epoch": 0.851063829787234,
1108
+ "grad_norm": 0.14868265540995262,
1109
+ "learning_rate": 0.00014200716147649557,
1110
+ "loss": 0.6856,
1111
  "step": 765
1112
  },
1113
  {
1114
  "epoch": 0.8566263384786539,
1115
+ "grad_norm": 0.1523898280318719,
1116
+ "learning_rate": 0.00014112308478273145,
1117
+ "loss": 0.7077,
1118
  "step": 770
1119
  },
1120
  {
1121
  "epoch": 0.8621888471700737,
1122
+ "grad_norm": 0.14374188108756408,
1123
+ "learning_rate": 0.00014023512265701955,
1124
+ "loss": 0.6557,
1125
  "step": 775
1126
  },
1127
  {
1128
  "epoch": 0.8677513558614935,
1129
+ "grad_norm": 0.13285163140650738,
1130
+ "learning_rate": 0.00013934335899667527,
1131
+ "loss": 0.7028,
1132
  "step": 780
1133
  },
1134
  {
1135
  "epoch": 0.8733138645529134,
1136
+ "grad_norm": 0.12419599179555058,
1137
+ "learning_rate": 0.0001384478780581942,
1138
+ "loss": 0.6753,
1139
  "step": 785
1140
  },
1141
  {
1142
  "epoch": 0.8788763732443332,
1143
+ "grad_norm": 0.12619683911200477,
1144
+ "learning_rate": 0.00013754876444929166,
1145
+ "loss": 0.7033,
1146
  "step": 790
1147
  },
1148
  {
1149
  "epoch": 0.884438881935753,
1150
+ "grad_norm": 0.14380964532401144,
1151
+ "learning_rate": 0.00013664610312090838,
1152
+ "loss": 0.7022,
1153
  "step": 795
1154
  },
1155
  {
1156
  "epoch": 0.8900013906271729,
1157
+ "grad_norm": 0.13341619640026267,
1158
+ "learning_rate": 0.0001357399793591844,
1159
+ "loss": 0.703,
1160
  "step": 800
1161
  },
1162
  {
1163
  "epoch": 0.8900013906271729,
1164
+ "eval_loss": 0.6755693554878235,
1165
+ "eval_runtime": 38.0742,
1166
+ "eval_samples_per_second": 10.69,
1167
+ "eval_steps_per_second": 0.683,
1168
  "step": 800
1169
  },
1170
  {
1171
  "epoch": 0.8955638993185927,
1172
+ "grad_norm": 0.11373621352884696,
1173
+ "learning_rate": 0.00013483047877740055,
1174
+ "loss": 0.6554,
1175
  "step": 805
1176
  },
1177
  {
1178
  "epoch": 0.9011264080100125,
1179
+ "grad_norm": 0.1418177008323639,
1180
+ "learning_rate": 0.00013391768730789002,
1181
+ "loss": 0.7435,
1182
  "step": 810
1183
  },
1184
  {
1185
  "epoch": 0.9066889167014324,
1186
+ "grad_norm": 0.14095795493706695,
1187
+ "learning_rate": 0.00013300169119391864,
1188
+ "loss": 0.6856,
1189
  "step": 815
1190
  },
1191
  {
1192
  "epoch": 0.9122514253928522,
1193
+ "grad_norm": 0.13583867929382498,
1194
+ "learning_rate": 0.00013208257698153677,
1195
+ "loss": 0.6791,
1196
  "step": 820
1197
  },
1198
  {
1199
  "epoch": 0.917813934084272,
1200
+ "grad_norm": 0.13643641315332133,
1201
+ "learning_rate": 0.00013116043151140203,
1202
+ "loss": 0.6958,
1203
  "step": 825
1204
  },
1205
  {
1206
  "epoch": 0.9233764427756919,
1207
+ "grad_norm": 0.12156612730761757,
1208
+ "learning_rate": 0.00013023534191057426,
1209
+ "loss": 0.6912,
1210
  "step": 830
1211
  },
1212
  {
1213
  "epoch": 0.9289389514671117,
1214
+ "grad_norm": 0.14020483760043728,
1215
+ "learning_rate": 0.0001293073955842836,
1216
+ "loss": 0.7086,
1217
  "step": 835
1218
  },
1219
  {
1220
  "epoch": 0.9345014601585315,
1221
+ "grad_norm": 0.13715121587687112,
1222
+ "learning_rate": 0.0001283766802076722,
1223
+ "loss": 0.6804,
1224
  "step": 840
1225
  },
1226
  {
1227
  "epoch": 0.9400639688499514,
1228
+ "grad_norm": 0.12395531650931035,
1229
+ "learning_rate": 0.00012744328371751024,
1230
+ "loss": 0.7246,
1231
  "step": 845
1232
  },
1233
  {
1234
  "epoch": 0.9456264775413712,
1235
+ "grad_norm": 0.1351012642378018,
1236
+ "learning_rate": 0.00012650729430388764,
1237
+ "loss": 0.6695,
1238
  "step": 850
1239
  },
1240
  {
1241
  "epoch": 0.951188986232791,
1242
+ "grad_norm": 0.14258700665297835,
1243
+ "learning_rate": 0.00012556880040188144,
1244
+ "loss": 0.6844,
1245
  "step": 855
1246
  },
1247
  {
1248
  "epoch": 0.9567514949242109,
1249
+ "grad_norm": 0.14370950835852644,
1250
+ "learning_rate": 0.00012462789068320017,
1251
+ "loss": 0.7156,
1252
  "step": 860
1253
  },
1254
  {
1255
  "epoch": 0.9623140036156307,
1256
+ "grad_norm": 0.1390057701240165,
1257
+ "learning_rate": 0.00012368465404780598,
1258
+ "loss": 0.6652,
1259
  "step": 865
1260
  },
1261
  {
1262
  "epoch": 0.9678765123070505,
1263
+ "grad_norm": 0.127046163008197,
1264
+ "learning_rate": 0.00012273917961551513,
1265
+ "loss": 0.6641,
1266
  "step": 870
1267
  },
1268
  {
1269
  "epoch": 0.9734390209984704,
1270
+ "grad_norm": 0.14753709766070622,
1271
+ "learning_rate": 0.00012179155671757754,
1272
+ "loss": 0.689,
1273
  "step": 875
1274
  },
1275
  {
1276
  "epoch": 0.9790015296898902,
1277
+ "grad_norm": 0.13366681839894046,
1278
+ "learning_rate": 0.00012084187488823657,
1279
+ "loss": 0.6938,
1280
  "step": 880
1281
  },
1282
  {
1283
  "epoch": 0.98456403838131,
1284
+ "grad_norm": 0.14364888110758262,
1285
+ "learning_rate": 0.00011989022385626967,
1286
+ "loss": 0.6867,
1287
  "step": 885
1288
  },
1289
  {
1290
  "epoch": 0.9901265470727298,
1291
+ "grad_norm": 0.13750035239201705,
1292
+ "learning_rate": 0.00011893669353651031,
1293
+ "loss": 0.6926,
1294
  "step": 890
1295
  },
1296
  {
1297
  "epoch": 0.9956890557641497,
1298
+ "grad_norm": 0.12165819999979594,
1299
+ "learning_rate": 0.00011798137402135277,
1300
+ "loss": 0.6781,
1301
  "step": 895
1302
  },
1303
  {
1304
+ "epoch": 1.0012515644555695,
1305
+ "grad_norm": 0.13460664662956676,
1306
+ "learning_rate": 0.00011702435557223987,
1307
+ "loss": 0.6907,
1308
+ "step": 900
1309
+ },
1310
+ {
1311
+ "epoch": 1.0068140731469892,
1312
+ "grad_norm": 0.14812357106021773,
1313
+ "learning_rate": 0.00011606572861113474,
1314
+ "loss": 0.6739,
1315
+ "step": 905
1316
+ },
1317
+ {
1318
+ "epoch": 1.0123765818384092,
1319
+ "grad_norm": 0.14414234656282895,
1320
+ "learning_rate": 0.00011510558371197753,
1321
+ "loss": 0.6581,
1322
+ "step": 910
1323
+ },
1324
+ {
1325
+ "epoch": 1.017939090529829,
1326
+ "grad_norm": 0.1417200021185479,
1327
+ "learning_rate": 0.00011414401159212778,
1328
+ "loss": 0.6886,
1329
+ "step": 915
1330
+ },
1331
+ {
1332
+ "epoch": 1.0235015992212488,
1333
+ "grad_norm": 0.13876983385482528,
1334
+ "learning_rate": 0.00011318110310379301,
1335
+ "loss": 0.6737,
1336
+ "step": 920
1337
+ },
1338
+ {
1339
+ "epoch": 1.0290641079126686,
1340
+ "grad_norm": 0.15251072221895987,
1341
+ "learning_rate": 0.00011221694922544491,
1342
+ "loss": 0.6692,
1343
+ "step": 925
1344
+ },
1345
+ {
1346
+ "epoch": 1.0346266166040885,
1347
+ "grad_norm": 0.14026789842910758,
1348
+ "learning_rate": 0.0001112516410532233,
1349
+ "loss": 0.6562,
1350
+ "step": 930
1351
+ },
1352
+ {
1353
+ "epoch": 1.0401891252955082,
1354
+ "grad_norm": 0.12707738202474425,
1355
+ "learning_rate": 0.00011028526979232913,
1356
+ "loss": 0.6618,
1357
+ "step": 935
1358
+ },
1359
+ {
1360
+ "epoch": 1.0457516339869282,
1361
+ "grad_norm": 0.14606846473361,
1362
+ "learning_rate": 0.00010931792674840718,
1363
+ "loss": 0.6746,
1364
+ "step": 940
1365
+ },
1366
+ {
1367
+ "epoch": 1.0513141426783479,
1368
+ "grad_norm": 0.14011972796939615,
1369
+ "learning_rate": 0.00010834970331891914,
1370
+ "loss": 0.6869,
1371
+ "step": 945
1372
+ },
1373
+ {
1374
+ "epoch": 1.0568766513697678,
1375
+ "grad_norm": 0.1563818365447484,
1376
+ "learning_rate": 0.0001073806909845082,
1377
+ "loss": 0.66,
1378
+ "step": 950
1379
+ },
1380
+ {
1381
+ "epoch": 1.0624391600611875,
1382
+ "grad_norm": 0.12113541850928697,
1383
+ "learning_rate": 0.00010641098130035562,
1384
+ "loss": 0.6567,
1385
+ "step": 955
1386
+ },
1387
+ {
1388
+ "epoch": 1.0680016687526075,
1389
+ "grad_norm": 0.12784803662349467,
1390
+ "learning_rate": 0.00010544066588753044,
1391
+ "loss": 0.6478,
1392
+ "step": 960
1393
+ },
1394
+ {
1395
+ "epoch": 1.0735641774440272,
1396
+ "grad_norm": 0.13555213849737605,
1397
+ "learning_rate": 0.00010446983642433259,
1398
+ "loss": 0.6423,
1399
+ "step": 965
1400
+ },
1401
+ {
1402
+ "epoch": 1.0791266861354472,
1403
+ "grad_norm": 0.14956837158318062,
1404
+ "learning_rate": 0.00010349858463763113,
1405
+ "loss": 0.6482,
1406
+ "step": 970
1407
+ },
1408
+ {
1409
+ "epoch": 1.0846891948268669,
1410
+ "grad_norm": 0.14302685723153047,
1411
+ "learning_rate": 0.0001025270022941975,
1412
+ "loss": 0.6619,
1413
+ "step": 975
1414
+ },
1415
+ {
1416
+ "epoch": 1.0902517035182868,
1417
+ "grad_norm": 0.1531273940282806,
1418
+ "learning_rate": 0.0001015551811920351,
1419
+ "loss": 0.6818,
1420
+ "step": 980
1421
+ },
1422
+ {
1423
+ "epoch": 1.0958142122097065,
1424
+ "grad_norm": 0.1431917561070241,
1425
+ "learning_rate": 0.00010058321315170596,
1426
+ "loss": 0.6597,
1427
+ "step": 985
1428
+ },
1429
+ {
1430
+ "epoch": 1.1013767209011265,
1431
+ "grad_norm": 0.14533278983667672,
1432
+ "learning_rate": 9.961119000765531e-05,
1433
+ "loss": 0.641,
1434
+ "step": 990
1435
+ },
1436
+ {
1437
+ "epoch": 1.1069392295925462,
1438
+ "grad_norm": 0.1426693103942941,
1439
+ "learning_rate": 9.863920359953474e-05,
1440
+ "loss": 0.6864,
1441
+ "step": 995
1442
+ },
1443
+ {
1444
+ "epoch": 1.1125017382839661,
1445
+ "grad_norm": 0.14515333605223138,
1446
+ "learning_rate": 9.766734576352478e-05,
1447
+ "loss": 0.6759,
1448
+ "step": 1000
1449
+ },
1450
+ {
1451
+ "epoch": 1.1125017382839661,
1452
+ "eval_loss": 0.6748408079147339,
1453
+ "eval_runtime": 38.0571,
1454
+ "eval_samples_per_second": 10.694,
1455
+ "eval_steps_per_second": 0.683,
1456
+ "step": 1000
1457
+ },
1458
+ {
1459
+ "epoch": 1.1180642469753859,
1460
+ "grad_norm": 0.13884691864173954,
1461
+ "learning_rate": 9.669570832365838e-05,
1462
+ "loss": 0.6536,
1463
+ "step": 1005
1464
+ },
1465
+ {
1466
+ "epoch": 1.1236267556668058,
1467
+ "grad_norm": 0.14190147387793983,
1468
+ "learning_rate": 9.572438308314446e-05,
1469
+ "loss": 0.6448,
1470
+ "step": 1010
1471
+ },
1472
+ {
1473
+ "epoch": 1.1291892643582255,
1474
+ "grad_norm": 0.1460356352229923,
1475
+ "learning_rate": 9.475346181569467e-05,
1476
+ "loss": 0.6709,
1477
+ "step": 1015
1478
+ },
1479
+ {
1480
+ "epoch": 1.1347517730496455,
1481
+ "grad_norm": 0.15004591423623692,
1482
+ "learning_rate": 9.378303625685195e-05,
1483
+ "loss": 0.6338,
1484
+ "step": 1020
1485
+ },
1486
+ {
1487
+ "epoch": 1.1403142817410652,
1488
+ "grad_norm": 0.14709044334780488,
1489
+ "learning_rate": 9.281319809532329e-05,
1490
+ "loss": 0.6482,
1491
+ "step": 1025
1492
+ },
1493
+ {
1494
+ "epoch": 1.1458767904324851,
1495
+ "grad_norm": 0.1339983467335465,
1496
+ "learning_rate": 9.18440389643165e-05,
1497
+ "loss": 0.6564,
1498
+ "step": 1030
1499
+ },
1500
+ {
1501
+ "epoch": 1.1514392991239049,
1502
+ "grad_norm": 0.15465369318467384,
1503
+ "learning_rate": 9.08756504328827e-05,
1504
+ "loss": 0.631,
1505
+ "step": 1035
1506
+ },
1507
+ {
1508
+ "epoch": 1.1570018078153248,
1509
+ "grad_norm": 0.14414339429238215,
1510
+ "learning_rate": 8.990812399726435e-05,
1511
+ "loss": 0.63,
1512
+ "step": 1040
1513
+ },
1514
+ {
1515
+ "epoch": 1.1625643165067445,
1516
+ "grad_norm": 0.1530406190835784,
1517
+ "learning_rate": 8.894155107225062e-05,
1518
+ "loss": 0.6599,
1519
+ "step": 1045
1520
+ },
1521
+ {
1522
+ "epoch": 1.1681268251981645,
1523
+ "grad_norm": 0.18004713578221632,
1524
+ "learning_rate": 8.797602298254004e-05,
1525
+ "loss": 0.6588,
1526
+ "step": 1050
1527
+ },
1528
+ {
1529
+ "epoch": 1.1736893338895842,
1530
+ "grad_norm": 0.1395761709805775,
1531
+ "learning_rate": 8.701163095411212e-05,
1532
+ "loss": 0.6388,
1533
+ "step": 1055
1534
+ },
1535
+ {
1536
+ "epoch": 1.1792518425810041,
1537
+ "grad_norm": 0.14800345539370582,
1538
+ "learning_rate": 8.604846610560771e-05,
1539
+ "loss": 0.6469,
1540
+ "step": 1060
1541
+ },
1542
+ {
1543
+ "epoch": 1.1848143512724238,
1544
+ "grad_norm": 0.18153262554102811,
1545
+ "learning_rate": 8.508661943972021e-05,
1546
+ "loss": 0.6476,
1547
+ "step": 1065
1548
+ },
1549
+ {
1550
+ "epoch": 1.1903768599638438,
1551
+ "grad_norm": 0.1460085066143512,
1552
+ "learning_rate": 8.412618183459708e-05,
1553
+ "loss": 0.6417,
1554
+ "step": 1070
1555
+ },
1556
+ {
1557
+ "epoch": 1.1959393686552635,
1558
+ "grad_norm": 0.1563928211034322,
1559
+ "learning_rate": 8.316724403525359e-05,
1560
+ "loss": 0.6655,
1561
+ "step": 1075
1562
+ },
1563
+ {
1564
+ "epoch": 1.2015018773466832,
1565
+ "grad_norm": 0.13021633068783775,
1566
+ "learning_rate": 8.220989664499878e-05,
1567
+ "loss": 0.6495,
1568
+ "step": 1080
1569
+ },
1570
+ {
1571
+ "epoch": 1.2070643860381032,
1572
+ "grad_norm": 0.15359952593058618,
1573
+ "learning_rate": 8.125423011687524e-05,
1574
+ "loss": 0.6518,
1575
+ "step": 1085
1576
+ },
1577
+ {
1578
+ "epoch": 1.2126268947295231,
1579
+ "grad_norm": 0.14327581045453255,
1580
+ "learning_rate": 8.030033474511249e-05,
1581
+ "loss": 0.6428,
1582
+ "step": 1090
1583
+ },
1584
+ {
1585
+ "epoch": 1.2181894034209428,
1586
+ "grad_norm": 0.14877277012088305,
1587
+ "learning_rate": 7.934830065659599e-05,
1588
+ "loss": 0.6823,
1589
+ "step": 1095
1590
+ },
1591
+ {
1592
+ "epoch": 1.2237519121123626,
1593
+ "grad_norm": 0.1350097173259691,
1594
+ "learning_rate": 7.839821780235168e-05,
1595
+ "loss": 0.6656,
1596
+ "step": 1100
1597
+ },
1598
+ {
1599
+ "epoch": 1.2293144208037825,
1600
+ "grad_norm": 0.14805261697578703,
1601
+ "learning_rate": 7.74501759490469e-05,
1602
+ "loss": 0.6649,
1603
+ "step": 1105
1604
+ },
1605
+ {
1606
+ "epoch": 1.2348769294952024,
1607
+ "grad_norm": 0.14241031431419957,
1608
+ "learning_rate": 7.650426467050926e-05,
1609
+ "loss": 0.645,
1610
+ "step": 1110
1611
+ },
1612
+ {
1613
+ "epoch": 1.2404394381866222,
1614
+ "grad_norm": 0.14677979540609026,
1615
+ "learning_rate": 7.556057333926318e-05,
1616
+ "loss": 0.6322,
1617
+ "step": 1115
1618
+ },
1619
+ {
1620
+ "epoch": 1.2460019468780419,
1621
+ "grad_norm": 0.1540439921077116,
1622
+ "learning_rate": 7.461919111808595e-05,
1623
+ "loss": 0.6425,
1624
+ "step": 1120
1625
+ },
1626
+ {
1627
+ "epoch": 1.2515644555694618,
1628
+ "grad_norm": 0.16093917993335288,
1629
+ "learning_rate": 7.368020695158312e-05,
1630
+ "loss": 0.6708,
1631
+ "step": 1125
1632
+ },
1633
+ {
1634
+ "epoch": 1.2571269642608818,
1635
+ "grad_norm": 0.14072702856543545,
1636
+ "learning_rate": 7.274370955778498e-05,
1637
+ "loss": 0.6412,
1638
+ "step": 1130
1639
+ },
1640
+ {
1641
+ "epoch": 1.2626894729523015,
1642
+ "grad_norm": 0.1372982963866414,
1643
+ "learning_rate": 7.180978741976397e-05,
1644
+ "loss": 0.6354,
1645
+ "step": 1135
1646
+ },
1647
+ {
1648
+ "epoch": 1.2682519816437212,
1649
+ "grad_norm": 0.1361010979443787,
1650
+ "learning_rate": 7.087852877727481e-05,
1651
+ "loss": 0.6297,
1652
+ "step": 1140
1653
+ },
1654
+ {
1655
+ "epoch": 1.2738144903351412,
1656
+ "grad_norm": 0.15152880519317802,
1657
+ "learning_rate": 6.995002161841708e-05,
1658
+ "loss": 0.6648,
1659
+ "step": 1145
1660
+ },
1661
+ {
1662
+ "epoch": 1.279376999026561,
1663
+ "grad_norm": 0.15111271273299604,
1664
+ "learning_rate": 6.902435367132208e-05,
1665
+ "loss": 0.6811,
1666
+ "step": 1150
1667
+ },
1668
+ {
1669
+ "epoch": 1.2849395077179808,
1670
+ "grad_norm": 0.1555612197330931,
1671
+ "learning_rate": 6.810161239586375e-05,
1672
+ "loss": 0.6639,
1673
+ "step": 1155
1674
+ },
1675
+ {
1676
+ "epoch": 1.2905020164094005,
1677
+ "grad_norm": 0.1404429960918057,
1678
+ "learning_rate": 6.718188497539554e-05,
1679
+ "loss": 0.6416,
1680
+ "step": 1160
1681
+ },
1682
+ {
1683
+ "epoch": 1.2960645251008205,
1684
+ "grad_norm": 0.1493351620856526,
1685
+ "learning_rate": 6.626525830851267e-05,
1686
+ "loss": 0.6284,
1687
+ "step": 1165
1688
+ },
1689
+ {
1690
+ "epoch": 1.3016270337922404,
1691
+ "grad_norm": 0.15944635272625493,
1692
+ "learning_rate": 6.535181900084206e-05,
1693
+ "loss": 0.6402,
1694
+ "step": 1170
1695
+ },
1696
+ {
1697
+ "epoch": 1.3071895424836601,
1698
+ "grad_norm": 0.1534235543873645,
1699
+ "learning_rate": 6.444165335685927e-05,
1700
+ "loss": 0.6373,
1701
+ "step": 1175
1702
+ },
1703
+ {
1704
+ "epoch": 1.3127520511750799,
1705
+ "grad_norm": 0.1404290668723592,
1706
+ "learning_rate": 6.35348473717345e-05,
1707
+ "loss": 0.6596,
1708
+ "step": 1180
1709
+ },
1710
+ {
1711
+ "epoch": 1.3183145598664998,
1712
+ "grad_norm": 0.15190370802350445,
1713
+ "learning_rate": 6.263148672320714e-05,
1714
+ "loss": 0.6519,
1715
+ "step": 1185
1716
+ },
1717
+ {
1718
+ "epoch": 1.3238770685579198,
1719
+ "grad_norm": 0.1396377515861422,
1720
+ "learning_rate": 6.173165676349103e-05,
1721
+ "loss": 0.6539,
1722
+ "step": 1190
1723
+ },
1724
+ {
1725
+ "epoch": 1.3294395772493395,
1726
+ "grad_norm": 0.1413554398489568,
1727
+ "learning_rate": 6.083544251120993e-05,
1728
+ "loss": 0.6436,
1729
+ "step": 1195
1730
+ },
1731
+ {
1732
+ "epoch": 1.3350020859407592,
1733
+ "grad_norm": 0.175529243963575,
1734
+ "learning_rate": 5.9942928643364724e-05,
1735
+ "loss": 0.6533,
1736
+ "step": 1200
1737
+ },
1738
+ {
1739
+ "epoch": 1.3350020859407592,
1740
+ "eval_loss": 0.6695060133934021,
1741
+ "eval_runtime": 38.0647,
1742
+ "eval_samples_per_second": 10.692,
1743
+ "eval_steps_per_second": 0.683,
1744
+ "step": 1200
1745
+ },
1746
+ {
1747
+ "epoch": 1.3405645946321791,
1748
+ "grad_norm": 0.16769090072157003,
1749
+ "learning_rate": 5.905419948733302e-05,
1750
+ "loss": 0.6434,
1751
+ "step": 1205
1752
+ },
1753
+ {
1754
+ "epoch": 1.3461271033235989,
1755
+ "grad_norm": 0.17558035684023568,
1756
+ "learning_rate": 5.816933901290136e-05,
1757
+ "loss": 0.6954,
1758
+ "step": 1210
1759
+ },
1760
+ {
1761
+ "epoch": 1.3516896120150188,
1762
+ "grad_norm": 0.15028360304997448,
1763
+ "learning_rate": 5.728843082433193e-05,
1764
+ "loss": 0.6563,
1765
+ "step": 1215
1766
+ },
1767
+ {
1768
+ "epoch": 1.3572521207064385,
1769
+ "grad_norm": 0.16888366454421866,
1770
+ "learning_rate": 5.6411558152462894e-05,
1771
+ "loss": 0.6383,
1772
+ "step": 1220
1773
+ },
1774
+ {
1775
+ "epoch": 1.3628146293978585,
1776
+ "grad_norm": 0.1507413542567497,
1777
+ "learning_rate": 5.553880384684493e-05,
1778
+ "loss": 0.6382,
1779
+ "step": 1225
1780
+ },
1781
+ {
1782
+ "epoch": 1.3683771380892782,
1783
+ "grad_norm": 0.15944821317229088,
1784
+ "learning_rate": 5.4670250367913023e-05,
1785
+ "loss": 0.6589,
1786
+ "step": 1230
1787
+ },
1788
+ {
1789
+ "epoch": 1.3739396467806981,
1790
+ "grad_norm": 0.14215107373647873,
1791
+ "learning_rate": 5.380597977919557e-05,
1792
+ "loss": 0.6346,
1793
+ "step": 1235
1794
+ },
1795
+ {
1796
+ "epoch": 1.3795021554721179,
1797
+ "grad_norm": 0.16243215498334238,
1798
+ "learning_rate": 5.2946073739560706e-05,
1799
+ "loss": 0.6527,
1800
+ "step": 1240
1801
+ },
1802
+ {
1803
+ "epoch": 1.3850646641635378,
1804
+ "grad_norm": 0.1562039482784561,
1805
+ "learning_rate": 5.209061349550095e-05,
1806
+ "loss": 0.66,
1807
+ "step": 1245
1808
+ },
1809
+ {
1810
+ "epoch": 1.3906271728549575,
1811
+ "grad_norm": 0.1680769018618033,
1812
+ "learning_rate": 5.1239679873456634e-05,
1813
+ "loss": 0.6699,
1814
+ "step": 1250
1815
+ },
1816
+ {
1817
+ "epoch": 1.3961896815463775,
1818
+ "grad_norm": 0.1478360199044516,
1819
+ "learning_rate": 5.039335327217951e-05,
1820
+ "loss": 0.6502,
1821
+ "step": 1255
1822
+ },
1823
+ {
1824
+ "epoch": 1.4017521902377972,
1825
+ "grad_norm": 0.14915391173643802,
1826
+ "learning_rate": 4.955171365513603e-05,
1827
+ "loss": 0.6143,
1828
+ "step": 1260
1829
+ },
1830
+ {
1831
+ "epoch": 1.4073146989292171,
1832
+ "grad_norm": 0.15851697773820939,
1833
+ "learning_rate": 4.871484054295258e-05,
1834
+ "loss": 0.6572,
1835
+ "step": 1265
1836
+ },
1837
+ {
1838
+ "epoch": 1.4128772076206368,
1839
+ "grad_norm": 0.15216697385980452,
1840
+ "learning_rate": 4.7882813005901696e-05,
1841
+ "loss": 0.657,
1842
+ "step": 1270
1843
+ },
1844
+ {
1845
+ "epoch": 1.4184397163120568,
1846
+ "grad_norm": 0.1437749041937587,
1847
+ "learning_rate": 4.705570965643176e-05,
1848
+ "loss": 0.6411,
1849
+ "step": 1275
1850
+ },
1851
+ {
1852
+ "epoch": 1.4240022250034765,
1853
+ "grad_norm": 0.16683978696735308,
1854
+ "learning_rate": 4.623360864173893e-05,
1855
+ "loss": 0.651,
1856
+ "step": 1280
1857
+ },
1858
+ {
1859
+ "epoch": 1.4295647336948965,
1860
+ "grad_norm": 0.1621572781654395,
1861
+ "learning_rate": 4.541658763638406e-05,
1862
+ "loss": 0.6492,
1863
+ "step": 1285
1864
+ },
1865
+ {
1866
+ "epoch": 1.4351272423863162,
1867
+ "grad_norm": 0.15994302691177126,
1868
+ "learning_rate": 4.460472383495331e-05,
1869
+ "loss": 0.6577,
1870
+ "step": 1290
1871
+ },
1872
+ {
1873
+ "epoch": 1.4406897510777361,
1874
+ "grad_norm": 0.14692563499946057,
1875
+ "learning_rate": 4.379809394476501e-05,
1876
+ "loss": 0.6496,
1877
+ "step": 1295
1878
+ },
1879
+ {
1880
+ "epoch": 1.4462522597691558,
1881
+ "grad_norm": 0.17176689432434047,
1882
+ "learning_rate": 4.2996774178621736e-05,
1883
+ "loss": 0.6433,
1884
+ "step": 1300
1885
+ },
1886
+ {
1887
+ "epoch": 1.4518147684605758,
1888
+ "grad_norm": 0.1446292926663316,
1889
+ "learning_rate": 4.220084024760982e-05,
1890
+ "loss": 0.6223,
1891
+ "step": 1305
1892
+ },
1893
+ {
1894
+ "epoch": 1.4573772771519955,
1895
+ "grad_norm": 0.15172393816056104,
1896
+ "learning_rate": 4.141036735394574e-05,
1897
+ "loss": 0.6481,
1898
+ "step": 1310
1899
+ },
1900
+ {
1901
+ "epoch": 1.4629397858434154,
1902
+ "grad_norm": 0.16544844619207105,
1903
+ "learning_rate": 4.0625430183870796e-05,
1904
+ "loss": 0.6411,
1905
+ "step": 1315
1906
+ },
1907
+ {
1908
+ "epoch": 1.4685022945348352,
1909
+ "grad_norm": 0.15934504430033075,
1910
+ "learning_rate": 3.984610290059467e-05,
1911
+ "loss": 0.6678,
1912
+ "step": 1320
1913
+ },
1914
+ {
1915
+ "epoch": 1.474064803226255,
1916
+ "grad_norm": 0.15204965773692086,
1917
+ "learning_rate": 3.907245913728807e-05,
1918
+ "loss": 0.6549,
1919
+ "step": 1325
1920
+ },
1921
+ {
1922
+ "epoch": 1.4796273119176748,
1923
+ "grad_norm": 0.15256724381583675,
1924
+ "learning_rate": 3.830457199012585e-05,
1925
+ "loss": 0.6478,
1926
+ "step": 1330
1927
+ },
1928
+ {
1929
+ "epoch": 1.4851898206090948,
1930
+ "grad_norm": 0.15150975920035156,
1931
+ "learning_rate": 3.754251401138051e-05,
1932
+ "loss": 0.6385,
1933
+ "step": 1335
1934
+ },
1935
+ {
1936
+ "epoch": 1.4907523293005145,
1937
+ "grad_norm": 0.17068279689989202,
1938
+ "learning_rate": 3.678635720256737e-05,
1939
+ "loss": 0.6659,
1940
+ "step": 1340
1941
+ },
1942
+ {
1943
+ "epoch": 1.4963148379919344,
1944
+ "grad_norm": 0.15001304767115145,
1945
+ "learning_rate": 3.6036173007641435e-05,
1946
+ "loss": 0.66,
1947
+ "step": 1345
1948
+ },
1949
+ {
1950
+ "epoch": 1.5018773466833542,
1951
+ "grad_norm": 0.14424590095876771,
1952
+ "learning_rate": 3.529203230624747e-05,
1953
+ "loss": 0.651,
1954
+ "step": 1350
1955
+ },
1956
+ {
1957
+ "epoch": 1.5074398553747739,
1958
+ "grad_norm": 0.16690307331407334,
1959
+ "learning_rate": 3.455400540702274e-05,
1960
+ "loss": 0.6508,
1961
+ "step": 1355
1962
+ },
1963
+ {
1964
+ "epoch": 1.5130023640661938,
1965
+ "grad_norm": 0.1506720689857848,
1966
+ "learning_rate": 3.3822162040954354e-05,
1967
+ "loss": 0.6347,
1968
+ "step": 1360
1969
+ },
1970
+ {
1971
+ "epoch": 1.5185648727576138,
1972
+ "grad_norm": 0.15108676554413317,
1973
+ "learning_rate": 3.309657135479065e-05,
1974
+ "loss": 0.6449,
1975
+ "step": 1365
1976
+ },
1977
+ {
1978
+ "epoch": 1.5241273814490335,
1979
+ "grad_norm": 0.1397876543411103,
1980
+ "learning_rate": 3.237730190450816e-05,
1981
+ "loss": 0.6382,
1982
+ "step": 1370
1983
+ },
1984
+ {
1985
+ "epoch": 1.5296898901404532,
1986
+ "grad_norm": 0.15000237273739636,
1987
+ "learning_rate": 3.166442164883403e-05,
1988
+ "loss": 0.6607,
1989
+ "step": 1375
1990
+ },
1991
+ {
1992
+ "epoch": 1.5352523988318731,
1993
+ "grad_norm": 0.15901665795803346,
1994
+ "learning_rate": 3.0957997942825336e-05,
1995
+ "loss": 0.6403,
1996
+ "step": 1380
1997
+ },
1998
+ {
1999
+ "epoch": 1.540814907523293,
2000
+ "grad_norm": 0.1500239251091428,
2001
+ "learning_rate": 3.0258097531504937e-05,
2002
+ "loss": 0.6595,
2003
+ "step": 1385
2004
+ },
2005
+ {
2006
+ "epoch": 1.5463774162147128,
2007
+ "grad_norm": 0.16536122507336246,
2008
+ "learning_rate": 2.9564786543555388e-05,
2009
+ "loss": 0.6515,
2010
+ "step": 1390
2011
+ },
2012
+ {
2013
+ "epoch": 1.5519399249061325,
2014
+ "grad_norm": 0.14948767796744467,
2015
+ "learning_rate": 2.8878130485070852e-05,
2016
+ "loss": 0.6341,
2017
+ "step": 1395
2018
+ },
2019
+ {
2020
+ "epoch": 1.5575024335975525,
2021
+ "grad_norm": 0.16630242366164855,
2022
+ "learning_rate": 2.819819423336775e-05,
2023
+ "loss": 0.6458,
2024
+ "step": 1400
2025
+ },
2026
+ {
2027
+ "epoch": 1.5575024335975525,
2028
+ "eval_loss": 0.6669326424598694,
2029
+ "eval_runtime": 38.0635,
2030
+ "eval_samples_per_second": 10.693,
2031
+ "eval_steps_per_second": 0.683,
2032
+ "step": 1400
2033
+ },
2034
+ {
2035
+ "epoch": 1.5630649422889724,
2036
+ "grad_norm": 0.15092670649471096,
2037
+ "learning_rate": 2.7525042030855218e-05,
2038
+ "loss": 0.6285,
2039
+ "step": 1405
2040
+ },
2041
+ {
2042
+ "epoch": 1.5686274509803921,
2043
+ "grad_norm": 0.17817145439236162,
2044
+ "learning_rate": 2.6858737478965035e-05,
2045
+ "loss": 0.6786,
2046
+ "step": 1410
2047
+ },
2048
+ {
2049
+ "epoch": 1.5741899596718119,
2050
+ "grad_norm": 0.14411321645345596,
2051
+ "learning_rate": 2.6199343532142573e-05,
2052
+ "loss": 0.6487,
2053
+ "step": 1415
2054
+ },
2055
+ {
2056
+ "epoch": 1.5797524683632318,
2057
+ "grad_norm": 0.1729923377951436,
2058
+ "learning_rate": 2.5546922491898495e-05,
2059
+ "loss": 0.6466,
2060
+ "step": 1420
2061
+ },
2062
+ {
2063
+ "epoch": 1.5853149770546517,
2064
+ "grad_norm": 0.17040206170029934,
2065
+ "learning_rate": 2.4901536000922497e-05,
2066
+ "loss": 0.665,
2067
+ "step": 1425
2068
+ },
2069
+ {
2070
+ "epoch": 1.5908774857460715,
2071
+ "grad_norm": 0.16055497144569197,
2072
+ "learning_rate": 2.4263245037258995e-05,
2073
+ "loss": 0.6665,
2074
+ "step": 1430
2075
+ },
2076
+ {
2077
+ "epoch": 1.5964399944374912,
2078
+ "grad_norm": 0.1655874585771864,
2079
+ "learning_rate": 2.363210990854582e-05,
2080
+ "loss": 0.6734,
2081
+ "step": 1435
2082
+ },
2083
+ {
2084
+ "epoch": 1.6020025031289111,
2085
+ "grad_norm": 0.1651519002975595,
2086
+ "learning_rate": 2.300819024631603e-05,
2087
+ "loss": 0.6691,
2088
+ "step": 1440
2089
+ },
2090
+ {
2091
+ "epoch": 1.607565011820331,
2092
+ "grad_norm": 0.162990807682259,
2093
+ "learning_rate": 2.239154500036399e-05,
2094
+ "loss": 0.6805,
2095
+ "step": 1445
2096
+ },
2097
+ {
2098
+ "epoch": 1.6131275205117508,
2099
+ "grad_norm": 0.1717600360257139,
2100
+ "learning_rate": 2.178223243317532e-05,
2101
+ "loss": 0.6399,
2102
+ "step": 1450
2103
+ },
2104
+ {
2105
+ "epoch": 1.6186900292031705,
2106
+ "grad_norm": 0.1739831864715599,
2107
+ "learning_rate": 2.1180310114422362e-05,
2108
+ "loss": 0.6689,
2109
+ "step": 1455
2110
+ },
2111
+ {
2112
+ "epoch": 1.6242525378945905,
2113
+ "grad_norm": 0.15498282638359004,
2114
+ "learning_rate": 2.058583491552465e-05,
2115
+ "loss": 0.6675,
2116
+ "step": 1460
2117
+ },
2118
+ {
2119
+ "epoch": 1.6298150465860104,
2120
+ "grad_norm": 0.17242195548428726,
2121
+ "learning_rate": 1.9998863004275593e-05,
2122
+ "loss": 0.6618,
2123
+ "step": 1465
2124
+ },
2125
+ {
2126
+ "epoch": 1.6353775552774301,
2127
+ "grad_norm": 0.16154448145412242,
2128
+ "learning_rate": 1.941944983953552e-05,
2129
+ "loss": 0.6406,
2130
+ "step": 1470
2131
+ },
2132
+ {
2133
+ "epoch": 1.6409400639688498,
2134
+ "grad_norm": 0.1624668193562119,
2135
+ "learning_rate": 1.884765016599186e-05,
2136
+ "loss": 0.6223,
2137
+ "step": 1475
2138
+ },
2139
+ {
2140
+ "epoch": 1.6465025726602698,
2141
+ "grad_norm": 0.16026073767873994,
2142
+ "learning_rate": 1.8283518008986567e-05,
2143
+ "loss": 0.6424,
2144
+ "step": 1480
2145
+ },
2146
+ {
2147
+ "epoch": 1.6520650813516897,
2148
+ "grad_norm": 0.14391017994489602,
2149
+ "learning_rate": 1.7727106669411776e-05,
2150
+ "loss": 0.6323,
2151
+ "step": 1485
2152
+ },
2153
+ {
2154
+ "epoch": 1.6576275900431094,
2155
+ "grad_norm": 0.16566629921954873,
2156
+ "learning_rate": 1.7178468718673714e-05,
2157
+ "loss": 0.6605,
2158
+ "step": 1490
2159
+ },
2160
+ {
2161
+ "epoch": 1.6631900987345292,
2162
+ "grad_norm": 0.15202811145565842,
2163
+ "learning_rate": 1.6637655993725598e-05,
2164
+ "loss": 0.6745,
2165
+ "step": 1495
2166
+ },
2167
+ {
2168
+ "epoch": 1.668752607425949,
2169
+ "grad_norm": 0.14529589529297973,
2170
+ "learning_rate": 1.6104719592169902e-05,
2171
+ "loss": 0.6415,
2172
+ "step": 1500
2173
+ },
2174
+ {
2175
+ "epoch": 1.674315116117369,
2176
+ "grad_norm": 0.16206187759632693,
2177
+ "learning_rate": 1.5579709867430514e-05,
2178
+ "loss": 0.6757,
2179
+ "step": 1505
2180
+ },
2181
+ {
2182
+ "epoch": 1.6798776248087888,
2183
+ "grad_norm": 0.1689334880075168,
2184
+ "learning_rate": 1.5062676423995247e-05,
2185
+ "loss": 0.6648,
2186
+ "step": 1510
2187
+ },
2188
+ {
2189
+ "epoch": 1.6854401335002085,
2190
+ "grad_norm": 0.17002942586162523,
2191
+ "learning_rate": 1.4553668112729025e-05,
2192
+ "loss": 0.6593,
2193
+ "step": 1515
2194
+ },
2195
+ {
2196
+ "epoch": 1.6910026421916284,
2197
+ "grad_norm": 0.15390481859714705,
2198
+ "learning_rate": 1.4052733026258281e-05,
2199
+ "loss": 0.6337,
2200
+ "step": 1520
2201
+ },
2202
+ {
2203
+ "epoch": 1.6965651508830484,
2204
+ "grad_norm": 0.16322817116461236,
2205
+ "learning_rate": 1.3559918494427015e-05,
2206
+ "loss": 0.6671,
2207
+ "step": 1525
2208
+ },
2209
+ {
2210
+ "epoch": 1.702127659574468,
2211
+ "grad_norm": 0.17381037300710486,
2212
+ "learning_rate": 1.3075271079825036e-05,
2213
+ "loss": 0.6452,
2214
+ "step": 1530
2215
+ },
2216
+ {
2217
+ "epoch": 1.7076901682658878,
2218
+ "grad_norm": 0.14385216586204455,
2219
+ "learning_rate": 1.2598836573388383e-05,
2220
+ "loss": 0.6517,
2221
+ "step": 1535
2222
+ },
2223
+ {
2224
+ "epoch": 1.7132526769573078,
2225
+ "grad_norm": 0.13695810287509047,
2226
+ "learning_rate": 1.2130659990073146e-05,
2227
+ "loss": 0.6115,
2228
+ "step": 1540
2229
+ },
2230
+ {
2231
+ "epoch": 1.7188151856487277,
2232
+ "grad_norm": 0.15619086164642515,
2233
+ "learning_rate": 1.1670785564601972e-05,
2234
+ "loss": 0.6592,
2235
+ "step": 1545
2236
+ },
2237
+ {
2238
+ "epoch": 1.7243776943401474,
2239
+ "grad_norm": 0.1359615918791439,
2240
+ "learning_rate": 1.1219256747285045e-05,
2241
+ "loss": 0.6489,
2242
+ "step": 1550
2243
+ },
2244
+ {
2245
+ "epoch": 1.7299402030315671,
2246
+ "grad_norm": 0.1530561734129938,
2247
+ "learning_rate": 1.0776116199914343e-05,
2248
+ "loss": 0.6341,
2249
+ "step": 1555
2250
+ },
2251
+ {
2252
+ "epoch": 1.735502711722987,
2253
+ "grad_norm": 0.1550580447696997,
2254
+ "learning_rate": 1.0341405791733183e-05,
2255
+ "loss": 0.679,
2256
+ "step": 1560
2257
+ },
2258
+ {
2259
+ "epoch": 1.741065220414407,
2260
+ "grad_norm": 0.15343391714899443,
2261
+ "learning_rate": 9.915166595480018e-06,
2262
+ "loss": 0.6174,
2263
+ "step": 1565
2264
+ },
2265
+ {
2266
+ "epoch": 1.7466277291058268,
2267
+ "grad_norm": 0.14886169752976763,
2268
+ "learning_rate": 9.49743888350798e-06,
2269
+ "loss": 0.658,
2270
+ "step": 1570
2271
+ },
2272
+ {
2273
+ "epoch": 1.7521902377972465,
2274
+ "grad_norm": 0.16009066506717334,
2275
+ "learning_rate": 9.088262123979652e-06,
2276
+ "loss": 0.6547,
2277
+ "step": 1575
2278
+ },
2279
+ {
2280
+ "epoch": 1.7577527464886664,
2281
+ "grad_norm": 0.1527884120085583,
2282
+ "learning_rate": 8.687674977138116e-06,
2283
+ "loss": 0.6355,
2284
+ "step": 1580
2285
+ },
2286
+ {
2287
+ "epoch": 1.7633152551800864,
2288
+ "grad_norm": 0.15622894390259232,
2289
+ "learning_rate": 8.295715291654205e-06,
2290
+ "loss": 0.6415,
2291
+ "step": 1585
2292
+ },
2293
+ {
2294
+ "epoch": 1.768877763871506,
2295
+ "grad_norm": 0.13821573876875842,
2296
+ "learning_rate": 7.912420101050367e-06,
2297
+ "loss": 0.6386,
2298
+ "step": 1590
2299
+ },
2300
+ {
2301
+ "epoch": 1.7744402725629258,
2302
+ "grad_norm": 0.1605607168007031,
2303
+ "learning_rate": 7.537825620201699e-06,
2304
+ "loss": 0.6395,
2305
+ "step": 1595
2306
+ },
2307
+ {
2308
+ "epoch": 1.7800027812543457,
2309
+ "grad_norm": 0.14785735762140897,
2310
+ "learning_rate": 7.171967241914224e-06,
2311
+ "loss": 0.632,
2312
+ "step": 1600
2313
+ },
2314
+ {
2315
+ "epoch": 1.7800027812543457,
2316
+ "eval_loss": 0.6654813289642334,
2317
+ "eval_runtime": 38.0745,
2318
+ "eval_samples_per_second": 10.69,
2319
+ "eval_steps_per_second": 0.683,
2320
+ "step": 1600
2321
+ },
2322
+ {
2323
+ "epoch": 1.7855652899457657,
2324
+ "grad_norm": 0.15930848069616485,
2325
+ "learning_rate": 6.814879533580898e-06,
2326
+ "loss": 0.6298,
2327
+ "step": 1605
2328
+ },
2329
+ {
2330
+ "epoch": 1.7911277986371854,
2331
+ "grad_norm": 0.16408233749219603,
2332
+ "learning_rate": 6.4665962339156005e-06,
2333
+ "loss": 0.6383,
2334
+ "step": 1610
2335
+ },
2336
+ {
2337
+ "epoch": 1.7966903073286051,
2338
+ "grad_norm": 0.1507405199602565,
2339
+ "learning_rate": 6.127150249765335e-06,
2340
+ "loss": 0.6538,
2341
+ "step": 1615
2342
+ },
2343
+ {
2344
+ "epoch": 1.802252816020025,
2345
+ "grad_norm": 0.15449896745701766,
2346
+ "learning_rate": 5.7965736530010916e-06,
2347
+ "loss": 0.6443,
2348
+ "step": 1620
2349
+ },
2350
+ {
2351
+ "epoch": 1.807815324711445,
2352
+ "grad_norm": 0.15430381806456947,
2353
+ "learning_rate": 5.474897677487711e-06,
2354
+ "loss": 0.6714,
2355
+ "step": 1625
2356
+ },
2357
+ {
2358
+ "epoch": 1.8133778334028647,
2359
+ "grad_norm": 0.1522219198315468,
2360
+ "learning_rate": 5.162152716132662e-06,
2361
+ "loss": 0.6462,
2362
+ "step": 1630
2363
+ },
2364
+ {
2365
+ "epoch": 1.8189403420942845,
2366
+ "grad_norm": 0.14950817969442623,
2367
+ "learning_rate": 4.858368318014572e-06,
2368
+ "loss": 0.6309,
2369
+ "step": 1635
2370
+ },
2371
+ {
2372
+ "epoch": 1.8245028507857044,
2373
+ "grad_norm": 0.16343103461857358,
2374
+ "learning_rate": 4.563573185591219e-06,
2375
+ "loss": 0.6229,
2376
+ "step": 1640
2377
+ },
2378
+ {
2379
+ "epoch": 1.8300653594771243,
2380
+ "grad_norm": 0.16046392285596128,
2381
+ "learning_rate": 4.2777951719877415e-06,
2382
+ "loss": 0.6526,
2383
+ "step": 1645
2384
+ },
2385
+ {
2386
+ "epoch": 1.835627868168544,
2387
+ "grad_norm": 0.16358481948661036,
2388
+ "learning_rate": 4.0010612783648925e-06,
2389
+ "loss": 0.6407,
2390
+ "step": 1650
2391
+ },
2392
+ {
2393
+ "epoch": 1.8411903768599638,
2394
+ "grad_norm": 0.15364511127769814,
2395
+ "learning_rate": 3.7333976513680093e-06,
2396
+ "loss": 0.6375,
2397
+ "step": 1655
2398
+ },
2399
+ {
2400
+ "epoch": 1.8467528855513837,
2401
+ "grad_norm": 0.1582298293905816,
2402
+ "learning_rate": 3.4748295806564356e-06,
2403
+ "loss": 0.6486,
2404
+ "step": 1660
2405
+ },
2406
+ {
2407
+ "epoch": 1.8523153942428037,
2408
+ "grad_norm": 0.1626273297442105,
2409
+ "learning_rate": 3.2253814965142683e-06,
2410
+ "loss": 0.6682,
2411
+ "step": 1665
2412
+ },
2413
+ {
2414
+ "epoch": 1.8578779029342232,
2415
+ "grad_norm": 0.15566169967079257,
2416
+ "learning_rate": 2.9850769675419774e-06,
2417
+ "loss": 0.6531,
2418
+ "step": 1670
2419
+ },
2420
+ {
2421
+ "epoch": 1.8634404116256431,
2422
+ "grad_norm": 0.1785306629362279,
2423
+ "learning_rate": 2.7539386984296147e-06,
2424
+ "loss": 0.6406,
2425
+ "step": 1675
2426
+ },
2427
+ {
2428
+ "epoch": 1.869002920317063,
2429
+ "grad_norm": 0.14973888951794095,
2430
+ "learning_rate": 2.5319885278115906e-06,
2431
+ "loss": 0.6441,
2432
+ "step": 1680
2433
+ },
2434
+ {
2435
+ "epoch": 1.8745654290084828,
2436
+ "grad_norm": 0.1493522152423315,
2437
+ "learning_rate": 2.3192474262033638e-06,
2438
+ "loss": 0.6377,
2439
+ "step": 1685
2440
+ },
2441
+ {
2442
+ "epoch": 1.8801279376999025,
2443
+ "grad_norm": 0.1611037940241023,
2444
+ "learning_rate": 2.115735494019966e-06,
2445
+ "loss": 0.6785,
2446
+ "step": 1690
2447
+ },
2448
+ {
2449
+ "epoch": 1.8856904463913224,
2450
+ "grad_norm": 0.15767721827989367,
2451
+ "learning_rate": 1.921471959676957e-06,
2452
+ "loss": 0.667,
2453
+ "step": 1695
2454
+ },
2455
+ {
2456
+ "epoch": 1.8912529550827424,
2457
+ "grad_norm": 0.15987860636699017,
2458
+ "learning_rate": 1.7364751777736332e-06,
2459
+ "loss": 0.6374,
2460
+ "step": 1700
2461
+ },
2462
+ {
2463
+ "epoch": 1.896815463774162,
2464
+ "grad_norm": 0.14274620551242612,
2465
+ "learning_rate": 1.5607626273588138e-06,
2466
+ "loss": 0.6305,
2467
+ "step": 1705
2468
+ },
2469
+ {
2470
+ "epoch": 1.9023779724655818,
2471
+ "grad_norm": 0.1539956730318316,
2472
+ "learning_rate": 1.394350910279385e-06,
2473
+ "loss": 0.6446,
2474
+ "step": 1710
2475
+ },
2476
+ {
2477
+ "epoch": 1.9079404811570018,
2478
+ "grad_norm": 0.16255063308547452,
2479
+ "learning_rate": 1.2372557496116877e-06,
2480
+ "loss": 0.6577,
2481
+ "step": 1715
2482
+ },
2483
+ {
2484
+ "epoch": 1.9135029898484217,
2485
+ "grad_norm": 0.1457148304946188,
2486
+ "learning_rate": 1.089491988176017e-06,
2487
+ "loss": 0.6671,
2488
+ "step": 1720
2489
+ },
2490
+ {
2491
+ "epoch": 1.9190654985398414,
2492
+ "grad_norm": 0.16991284681377727,
2493
+ "learning_rate": 9.510735871341103e-07,
2494
+ "loss": 0.6693,
2495
+ "step": 1725
2496
+ },
2497
+ {
2498
+ "epoch": 1.9246280072312612,
2499
+ "grad_norm": 0.15748923168566117,
2500
+ "learning_rate": 8.220136246701926e-07,
2501
+ "loss": 0.6491,
2502
+ "step": 1730
2503
+ },
2504
+ {
2505
+ "epoch": 1.930190515922681,
2506
+ "grad_norm": 0.15744123180048702,
2507
+ "learning_rate": 7.023242947552078e-07,
2508
+ "loss": 0.6366,
2509
+ "step": 1735
2510
+ },
2511
+ {
2512
+ "epoch": 1.935753024614101,
2513
+ "grad_norm": 0.15109132227982583,
2514
+ "learning_rate": 5.920169059947411e-07,
2515
+ "loss": 0.6553,
2516
+ "step": 1740
2517
+ },
2518
+ {
2519
+ "epoch": 1.9413155333055208,
2520
+ "grad_norm": 0.16542652334128852,
2521
+ "learning_rate": 4.911018805605406e-07,
2522
+ "loss": 0.6395,
2523
+ "step": 1745
2524
+ },
2525
+ {
2526
+ "epoch": 1.9468780419969405,
2527
+ "grad_norm": 0.15918345337184334,
2528
+ "learning_rate": 3.9958875320580404e-07,
2529
+ "loss": 0.6502,
2530
+ "step": 1750
2531
+ },
2532
+ {
2533
+ "epoch": 1.9524405506883604,
2534
+ "grad_norm": 0.17686582944306006,
2535
+ "learning_rate": 3.1748617036427843e-07,
2536
+ "loss": 0.6586,
2537
+ "step": 1755
2538
+ },
2539
+ {
2540
+ "epoch": 1.9580030593797804,
2541
+ "grad_norm": 0.1426907741483869,
2542
+ "learning_rate": 2.448018893333681e-07,
2543
+ "loss": 0.6345,
2544
+ "step": 1760
2545
+ },
2546
+ {
2547
+ "epoch": 1.9635655680712,
2548
+ "grad_norm": 0.14223763889913488,
2549
+ "learning_rate": 1.815427775411549e-07,
2550
+ "loss": 0.6296,
2551
+ "step": 1765
2552
+ },
2553
+ {
2554
+ "epoch": 1.9691280767626198,
2555
+ "grad_norm": 0.14802968415791257,
2556
+ "learning_rate": 1.277148118975835e-07,
2557
+ "loss": 0.6167,
2558
+ "step": 1770
2559
+ },
2560
+ {
2561
+ "epoch": 1.9746905854540397,
2562
+ "grad_norm": 0.16664999019696625,
2563
+ "learning_rate": 8.332307822971342e-08,
2564
+ "loss": 0.6522,
2565
+ "step": 1775
2566
+ },
2567
+ {
2568
+ "epoch": 1.9802530941454597,
2569
+ "grad_norm": 0.15531644850942158,
2570
+ "learning_rate": 4.837177080119215e-08,
2571
+ "loss": 0.6587,
2572
+ "step": 1780
2573
+ },
2574
+ {
2575
+ "epoch": 1.9858156028368794,
2576
+ "grad_norm": 0.17774727599107124,
2577
+ "learning_rate": 2.286419191601663e-08,
2578
+ "loss": 0.6634,
2579
+ "step": 1785
2580
+ },
2581
+ {
2582
+ "epoch": 1.9913781115282991,
2583
+ "grad_norm": 0.1591360139721982,
2584
+ "learning_rate": 6.8027516064606e-09,
2585
+ "loss": 0.6466,
2586
+ "step": 1790
2587
+ },
2588
+ {
2589
+ "epoch": 1.996940620219719,
2590
+ "grad_norm": 0.1816483541352177,
2591
+ "learning_rate": 1.8896740540119696e-10,
2592
+ "loss": 0.6307,
2593
+ "step": 1795
2594
+ },
2595
+ {
2596
+ "epoch": 1.998053121958003,
2597
+ "step": 1796,
2598
+ "total_flos": 7795310658584576.0,
2599
+ "train_loss": 0.6907267017260957,
2600
+ "train_runtime": 37673.7111,
2601
+ "train_samples_per_second": 3.054,
2602
  "train_steps_per_second": 0.048
2603
  }
2604
  ],
2605
  "logging_steps": 5,
2606
+ "max_steps": 1796,
2607
  "num_input_tokens_seen": 0,
2608
+ "num_train_epochs": 2,
2609
  "save_steps": 100,
2610
+ "total_flos": 7795310658584576.0,
2611
  "train_batch_size": 2,
2612
  "trial_name": null,
2613
  "trial_params": null