moussaKam commited on
Commit
03b8e23
·
verified ·
1 Parent(s): 7e719e1
Files changed (6) hide show
  1. config.json +1 -1
  2. latest +1 -1
  3. model.safetensors +1 -1
  4. trainer_state.json +606 -2510
  5. training_args.bin +2 -2
  6. zero_to_fp32.py +84 -14
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "moussaKam/fr-qwen-1.5B-base",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "/lustre/fsn1/projects/rech/gkb/uua32zb/grand_challenge/checkpoints/Qwen__Qwen2.5-1.5B-pretraining-fineweb2-0.0001LR-8192CL-1GAS-4BS-1EPOCHS-0.9BETA1-0.95BETA2/",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
latest CHANGED
@@ -1 +1 @@
1
- global_step3962
 
1
+ global_step6237
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b7c002ba46b916c79fff8f94759cc0cf8fbec829a682cd017fb9e23609dfab5
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc5edf1bc45f08dfaeca221b40e789f61302043d25115984818a52a274f213be
3
  size 3554214752
trainer_state.json CHANGED
@@ -1,2791 +1,887 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9998107374929026,
5
- "eval_steps": 500,
6
- "global_step": 3962,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0025235000946312535,
13
- "grad_norm": 3.572803497314453,
14
- "learning_rate": 1.2594458438287156e-06,
15
- "loss": 1.0672,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.005047000189262507,
20
- "grad_norm": 1.6470932960510254,
21
- "learning_rate": 2.518891687657431e-06,
22
- "loss": 1.0001,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.007570500283893761,
27
- "grad_norm": 1.1262171268463135,
28
- "learning_rate": 3.7783375314861467e-06,
29
- "loss": 0.9414,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.010094000378525014,
34
- "grad_norm": 0.8495129346847534,
35
- "learning_rate": 5.037783375314862e-06,
36
- "loss": 0.9321,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.012617500473156268,
41
- "grad_norm": 0.8612141013145447,
42
- "learning_rate": 6.297229219143577e-06,
43
- "loss": 0.8746,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.015141000567787522,
48
- "grad_norm": 0.8412306308746338,
49
- "learning_rate": 7.556675062972293e-06,
50
- "loss": 0.9044,
51
- "step": 60
52
- },
53
- {
54
- "epoch": 0.017664500662418776,
55
- "grad_norm": 0.8401440978050232,
56
- "learning_rate": 8.816120906801008e-06,
57
- "loss": 0.9022,
58
- "step": 70
59
- },
60
- {
61
- "epoch": 0.020188000757050028,
62
- "grad_norm": 0.86940997838974,
63
- "learning_rate": 1.0075566750629725e-05,
64
- "loss": 0.8887,
65
- "step": 80
66
- },
67
- {
68
- "epoch": 0.022711500851681284,
69
- "grad_norm": 0.8858376741409302,
70
- "learning_rate": 1.133501259445844e-05,
71
- "loss": 0.8715,
72
- "step": 90
73
- },
74
- {
75
- "epoch": 0.025235000946312536,
76
- "grad_norm": 0.8635324239730835,
77
- "learning_rate": 1.2594458438287154e-05,
78
- "loss": 0.8709,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.027758501040943788,
83
- "grad_norm": 0.9026916027069092,
84
- "learning_rate": 1.385390428211587e-05,
85
- "loss": 0.9428,
86
- "step": 110
87
- },
88
- {
89
- "epoch": 0.030282001135575044,
90
- "grad_norm": 0.6949167847633362,
91
- "learning_rate": 1.5113350125944587e-05,
92
- "loss": 0.9041,
93
- "step": 120
94
- },
95
- {
96
- "epoch": 0.0328055012302063,
97
- "grad_norm": 0.9495663046836853,
98
- "learning_rate": 1.63727959697733e-05,
99
- "loss": 0.8636,
100
- "step": 130
101
- },
102
- {
103
- "epoch": 0.03532900132483755,
104
- "grad_norm": 0.8648976683616638,
105
- "learning_rate": 1.7632241813602016e-05,
106
- "loss": 0.8471,
107
- "step": 140
108
- },
109
- {
110
- "epoch": 0.037852501419468804,
111
- "grad_norm": 0.8415878415107727,
112
- "learning_rate": 1.8891687657430733e-05,
113
- "loss": 0.9108,
114
- "step": 150
115
- },
116
- {
117
- "epoch": 0.040376001514100056,
118
- "grad_norm": 0.8484784364700317,
119
- "learning_rate": 2.015113350125945e-05,
120
- "loss": 0.8883,
121
- "step": 160
122
- },
123
- {
124
- "epoch": 0.04289950160873131,
125
- "grad_norm": 0.8620557188987732,
126
- "learning_rate": 2.1410579345088162e-05,
127
- "loss": 0.8657,
128
- "step": 170
129
- },
130
- {
131
- "epoch": 0.04542300170336257,
132
- "grad_norm": 0.8222241401672363,
133
- "learning_rate": 2.267002518891688e-05,
134
- "loss": 0.9174,
135
- "step": 180
136
- },
137
- {
138
- "epoch": 0.04794650179799382,
139
- "grad_norm": 0.8526296019554138,
140
- "learning_rate": 2.392947103274559e-05,
141
- "loss": 0.8997,
142
- "step": 190
143
- },
144
- {
145
- "epoch": 0.05047000189262507,
146
- "grad_norm": 0.8018633723258972,
147
- "learning_rate": 2.5188916876574308e-05,
148
- "loss": 0.9076,
149
- "step": 200
150
- },
151
- {
152
- "epoch": 0.052993501987256324,
153
- "grad_norm": 0.859157145023346,
154
- "learning_rate": 2.6448362720403024e-05,
155
- "loss": 0.8891,
156
- "step": 210
157
- },
158
- {
159
- "epoch": 0.055517002081887576,
160
- "grad_norm": 0.7516281604766846,
161
- "learning_rate": 2.770780856423174e-05,
162
- "loss": 0.9026,
163
- "step": 220
164
- },
165
- {
166
- "epoch": 0.058040502176518835,
167
- "grad_norm": 0.8353524804115295,
168
- "learning_rate": 2.8967254408060457e-05,
169
- "loss": 0.8393,
170
- "step": 230
171
- },
172
- {
173
- "epoch": 0.06056400227115009,
174
- "grad_norm": 0.7622519731521606,
175
- "learning_rate": 3.0226700251889174e-05,
176
- "loss": 0.8524,
177
- "step": 240
178
- },
179
- {
180
- "epoch": 0.06308750236578134,
181
- "grad_norm": 0.8780621290206909,
182
- "learning_rate": 3.148614609571788e-05,
183
- "loss": 0.9286,
184
- "step": 250
185
- },
186
- {
187
- "epoch": 0.0656110024604126,
188
- "grad_norm": 0.9684115052223206,
189
- "learning_rate": 3.27455919395466e-05,
190
- "loss": 0.8974,
191
- "step": 260
192
- },
193
- {
194
- "epoch": 0.06813450255504384,
195
- "grad_norm": 0.8870697617530823,
196
- "learning_rate": 3.4005037783375316e-05,
197
- "loss": 0.8969,
198
- "step": 270
199
- },
200
- {
201
- "epoch": 0.0706580026496751,
202
- "grad_norm": 0.8952546119689941,
203
- "learning_rate": 3.526448362720403e-05,
204
- "loss": 0.8689,
205
- "step": 280
206
- },
207
- {
208
- "epoch": 0.07318150274430635,
209
- "grad_norm": 0.9221532344818115,
210
- "learning_rate": 3.652392947103275e-05,
211
- "loss": 0.883,
212
- "step": 290
213
- },
214
- {
215
- "epoch": 0.07570500283893761,
216
- "grad_norm": 0.9585578441619873,
217
- "learning_rate": 3.7783375314861465e-05,
218
- "loss": 0.8508,
219
- "step": 300
220
- },
221
- {
222
- "epoch": 0.07822850293356887,
223
- "grad_norm": 0.9648734331130981,
224
- "learning_rate": 3.904282115869018e-05,
225
- "loss": 0.8982,
226
- "step": 310
227
- },
228
- {
229
- "epoch": 0.08075200302820011,
230
- "grad_norm": 0.8147997260093689,
231
- "learning_rate": 4.03022670025189e-05,
232
- "loss": 0.8694,
233
- "step": 320
234
- },
235
- {
236
- "epoch": 0.08327550312283137,
237
- "grad_norm": 0.8099369406700134,
238
- "learning_rate": 4.1561712846347615e-05,
239
- "loss": 0.8929,
240
- "step": 330
241
- },
242
- {
243
- "epoch": 0.08579900321746262,
244
- "grad_norm": 0.8512017130851746,
245
- "learning_rate": 4.2821158690176324e-05,
246
- "loss": 0.86,
247
- "step": 340
248
- },
249
- {
250
- "epoch": 0.08832250331209388,
251
- "grad_norm": 0.8499712347984314,
252
- "learning_rate": 4.408060453400504e-05,
253
- "loss": 0.8423,
254
- "step": 350
255
- },
256
- {
257
- "epoch": 0.09084600340672513,
258
- "grad_norm": 0.8530069589614868,
259
- "learning_rate": 4.534005037783376e-05,
260
- "loss": 0.8852,
261
- "step": 360
262
- },
263
- {
264
- "epoch": 0.09336950350135638,
265
- "grad_norm": 0.8837921023368835,
266
- "learning_rate": 4.659949622166247e-05,
267
- "loss": 0.8864,
268
- "step": 370
269
- },
270
- {
271
- "epoch": 0.09589300359598764,
272
- "grad_norm": 0.8840718865394592,
273
- "learning_rate": 4.785894206549118e-05,
274
- "loss": 0.8999,
275
- "step": 380
276
- },
277
- {
278
- "epoch": 0.09841650369061888,
279
- "grad_norm": 0.7395176887512207,
280
- "learning_rate": 4.91183879093199e-05,
281
- "loss": 0.8954,
282
- "step": 390
283
- },
284
- {
285
- "epoch": 0.10094000378525014,
286
- "grad_norm": 0.8697903752326965,
287
- "learning_rate": 4.999991263591223e-05,
288
- "loss": 0.8353,
289
- "step": 400
290
- },
291
- {
292
- "epoch": 0.1034635038798814,
293
- "grad_norm": 0.8527745008468628,
294
- "learning_rate": 4.9998359513560176e-05,
295
- "loss": 0.8591,
296
- "step": 410
297
- },
298
- {
299
- "epoch": 0.10598700397451265,
300
- "grad_norm": 0.806920051574707,
301
- "learning_rate": 4.999486510586282e-05,
302
- "loss": 0.9076,
303
- "step": 420
304
- },
305
- {
306
- "epoch": 0.1085105040691439,
307
- "grad_norm": 0.8428360223770142,
308
- "learning_rate": 4.9989429684183686e-05,
309
- "loss": 0.9032,
310
- "step": 430
311
- },
312
- {
313
- "epoch": 0.11103400416377515,
314
- "grad_norm": 0.8436294198036194,
315
- "learning_rate": 4.9982053670618626e-05,
316
- "loss": 0.8894,
317
- "step": 440
318
- },
319
- {
320
- "epoch": 0.11355750425840641,
321
- "grad_norm": 0.7664760947227478,
322
- "learning_rate": 4.997273763796312e-05,
323
- "loss": 0.8732,
324
- "step": 450
325
- },
326
- {
327
- "epoch": 0.11608100435303767,
328
- "grad_norm": 0.9134059548377991,
329
- "learning_rate": 4.996148230966775e-05,
330
- "loss": 0.8438,
331
- "step": 460
332
- },
333
- {
334
- "epoch": 0.11860450444766892,
335
- "grad_norm": 0.849233090877533,
336
- "learning_rate": 4.994828855978202e-05,
337
- "loss": 0.9281,
338
- "step": 470
339
- },
340
- {
341
- "epoch": 0.12112800454230017,
342
- "grad_norm": 0.8473492860794067,
343
- "learning_rate": 4.99331574128865e-05,
344
- "loss": 0.8368,
345
- "step": 480
346
- },
347
- {
348
- "epoch": 0.12365150463693142,
349
- "grad_norm": 0.8474506735801697,
350
- "learning_rate": 4.991609004401324e-05,
351
- "loss": 0.8852,
352
- "step": 490
353
- },
354
- {
355
- "epoch": 0.12617500473156268,
356
- "grad_norm": 0.7737945318222046,
357
- "learning_rate": 4.989708777855453e-05,
358
- "loss": 0.8881,
359
- "step": 500
360
- },
361
- {
362
- "epoch": 0.12869850482619394,
363
- "grad_norm": 0.6961573958396912,
364
- "learning_rate": 4.9876152092159994e-05,
365
- "loss": 0.9173,
366
- "step": 510
367
- },
368
- {
369
- "epoch": 0.1312220049208252,
370
- "grad_norm": 0.7320950031280518,
371
- "learning_rate": 4.985328461062195e-05,
372
- "loss": 0.8899,
373
- "step": 520
374
- },
375
- {
376
- "epoch": 0.13374550501545643,
377
- "grad_norm": 0.7261886596679688,
378
- "learning_rate": 4.98284871097492e-05,
379
- "loss": 0.8855,
380
- "step": 530
381
- },
382
- {
383
- "epoch": 0.1362690051100877,
384
- "grad_norm": 0.7850842475891113,
385
- "learning_rate": 4.98017615152291e-05,
386
- "loss": 0.884,
387
- "step": 540
388
- },
389
- {
390
- "epoch": 0.13879250520471895,
391
- "grad_norm": 0.8015512228012085,
392
- "learning_rate": 4.977310990247807e-05,
393
- "loss": 0.8767,
394
- "step": 550
395
- },
396
- {
397
- "epoch": 0.1413160052993502,
398
- "grad_norm": 0.8083379864692688,
399
- "learning_rate": 4.974253449648031e-05,
400
- "loss": 0.8861,
401
- "step": 560
402
- },
403
- {
404
- "epoch": 0.14383950539398146,
405
- "grad_norm": 0.7726438045501709,
406
- "learning_rate": 4.971003767161516e-05,
407
- "loss": 0.8747,
408
- "step": 570
409
- },
410
- {
411
- "epoch": 0.1463630054886127,
412
- "grad_norm": 0.7719607949256897,
413
- "learning_rate": 4.9675621951472584e-05,
414
- "loss": 0.8862,
415
- "step": 580
416
- },
417
- {
418
- "epoch": 0.14888650558324396,
419
- "grad_norm": 0.7348030209541321,
420
- "learning_rate": 4.9639290008657304e-05,
421
- "loss": 0.8915,
422
- "step": 590
423
- },
424
- {
425
- "epoch": 0.15141000567787521,
426
- "grad_norm": 0.7903275489807129,
427
- "learning_rate": 4.960104466458118e-05,
428
- "loss": 0.8916,
429
- "step": 600
430
- },
431
- {
432
- "epoch": 0.15393350577250647,
433
- "grad_norm": 0.778893232345581,
434
- "learning_rate": 4.956088888924414e-05,
435
- "loss": 0.8674,
436
- "step": 610
437
- },
438
- {
439
- "epoch": 0.15645700586713773,
440
- "grad_norm": 0.7972787618637085,
441
- "learning_rate": 4.951882580100353e-05,
442
- "loss": 0.8908,
443
- "step": 620
444
- },
445
- {
446
- "epoch": 0.15898050596176896,
447
- "grad_norm": 0.741663932800293,
448
- "learning_rate": 4.947485866633199e-05,
449
- "loss": 0.8876,
450
- "step": 630
451
- },
452
- {
453
- "epoch": 0.16150400605640022,
454
- "grad_norm": 0.8059448003768921,
455
- "learning_rate": 4.94289908995637e-05,
456
- "loss": 0.8164,
457
- "step": 640
458
- },
459
- {
460
- "epoch": 0.16402750615103148,
461
- "grad_norm": 0.8004572987556458,
462
- "learning_rate": 4.938122606262936e-05,
463
- "loss": 0.9075,
464
- "step": 650
465
- },
466
- {
467
- "epoch": 0.16655100624566274,
468
- "grad_norm": 1.5734790563583374,
469
- "learning_rate": 4.9331567864779457e-05,
470
- "loss": 0.9146,
471
- "step": 660
472
- },
473
- {
474
- "epoch": 0.169074506340294,
475
- "grad_norm": 0.7215042114257812,
476
- "learning_rate": 4.928002016229634e-05,
477
- "loss": 0.885,
478
- "step": 670
479
- },
480
- {
481
- "epoch": 0.17159800643492523,
482
- "grad_norm": 0.6652220487594604,
483
- "learning_rate": 4.9226586958194647e-05,
484
- "loss": 0.9085,
485
- "step": 680
486
- },
487
- {
488
- "epoch": 0.1741215065295565,
489
- "grad_norm": 0.676682710647583,
490
- "learning_rate": 4.9171272401910504e-05,
491
- "loss": 0.837,
492
- "step": 690
493
- },
494
- {
495
- "epoch": 0.17664500662418775,
496
- "grad_norm": 0.7034597396850586,
497
- "learning_rate": 4.9114080788979284e-05,
498
- "loss": 0.8905,
499
- "step": 700
500
- },
501
- {
502
- "epoch": 0.179168506718819,
503
- "grad_norm": 0.7657853960990906,
504
- "learning_rate": 4.905501656070202e-05,
505
- "loss": 0.8945,
506
- "step": 710
507
- },
508
- {
509
- "epoch": 0.18169200681345027,
510
- "grad_norm": 0.7395844459533691,
511
- "learning_rate": 4.8994084303800525e-05,
512
- "loss": 0.8762,
513
- "step": 720
514
- },
515
- {
516
- "epoch": 0.1842155069080815,
517
- "grad_norm": 0.7073786854743958,
518
- "learning_rate": 4.89312887500612e-05,
519
- "loss": 0.8824,
520
- "step": 730
521
- },
522
- {
523
- "epoch": 0.18673900700271276,
524
- "grad_norm": 0.7239210605621338,
525
- "learning_rate": 4.8866634775967544e-05,
526
- "loss": 0.8855,
527
- "step": 740
528
- },
529
- {
530
- "epoch": 0.18926250709734402,
531
- "grad_norm": 0.6406372785568237,
532
- "learning_rate": 4.880012740232154e-05,
533
- "loss": 0.8775,
534
- "step": 750
535
- },
536
- {
537
- "epoch": 0.19178600719197528,
538
- "grad_norm": 0.76404869556427,
539
- "learning_rate": 4.873177179385368e-05,
540
- "loss": 0.862,
541
- "step": 760
542
- },
543
- {
544
- "epoch": 0.19430950728660654,
545
- "grad_norm": 0.7401562929153442,
546
- "learning_rate": 4.866157325882192e-05,
547
- "loss": 0.8734,
548
- "step": 770
549
- },
550
- {
551
- "epoch": 0.19683300738123777,
552
- "grad_norm": 0.7563286423683167,
553
- "learning_rate": 4.858953724859948e-05,
554
- "loss": 0.8652,
555
- "step": 780
556
- },
557
- {
558
- "epoch": 0.19935650747586903,
559
- "grad_norm": 0.7244860529899597,
560
- "learning_rate": 4.851566935725147e-05,
561
- "loss": 0.8436,
562
- "step": 790
563
- },
564
- {
565
- "epoch": 0.20188000757050029,
566
- "grad_norm": 0.7061064839363098,
567
- "learning_rate": 4.843997532110051e-05,
568
- "loss": 0.8717,
569
- "step": 800
570
- },
571
- {
572
- "epoch": 0.20440350766513155,
573
- "grad_norm": 0.9287751913070679,
574
- "learning_rate": 4.836246101828124e-05,
575
- "loss": 0.884,
576
- "step": 810
577
- },
578
- {
579
- "epoch": 0.2069270077597628,
580
- "grad_norm": 0.6689929366111755,
581
- "learning_rate": 4.828313246828386e-05,
582
- "loss": 0.8871,
583
- "step": 820
584
- },
585
- {
586
- "epoch": 0.20945050785439404,
587
- "grad_norm": 0.7176743149757385,
588
- "learning_rate": 4.820199583148667e-05,
589
- "loss": 0.8799,
590
- "step": 830
591
- },
592
- {
593
- "epoch": 0.2119740079490253,
594
- "grad_norm": 0.6979175209999084,
595
- "learning_rate": 4.811905740867769e-05,
596
- "loss": 0.8874,
597
- "step": 840
598
- },
599
- {
600
- "epoch": 0.21449750804365655,
601
- "grad_norm": 0.780451774597168,
602
- "learning_rate": 4.803432364056535e-05,
603
- "loss": 0.8843,
604
- "step": 850
605
- },
606
- {
607
- "epoch": 0.2170210081382878,
608
- "grad_norm": 0.7175182700157166,
609
- "learning_rate": 4.794780110727832e-05,
610
- "loss": 0.8578,
611
- "step": 860
612
- },
613
- {
614
- "epoch": 0.21954450823291907,
615
- "grad_norm": 0.724116861820221,
616
- "learning_rate": 4.785949652785453e-05,
617
- "loss": 0.8869,
618
- "step": 870
619
- },
620
- {
621
- "epoch": 0.2220680083275503,
622
- "grad_norm": 0.8724785447120667,
623
- "learning_rate": 4.776941675971941e-05,
624
- "loss": 0.8648,
625
- "step": 880
626
- },
627
- {
628
- "epoch": 0.22459150842218156,
629
- "grad_norm": 0.7354777455329895,
630
- "learning_rate": 4.767756879815334e-05,
631
- "loss": 0.8683,
632
- "step": 890
633
- },
634
- {
635
- "epoch": 0.22711500851681282,
636
- "grad_norm": 0.7593517899513245,
637
- "learning_rate": 4.758395977574841e-05,
638
- "loss": 0.9101,
639
- "step": 900
640
- },
641
- {
642
- "epoch": 0.22963850861144408,
643
- "grad_norm": 0.7243201732635498,
644
- "learning_rate": 4.748859696185458e-05,
645
- "loss": 0.8945,
646
- "step": 910
647
- },
648
- {
649
- "epoch": 0.23216200870607534,
650
- "grad_norm": 0.6870005130767822,
651
- "learning_rate": 4.739148776201512e-05,
652
- "loss": 0.8158,
653
- "step": 920
654
- },
655
- {
656
- "epoch": 0.23468550880070657,
657
- "grad_norm": 0.7116649746894836,
658
- "learning_rate": 4.729263971739154e-05,
659
- "loss": 0.8855,
660
- "step": 930
661
- },
662
- {
663
- "epoch": 0.23720900889533783,
664
- "grad_norm": 0.6931096911430359,
665
- "learning_rate": 4.719206050417796e-05,
666
- "loss": 0.8674,
667
- "step": 940
668
- },
669
- {
670
- "epoch": 0.2397325089899691,
671
- "grad_norm": 0.7314079999923706,
672
- "learning_rate": 4.7089757933005016e-05,
673
- "loss": 0.8743,
674
- "step": 950
675
- },
676
- {
677
- "epoch": 0.24225600908460035,
678
- "grad_norm": 0.7538678646087646,
679
- "learning_rate": 4.698573994833332e-05,
680
- "loss": 0.866,
681
- "step": 960
682
- },
683
- {
684
- "epoch": 0.2447795091792316,
685
- "grad_norm": 0.6961751580238342,
686
- "learning_rate": 4.688001462783648e-05,
687
- "loss": 0.8528,
688
- "step": 970
689
- },
690
- {
691
- "epoch": 0.24730300927386284,
692
- "grad_norm": 0.7808176875114441,
693
- "learning_rate": 4.6772590181773866e-05,
694
- "loss": 0.8315,
695
- "step": 980
696
- },
697
- {
698
- "epoch": 0.2498265093684941,
699
- "grad_norm": 0.716074526309967,
700
- "learning_rate": 4.6663474952353004e-05,
701
- "loss": 0.8372,
702
- "step": 990
703
- },
704
- {
705
- "epoch": 0.25235000946312536,
706
- "grad_norm": 0.8192372918128967,
707
- "learning_rate": 4.6552677413081756e-05,
708
- "loss": 0.902,
709
- "step": 1000
710
- },
711
- {
712
- "epoch": 0.2548735095577566,
713
- "grad_norm": 0.7088383436203003,
714
- "learning_rate": 4.644020616811029e-05,
715
- "loss": 0.8847,
716
- "step": 1010
717
- },
718
- {
719
- "epoch": 0.2573970096523879,
720
- "grad_norm": 0.8579234480857849,
721
- "learning_rate": 4.6326069951562924e-05,
722
- "loss": 0.9071,
723
- "step": 1020
724
- },
725
- {
726
- "epoch": 0.25992050974701914,
727
- "grad_norm": 0.6537004709243774,
728
- "learning_rate": 4.6210277626859856e-05,
729
- "loss": 0.8187,
730
- "step": 1030
731
- },
732
- {
733
- "epoch": 0.2624440098416504,
734
- "grad_norm": 0.6265996694564819,
735
- "learning_rate": 4.609283818602884e-05,
736
- "loss": 0.8744,
737
- "step": 1040
738
- },
739
- {
740
- "epoch": 0.2649675099362816,
741
- "grad_norm": 0.7445203065872192,
742
- "learning_rate": 4.5973760749006963e-05,
743
- "loss": 0.8831,
744
- "step": 1050
745
- },
746
- {
747
- "epoch": 0.26749101003091286,
748
- "grad_norm": 0.7054116129875183,
749
- "learning_rate": 4.585305456293235e-05,
750
- "loss": 0.9171,
751
- "step": 1060
752
- },
753
- {
754
- "epoch": 0.2700145101255441,
755
- "grad_norm": 1.429075837135315,
756
- "learning_rate": 4.5730729001426083e-05,
757
- "loss": 0.8894,
758
- "step": 1070
759
- },
760
- {
761
- "epoch": 0.2725380102201754,
762
- "grad_norm": 0.6793610453605652,
763
- "learning_rate": 4.5606793563864316e-05,
764
- "loss": 0.8629,
765
- "step": 1080
766
- },
767
- {
768
- "epoch": 0.27506151031480663,
769
- "grad_norm": 0.6932589411735535,
770
- "learning_rate": 4.548125787464054e-05,
771
- "loss": 0.8564,
772
- "step": 1090
773
- },
774
- {
775
- "epoch": 0.2775850104094379,
776
- "grad_norm": 0.6600730419158936,
777
- "learning_rate": 4.535413168241821e-05,
778
- "loss": 0.8685,
779
- "step": 1100
780
- },
781
- {
782
- "epoch": 0.28010851050406915,
783
- "grad_norm": 0.6784124970436096,
784
- "learning_rate": 4.522542485937369e-05,
785
- "loss": 0.9024,
786
- "step": 1110
787
- },
788
- {
789
- "epoch": 0.2826320105987004,
790
- "grad_norm": 0.6841257214546204,
791
- "learning_rate": 4.509514740042962e-05,
792
- "loss": 0.8698,
793
- "step": 1120
794
- },
795
- {
796
- "epoch": 0.28515551069333167,
797
- "grad_norm": 0.7785212397575378,
798
- "learning_rate": 4.496330942247873e-05,
799
- "loss": 0.8731,
800
- "step": 1130
801
- },
802
- {
803
- "epoch": 0.28767901078796293,
804
- "grad_norm": 0.730110228061676,
805
- "learning_rate": 4.482992116359824e-05,
806
- "loss": 0.8542,
807
- "step": 1140
808
- },
809
- {
810
- "epoch": 0.29020251088259413,
811
- "grad_norm": 0.6644122004508972,
812
- "learning_rate": 4.469499298225473e-05,
813
- "loss": 0.8246,
814
- "step": 1150
815
- },
816
- {
817
- "epoch": 0.2927260109772254,
818
- "grad_norm": 0.7170603275299072,
819
- "learning_rate": 4.455853535649984e-05,
820
- "loss": 0.8576,
821
- "step": 1160
822
- },
823
- {
824
- "epoch": 0.29524951107185665,
825
- "grad_norm": 0.6883527040481567,
826
- "learning_rate": 4.442055888315646e-05,
827
- "loss": 0.8639,
828
- "step": 1170
829
- },
830
- {
831
- "epoch": 0.2977730111664879,
832
- "grad_norm": 0.6971318125724792,
833
- "learning_rate": 4.4281074276995936e-05,
834
- "loss": 0.8218,
835
- "step": 1180
836
- },
837
- {
838
- "epoch": 0.30029651126111917,
839
- "grad_norm": 0.7020850777626038,
840
- "learning_rate": 4.4140092369905914e-05,
841
- "loss": 0.8376,
842
- "step": 1190
843
- },
844
- {
845
- "epoch": 0.30282001135575043,
846
- "grad_norm": 0.6218104362487793,
847
- "learning_rate": 4.399762411004922e-05,
848
- "loss": 0.8741,
849
- "step": 1200
850
- },
851
- {
852
- "epoch": 0.3053435114503817,
853
- "grad_norm": 0.8031836152076721,
854
- "learning_rate": 4.3853680561013647e-05,
855
- "loss": 0.8977,
856
- "step": 1210
857
- },
858
- {
859
- "epoch": 0.30786701154501295,
860
- "grad_norm": 0.6999651789665222,
861
- "learning_rate": 4.370827290095277e-05,
862
- "loss": 0.8628,
863
- "step": 1220
864
- },
865
- {
866
- "epoch": 0.3103905116396442,
867
- "grad_norm": 0.6727817058563232,
868
- "learning_rate": 4.356141242171795e-05,
869
- "loss": 0.8674,
870
- "step": 1230
871
- },
872
- {
873
- "epoch": 0.31291401173427547,
874
- "grad_norm": 0.6965411305427551,
875
- "learning_rate": 4.3413110527981406e-05,
876
- "loss": 0.8416,
877
- "step": 1240
878
- },
879
- {
880
- "epoch": 0.31543751182890667,
881
- "grad_norm": 0.7655733823776245,
882
- "learning_rate": 4.3263378736350566e-05,
883
- "loss": 0.8662,
884
- "step": 1250
885
- },
886
- {
887
- "epoch": 0.31796101192353793,
888
- "grad_norm": 0.7115268111228943,
889
- "learning_rate": 4.311222867447375e-05,
890
- "loss": 0.9022,
891
- "step": 1260
892
- },
893
- {
894
- "epoch": 0.3204845120181692,
895
- "grad_norm": 0.7572771310806274,
896
- "learning_rate": 4.295967208013717e-05,
897
- "loss": 0.8649,
898
- "step": 1270
899
- },
900
- {
901
- "epoch": 0.32300801211280045,
902
- "grad_norm": 0.6894986629486084,
903
- "learning_rate": 4.280572080035348e-05,
904
- "loss": 0.8659,
905
- "step": 1280
906
- },
907
- {
908
- "epoch": 0.3255315122074317,
909
- "grad_norm": 0.6966748833656311,
910
- "learning_rate": 4.2650386790441696e-05,
911
- "loss": 0.8558,
912
- "step": 1290
913
- },
914
- {
915
- "epoch": 0.32805501230206296,
916
- "grad_norm": 0.7241553664207458,
917
- "learning_rate": 4.2493682113098855e-05,
918
- "loss": 0.8666,
919
- "step": 1300
920
- },
921
- {
922
- "epoch": 0.3305785123966942,
923
- "grad_norm": 0.6839144825935364,
924
- "learning_rate": 4.233561893746323e-05,
925
- "loss": 0.8879,
926
- "step": 1310
927
- },
928
- {
929
- "epoch": 0.3331020124913255,
930
- "grad_norm": 0.6955851912498474,
931
- "learning_rate": 4.217620953816935e-05,
932
- "loss": 0.8446,
933
- "step": 1320
934
- },
935
- {
936
- "epoch": 0.33562551258595674,
937
- "grad_norm": 0.6097539067268372,
938
- "learning_rate": 4.2015466294394756e-05,
939
- "loss": 0.8816,
940
- "step": 1330
941
- },
942
- {
943
- "epoch": 0.338149012680588,
944
- "grad_norm": 0.7663230299949646,
945
- "learning_rate": 4.185340168889868e-05,
946
- "loss": 0.8518,
947
- "step": 1340
948
- },
949
- {
950
- "epoch": 0.3406725127752192,
951
- "grad_norm": 0.6563027501106262,
952
- "learning_rate": 4.169002830705274e-05,
953
- "loss": 0.8516,
954
- "step": 1350
955
- },
956
- {
957
- "epoch": 0.34319601286985046,
958
- "grad_norm": 0.639011025428772,
959
- "learning_rate": 4.152535883586352e-05,
960
- "loss": 0.8324,
961
- "step": 1360
962
- },
963
- {
964
- "epoch": 0.3457195129644817,
965
- "grad_norm": 0.7072712779045105,
966
- "learning_rate": 4.135940606298738e-05,
967
- "loss": 0.8445,
968
- "step": 1370
969
- },
970
- {
971
- "epoch": 0.348243013059113,
972
- "grad_norm": 0.6532591581344604,
973
- "learning_rate": 4.119218287573743e-05,
974
- "loss": 0.8293,
975
- "step": 1380
976
- },
977
- {
978
- "epoch": 0.35076651315374424,
979
- "grad_norm": 0.6421136260032654,
980
- "learning_rate": 4.102370226008271e-05,
981
- "loss": 0.8809,
982
- "step": 1390
983
- },
984
- {
985
- "epoch": 0.3532900132483755,
986
- "grad_norm": 0.6466293931007385,
987
- "learning_rate": 4.085397729963976e-05,
988
- "loss": 0.8478,
989
- "step": 1400
990
- },
991
- {
992
- "epoch": 0.35581351334300676,
993
- "grad_norm": 0.7026222348213196,
994
- "learning_rate": 4.06830211746566e-05,
995
- "loss": 0.8855,
996
- "step": 1410
997
- },
998
- {
999
- "epoch": 0.358337013437638,
1000
- "grad_norm": 0.7792401313781738,
1001
- "learning_rate": 4.051084716098921e-05,
1002
- "loss": 0.8523,
1003
- "step": 1420
1004
- },
1005
- {
1006
- "epoch": 0.3608605135322693,
1007
- "grad_norm": 0.641736626625061,
1008
- "learning_rate": 4.0337468629070496e-05,
1009
- "loss": 0.8605,
1010
- "step": 1430
1011
- },
1012
- {
1013
- "epoch": 0.36338401362690054,
1014
- "grad_norm": 0.6911234855651855,
1015
- "learning_rate": 4.016289904287212e-05,
1016
- "loss": 0.8492,
1017
- "step": 1440
1018
- },
1019
- {
1020
- "epoch": 0.36590751372153174,
1021
- "grad_norm": 0.7274027466773987,
1022
- "learning_rate": 3.9987151958858794e-05,
1023
- "loss": 0.8623,
1024
- "step": 1450
1025
- },
1026
- {
1027
- "epoch": 0.368431013816163,
1028
- "grad_norm": 0.6672956347465515,
1029
- "learning_rate": 3.981024102493566e-05,
1030
- "loss": 0.8309,
1031
- "step": 1460
1032
- },
1033
- {
1034
- "epoch": 0.37095451391079426,
1035
- "grad_norm": 0.7280237078666687,
1036
- "learning_rate": 3.963217997938834e-05,
1037
- "loss": 0.8633,
1038
- "step": 1470
1039
- },
1040
- {
1041
- "epoch": 0.3734780140054255,
1042
- "grad_norm": 0.749769389629364,
1043
- "learning_rate": 3.945298264981614e-05,
1044
- "loss": 0.8433,
1045
- "step": 1480
1046
- },
1047
- {
1048
- "epoch": 0.3760015141000568,
1049
- "grad_norm": 0.7026387453079224,
1050
- "learning_rate": 3.927266295205818e-05,
1051
- "loss": 0.8665,
1052
- "step": 1490
1053
- },
1054
- {
1055
- "epoch": 0.37852501419468804,
1056
- "grad_norm": 0.6626182794570923,
1057
- "learning_rate": 3.9091234889112815e-05,
1058
- "loss": 0.8597,
1059
- "step": 1500
1060
- },
1061
- {
1062
- "epoch": 0.3810485142893193,
1063
- "grad_norm": 0.6502306461334229,
1064
- "learning_rate": 3.8908712550050154e-05,
1065
- "loss": 0.8652,
1066
- "step": 1510
1067
- },
1068
- {
1069
- "epoch": 0.38357201438395055,
1070
- "grad_norm": 0.6474471688270569,
1071
- "learning_rate": 3.8725110108917975e-05,
1072
- "loss": 0.8258,
1073
- "step": 1520
1074
- },
1075
- {
1076
- "epoch": 0.3860955144785818,
1077
- "grad_norm": 0.6739810109138489,
1078
- "learning_rate": 3.854044182364098e-05,
1079
- "loss": 0.8578,
1080
- "step": 1530
1081
- },
1082
- {
1083
- "epoch": 0.3886190145732131,
1084
- "grad_norm": 0.7030637264251709,
1085
- "learning_rate": 3.835472203491367e-05,
1086
- "loss": 0.8468,
1087
- "step": 1540
1088
- },
1089
- {
1090
- "epoch": 0.3911425146678443,
1091
- "grad_norm": 0.6305805444717407,
1092
- "learning_rate": 3.816796516508658e-05,
1093
- "loss": 0.8476,
1094
- "step": 1550
1095
- },
1096
- {
1097
- "epoch": 0.39366601476247554,
1098
- "grad_norm": 0.6209976077079773,
1099
- "learning_rate": 3.798018571704638e-05,
1100
- "loss": 0.8376,
1101
- "step": 1560
1102
- },
1103
- {
1104
- "epoch": 0.3961895148571068,
1105
- "grad_norm": 0.6698387265205383,
1106
- "learning_rate": 3.779139827308956e-05,
1107
- "loss": 0.8744,
1108
- "step": 1570
1109
- },
1110
- {
1111
- "epoch": 0.39871301495173805,
1112
- "grad_norm": 0.7300374507904053,
1113
- "learning_rate": 3.760161749379008e-05,
1114
- "loss": 0.8609,
1115
- "step": 1580
1116
- },
1117
- {
1118
- "epoch": 0.4012365150463693,
1119
- "grad_norm": 0.6837272047996521,
1120
- "learning_rate": 3.7410858116860836e-05,
1121
- "loss": 0.837,
1122
- "step": 1590
1123
- },
1124
- {
1125
- "epoch": 0.40376001514100057,
1126
- "grad_norm": 0.6649072170257568,
1127
- "learning_rate": 3.721913495600923e-05,
1128
- "loss": 0.8694,
1129
- "step": 1600
1130
- },
1131
- {
1132
- "epoch": 0.40628351523563183,
1133
- "grad_norm": 0.5960752367973328,
1134
- "learning_rate": 3.7026462899786726e-05,
1135
- "loss": 0.8129,
1136
- "step": 1610
1137
- },
1138
- {
1139
- "epoch": 0.4088070153302631,
1140
- "grad_norm": 0.6648868322372437,
1141
- "learning_rate": 3.683285691043272e-05,
1142
- "loss": 0.8634,
1143
- "step": 1620
1144
- },
1145
- {
1146
- "epoch": 0.41133051542489435,
1147
- "grad_norm": 0.7035058736801147,
1148
- "learning_rate": 3.663833202271257e-05,
1149
- "loss": 0.8685,
1150
- "step": 1630
1151
- },
1152
- {
1153
- "epoch": 0.4138540155195256,
1154
- "grad_norm": 0.6673656702041626,
1155
- "learning_rate": 3.6442903342750084e-05,
1156
- "loss": 0.8063,
1157
- "step": 1640
1158
- },
1159
- {
1160
- "epoch": 0.4163775156141568,
1161
- "grad_norm": 0.6990562081336975,
1162
- "learning_rate": 3.624658604685443e-05,
1163
- "loss": 0.8335,
1164
- "step": 1650
1165
- },
1166
- {
1167
- "epoch": 0.41890101570878807,
1168
- "grad_norm": 0.7190445065498352,
1169
- "learning_rate": 3.604939538034158e-05,
1170
- "loss": 0.8509,
1171
- "step": 1660
1172
- },
1173
- {
1174
- "epoch": 0.42142451580341933,
1175
- "grad_norm": 0.7450734376907349,
1176
- "learning_rate": 3.585134665635041e-05,
1177
- "loss": 0.8446,
1178
- "step": 1670
1179
- },
1180
- {
1181
- "epoch": 0.4239480158980506,
1182
- "grad_norm": 0.6475887298583984,
1183
- "learning_rate": 3.565245525465355e-05,
1184
- "loss": 0.8836,
1185
- "step": 1680
1186
- },
1187
- {
1188
- "epoch": 0.42647151599268185,
1189
- "grad_norm": 0.6419990658760071,
1190
- "learning_rate": 3.5452736620463064e-05,
1191
- "loss": 0.8428,
1192
- "step": 1690
1193
- },
1194
- {
1195
- "epoch": 0.4289950160873131,
1196
- "grad_norm": 0.7428763508796692,
1197
- "learning_rate": 3.525220626323097e-05,
1198
- "loss": 0.8247,
1199
- "step": 1700
1200
- },
1201
- {
1202
- "epoch": 0.43151851618194437,
1203
- "grad_norm": 0.6717978119850159,
1204
- "learning_rate": 3.5050879755444877e-05,
1205
- "loss": 0.881,
1206
- "step": 1710
1207
- },
1208
- {
1209
- "epoch": 0.4340420162765756,
1210
- "grad_norm": 0.6862205862998962,
1211
- "learning_rate": 3.484877273141866e-05,
1212
- "loss": 0.8511,
1213
- "step": 1720
1214
- },
1215
- {
1216
- "epoch": 0.4365655163712069,
1217
- "grad_norm": 0.6874988079071045,
1218
- "learning_rate": 3.464590088607839e-05,
1219
- "loss": 0.8649,
1220
- "step": 1730
1221
- },
1222
- {
1223
- "epoch": 0.43908901646583814,
1224
- "grad_norm": 0.6635965704917908,
1225
- "learning_rate": 3.444227997374345e-05,
1226
- "loss": 0.8719,
1227
- "step": 1740
1228
- },
1229
- {
1230
- "epoch": 0.44161251656046935,
1231
- "grad_norm": 0.7285788655281067,
1232
- "learning_rate": 3.4237925806903184e-05,
1233
- "loss": 0.8534,
1234
- "step": 1750
1235
- },
1236
- {
1237
- "epoch": 0.4441360166551006,
1238
- "grad_norm": 0.6177170872688293,
1239
- "learning_rate": 3.403285425498889e-05,
1240
- "loss": 0.8516,
1241
- "step": 1760
1242
- },
1243
- {
1244
- "epoch": 0.44665951674973187,
1245
- "grad_norm": 0.7633406519889832,
1246
- "learning_rate": 3.3827081243141534e-05,
1247
- "loss": 0.8193,
1248
- "step": 1770
1249
- },
1250
- {
1251
- "epoch": 0.4491830168443631,
1252
- "grad_norm": 0.6661052107810974,
1253
- "learning_rate": 3.362062275097496e-05,
1254
- "loss": 0.8745,
1255
- "step": 1780
1256
- },
1257
- {
1258
- "epoch": 0.4517065169389944,
1259
- "grad_norm": 0.7744668126106262,
1260
- "learning_rate": 3.341349481133507e-05,
1261
- "loss": 0.8158,
1262
- "step": 1790
1263
- },
1264
- {
1265
- "epoch": 0.45423001703362564,
1266
- "grad_norm": 0.6634140014648438,
1267
- "learning_rate": 3.320571350905466e-05,
1268
- "loss": 0.8574,
1269
- "step": 1800
1270
- },
1271
- {
1272
- "epoch": 0.4567535171282569,
1273
- "grad_norm": 0.7289906740188599,
1274
- "learning_rate": 3.299729497970444e-05,
1275
- "loss": 0.8776,
1276
- "step": 1810
1277
- },
1278
- {
1279
- "epoch": 0.45927701722288816,
1280
- "grad_norm": 0.6595107913017273,
1281
- "learning_rate": 3.278825540833995e-05,
1282
- "loss": 0.8416,
1283
- "step": 1820
1284
- },
1285
- {
1286
- "epoch": 0.4618005173175194,
1287
- "grad_norm": 0.6596432328224182,
1288
- "learning_rate": 3.2578611028244656e-05,
1289
- "loss": 0.8295,
1290
- "step": 1830
1291
- },
1292
- {
1293
- "epoch": 0.4643240174121507,
1294
- "grad_norm": 0.7007511258125305,
1295
- "learning_rate": 3.2368378119669363e-05,
1296
- "loss": 0.8075,
1297
- "step": 1840
1298
- },
1299
- {
1300
- "epoch": 0.4668475175067819,
1301
- "grad_norm": 0.5890100598335266,
1302
- "learning_rate": 3.215757300856796e-05,
1303
- "loss": 0.8331,
1304
- "step": 1850
1305
  },
1306
  {
1307
- "epoch": 0.46937101760141314,
1308
- "grad_norm": 0.670438826084137,
1309
- "learning_rate": 3.194621206532957e-05,
1310
- "loss": 0.8739,
1311
- "step": 1860
1312
  },
1313
  {
1314
- "epoch": 0.4718945176960444,
1315
- "grad_norm": 0.6237263083457947,
1316
- "learning_rate": 3.173431170350732e-05,
1317
- "loss": 0.8377,
1318
- "step": 1870
1319
  },
1320
  {
1321
- "epoch": 0.47441801779067566,
1322
- "grad_norm": 0.7160887122154236,
1323
- "learning_rate": 3.152188837854369e-05,
1324
- "loss": 0.8708,
1325
- "step": 1880
1326
  },
1327
  {
1328
- "epoch": 0.4769415178853069,
1329
- "grad_norm": 0.6525737643241882,
1330
- "learning_rate": 3.130895858649264e-05,
1331
- "loss": 0.8207,
1332
- "step": 1890
1333
  },
1334
  {
1335
- "epoch": 0.4794650179799382,
1336
- "grad_norm": 0.7249549627304077,
1337
- "learning_rate": 3.109553886273863e-05,
1338
- "loss": 0.8516,
1339
- "step": 1900
1340
  },
1341
  {
1342
- "epoch": 0.48198851807456944,
1343
- "grad_norm": 0.6668533682823181,
1344
- "learning_rate": 3.088164578071246e-05,
1345
- "loss": 0.8275,
1346
- "step": 1910
1347
  },
1348
  {
1349
- "epoch": 0.4845120181692007,
1350
- "grad_norm": 0.7262100577354431,
1351
- "learning_rate": 3.066729595060431e-05,
1352
- "loss": 0.8147,
1353
- "step": 1920
1354
  },
1355
  {
1356
- "epoch": 0.48703551826383196,
1357
- "grad_norm": 0.7166665196418762,
1358
- "learning_rate": 3.0452506018073833e-05,
1359
- "loss": 0.8514,
1360
- "step": 1930
1361
  },
1362
  {
1363
- "epoch": 0.4895590183584632,
1364
- "grad_norm": 0.6810010075569153,
1365
- "learning_rate": 3.0237292662957473e-05,
1366
- "loss": 0.8323,
1367
- "step": 1940
1368
  },
1369
  {
1370
- "epoch": 0.4920825184530944,
1371
- "grad_norm": 0.6473044157028198,
1372
- "learning_rate": 3.0021672597973207e-05,
1373
- "loss": 0.8265,
1374
- "step": 1950
1375
  },
1376
  {
1377
- "epoch": 0.4946060185477257,
1378
- "grad_norm": 0.6784878969192505,
1379
- "learning_rate": 2.9805662567422676e-05,
1380
- "loss": 0.8636,
1381
- "step": 1960
1382
  },
1383
  {
1384
- "epoch": 0.49712951864235694,
1385
- "grad_norm": 0.7378344535827637,
1386
- "learning_rate": 2.9589279345890895e-05,
1387
- "loss": 0.8483,
1388
- "step": 1970
1389
  },
1390
  {
1391
- "epoch": 0.4996530187369882,
1392
- "grad_norm": 0.5715174078941345,
1393
- "learning_rate": 2.9372539736943577e-05,
1394
- "loss": 0.8434,
1395
- "step": 1980
1396
  },
1397
  {
1398
- "epoch": 0.5021765188316195,
1399
- "grad_norm": 0.5842220783233643,
1400
- "learning_rate": 2.9155460571822245e-05,
1401
- "loss": 0.8305,
1402
- "step": 1990
1403
  },
1404
  {
1405
- "epoch": 0.5047000189262507,
1406
- "grad_norm": 0.7206842303276062,
1407
- "learning_rate": 2.893805870813717e-05,
1408
- "loss": 0.8127,
1409
- "step": 2000
1410
  },
1411
  {
1412
- "epoch": 0.5072235190208819,
1413
- "grad_norm": 0.6641551852226257,
1414
- "learning_rate": 2.872035102855826e-05,
1415
- "loss": 0.8272,
1416
- "step": 2010
1417
  },
1418
  {
1419
- "epoch": 0.5097470191155132,
1420
- "grad_norm": 0.6917135119438171,
1421
- "learning_rate": 2.850235443950402e-05,
1422
- "loss": 0.7998,
1423
- "step": 2020
1424
  },
1425
  {
1426
- "epoch": 0.5122705192101444,
1427
- "grad_norm": 0.6133066415786743,
1428
- "learning_rate": 2.8284085869828665e-05,
1429
- "loss": 0.8413,
1430
- "step": 2030
1431
  },
1432
  {
1433
- "epoch": 0.5147940193047758,
1434
- "grad_norm": 0.6827579140663147,
1435
- "learning_rate": 2.8065562269507463e-05,
1436
- "loss": 0.8452,
1437
- "step": 2040
1438
  },
1439
  {
1440
- "epoch": 0.517317519399407,
1441
- "grad_norm": 0.7090153694152832,
1442
- "learning_rate": 2.7846800608320485e-05,
1443
- "loss": 0.8293,
1444
- "step": 2050
1445
  },
1446
  {
1447
- "epoch": 0.5198410194940383,
1448
- "grad_norm": 0.6256769299507141,
1449
- "learning_rate": 2.7627817874534762e-05,
1450
- "loss": 0.8159,
1451
- "step": 2060
1452
  },
1453
  {
1454
- "epoch": 0.5223645195886695,
1455
- "grad_norm": 0.6957070231437683,
1456
- "learning_rate": 2.7408631073585068e-05,
1457
- "loss": 0.8023,
1458
- "step": 2070
1459
  },
1460
  {
1461
- "epoch": 0.5248880196833008,
1462
- "grad_norm": 0.6817536950111389,
1463
- "learning_rate": 2.7189257226753305e-05,
1464
- "loss": 0.8334,
1465
- "step": 2080
1466
  },
1467
  {
1468
- "epoch": 0.527411519777932,
1469
- "grad_norm": 0.6535147428512573,
1470
- "learning_rate": 2.696971336984672e-05,
1471
- "loss": 0.8558,
1472
- "step": 2090
1473
  },
1474
  {
1475
- "epoch": 0.5299350198725632,
1476
- "grad_norm": 0.7457418441772461,
1477
- "learning_rate": 2.6750016551874945e-05,
1478
- "loss": 0.8244,
1479
- "step": 2100
1480
  },
1481
  {
1482
- "epoch": 0.5324585199671945,
1483
- "grad_norm": 0.6570724248886108,
1484
- "learning_rate": 2.6530183833726025e-05,
1485
- "loss": 0.8283,
1486
- "step": 2110
1487
  },
1488
  {
1489
- "epoch": 0.5349820200618257,
1490
- "grad_norm": 0.7065024375915527,
1491
- "learning_rate": 2.6310232286841546e-05,
1492
- "loss": 0.8565,
1493
- "step": 2120
1494
  },
1495
  {
1496
- "epoch": 0.537505520156457,
1497
- "grad_norm": 0.671667218208313,
1498
- "learning_rate": 2.609017899189092e-05,
1499
- "loss": 0.8447,
1500
- "step": 2130
1501
  },
1502
  {
1503
- "epoch": 0.5400290202510882,
1504
- "grad_norm": 0.6672875285148621,
1505
- "learning_rate": 2.587004103744495e-05,
1506
- "loss": 0.7912,
1507
- "step": 2140
1508
  },
1509
  {
1510
- "epoch": 0.5425525203457195,
1511
- "grad_norm": 0.6282544732093811,
1512
- "learning_rate": 2.564983551864882e-05,
1513
- "loss": 0.8079,
1514
- "step": 2150
1515
  },
1516
  {
1517
- "epoch": 0.5450760204403508,
1518
- "grad_norm": 0.7435926795005798,
1519
- "learning_rate": 2.54295795358945e-05,
1520
- "loss": 0.8342,
1521
- "step": 2160
1522
  },
1523
  {
1524
- "epoch": 0.5475995205349821,
1525
- "grad_norm": 0.6785821318626404,
1526
- "learning_rate": 2.5209290193492834e-05,
1527
- "loss": 0.8281,
1528
- "step": 2170
1529
  },
1530
  {
1531
- "epoch": 0.5501230206296133,
1532
- "grad_norm": 0.6483226418495178,
1533
- "learning_rate": 2.4988984598345247e-05,
1534
- "loss": 0.79,
1535
- "step": 2180
1536
  },
1537
  {
1538
- "epoch": 0.5526465207242445,
1539
- "grad_norm": 0.6465590000152588,
1540
- "learning_rate": 2.4768679858615304e-05,
1541
- "loss": 0.841,
1542
- "step": 2190
1543
  },
1544
  {
1545
- "epoch": 0.5551700208188758,
1546
- "grad_norm": 0.7468442916870117,
1547
- "learning_rate": 2.454839308240014e-05,
1548
- "loss": 0.8717,
1549
- "step": 2200
1550
  },
1551
  {
1552
- "epoch": 0.557693520913507,
1553
- "grad_norm": 0.6535473465919495,
1554
- "learning_rate": 2.4328141376401903e-05,
1555
- "loss": 0.826,
1556
- "step": 2210
1557
  },
1558
  {
1559
- "epoch": 0.5602170210081383,
1560
- "grad_norm": 0.6404563188552856,
1561
- "learning_rate": 2.4107941844599312e-05,
1562
- "loss": 0.8062,
1563
- "step": 2220
1564
  },
1565
  {
1566
- "epoch": 0.5627405211027695,
1567
- "grad_norm": 0.6602795720100403,
1568
- "learning_rate": 2.3887811586919424e-05,
1569
- "loss": 0.8418,
1570
- "step": 2230
1571
  },
1572
  {
1573
- "epoch": 0.5652640211974008,
1574
- "grad_norm": 0.6988357305526733,
1575
- "learning_rate": 2.3667767697909694e-05,
1576
- "loss": 0.8177,
1577
- "step": 2240
1578
  },
1579
  {
1580
- "epoch": 0.567787521292032,
1581
- "grad_norm": 0.6755298376083374,
1582
- "learning_rate": 2.3447827265410517e-05,
1583
- "loss": 0.8653,
1584
- "step": 2250
1585
  },
1586
  {
1587
- "epoch": 0.5703110213866633,
1588
- "grad_norm": 0.72756028175354,
1589
- "learning_rate": 2.3228007369228178e-05,
1590
- "loss": 0.8896,
1591
- "step": 2260
1592
  },
1593
  {
1594
- "epoch": 0.5728345214812945,
1595
- "grad_norm": 0.6584864854812622,
1596
- "learning_rate": 2.3008325079808576e-05,
1597
- "loss": 0.8393,
1598
- "step": 2270
1599
  },
1600
  {
1601
- "epoch": 0.5753580215759259,
1602
- "grad_norm": 0.6699262857437134,
1603
- "learning_rate": 2.2788797456911503e-05,
1604
- "loss": 0.7976,
1605
- "step": 2280
1606
  },
1607
  {
1608
- "epoch": 0.5778815216705571,
1609
- "grad_norm": 0.7463390827178955,
1610
- "learning_rate": 2.2569441548285934e-05,
1611
- "loss": 0.8321,
1612
- "step": 2290
1613
  },
1614
  {
1615
- "epoch": 0.5804050217651883,
1616
- "grad_norm": 0.542870283126831,
1617
- "learning_rate": 2.2350274388346064e-05,
1618
- "loss": 0.786,
1619
  "step": 2300
1620
  },
1621
  {
1622
- "epoch": 0.5829285218598196,
1623
- "grad_norm": 0.652056872844696,
1624
- "learning_rate": 2.213131299684858e-05,
1625
- "loss": 0.848,
1626
- "step": 2310
1627
- },
1628
- {
1629
- "epoch": 0.5854520219544508,
1630
- "grad_norm": 0.7307469248771667,
1631
- "learning_rate": 2.191257437757086e-05,
1632
- "loss": 0.8117,
1633
- "step": 2320
1634
- },
1635
- {
1636
- "epoch": 0.5879755220490821,
1637
- "grad_norm": 0.6336262822151184,
1638
- "learning_rate": 2.16940755169906e-05,
1639
- "loss": 0.8417,
1640
- "step": 2330
1641
- },
1642
- {
1643
- "epoch": 0.5904990221437133,
1644
- "grad_norm": 0.7636166214942932,
1645
- "learning_rate": 2.1475833382966647e-05,
1646
- "loss": 0.8786,
1647
- "step": 2340
1648
- },
1649
- {
1650
- "epoch": 0.5930225222383446,
1651
- "grad_norm": 0.6622100472450256,
1652
- "learning_rate": 2.1257864923421404e-05,
1653
- "loss": 0.8629,
1654
  "step": 2350
1655
  },
1656
  {
1657
- "epoch": 0.5955460223329758,
1658
- "grad_norm": 0.602483332157135,
1659
- "learning_rate": 2.1040187065024605e-05,
1660
- "loss": 0.7786,
1661
- "step": 2360
1662
- },
1663
- {
1664
- "epoch": 0.5980695224276071,
1665
- "grad_norm": 0.6503065824508667,
1666
- "learning_rate": 2.0822816711878978e-05,
1667
- "loss": 0.8445,
1668
- "step": 2370
1669
- },
1670
- {
1671
- "epoch": 0.6005930225222383,
1672
- "grad_norm": 0.6901794672012329,
1673
- "learning_rate": 2.0605770744207413e-05,
1674
- "loss": 0.8259,
1675
- "step": 2380
1676
- },
1677
- {
1678
- "epoch": 0.6031165226168695,
1679
- "grad_norm": 0.7173271179199219,
1680
- "learning_rate": 2.0389066017042192e-05,
1681
- "loss": 0.802,
1682
- "step": 2390
1683
- },
1684
- {
1685
- "epoch": 0.6056400227115009,
1686
- "grad_norm": 0.7431663870811462,
1687
- "learning_rate": 2.0172719358916042e-05,
1688
- "loss": 0.8092,
1689
  "step": 2400
1690
  },
1691
  {
1692
- "epoch": 0.6081635228061321,
1693
- "grad_norm": 0.7227687239646912,
1694
- "learning_rate": 1.9956747570555288e-05,
1695
- "loss": 0.8563,
1696
- "step": 2410
1697
- },
1698
- {
1699
- "epoch": 0.6106870229007634,
1700
- "grad_norm": 0.6300061345100403,
1701
- "learning_rate": 1.9741167423575186e-05,
1702
- "loss": 0.7849,
1703
- "step": 2420
1704
- },
1705
- {
1706
- "epoch": 0.6132105229953946,
1707
- "grad_norm": 0.6208367347717285,
1708
- "learning_rate": 1.9525995659177484e-05,
1709
- "loss": 0.8239,
1710
- "step": 2430
1711
- },
1712
- {
1713
- "epoch": 0.6157340230900259,
1714
- "grad_norm": 0.6272019147872925,
1715
- "learning_rate": 1.9311248986850365e-05,
1716
- "loss": 0.8102,
1717
- "step": 2440
1718
- },
1719
- {
1720
- "epoch": 0.6182575231846571,
1721
- "grad_norm": 0.6594968438148499,
1722
- "learning_rate": 1.9096944083070866e-05,
1723
- "loss": 0.8266,
1724
  "step": 2450
1725
  },
1726
  {
1727
- "epoch": 0.6207810232792884,
1728
- "grad_norm": 0.673553466796875,
1729
- "learning_rate": 1.8883097590009775e-05,
1730
- "loss": 0.8375,
1731
- "step": 2460
1732
- },
1733
- {
1734
- "epoch": 0.6233045233739196,
1735
- "grad_norm": 0.7199084162712097,
1736
- "learning_rate": 1.866972611423936e-05,
1737
- "loss": 0.8188,
1738
- "step": 2470
1739
- },
1740
- {
1741
- "epoch": 0.6258280234685509,
1742
- "grad_norm": 0.697413444519043,
1743
- "learning_rate": 1.8456846225443648e-05,
1744
- "loss": 0.7709,
1745
- "step": 2480
1746
- },
1747
- {
1748
- "epoch": 0.6283515235631821,
1749
- "grad_norm": 0.6711037158966064,
1750
- "learning_rate": 1.8244474455131792e-05,
1751
- "loss": 0.8156,
1752
- "step": 2490
1753
- },
1754
- {
1755
- "epoch": 0.6308750236578133,
1756
- "grad_norm": 0.7030087113380432,
1757
- "learning_rate": 1.8032627295354183e-05,
1758
- "loss": 0.8125,
1759
  "step": 2500
1760
  },
1761
  {
1762
- "epoch": 0.6333985237524447,
1763
- "grad_norm": 0.7960418462753296,
1764
- "learning_rate": 1.7821321197421837e-05,
1765
- "loss": 0.8604,
1766
- "step": 2510
1767
- },
1768
- {
1769
- "epoch": 0.6359220238470759,
1770
- "grad_norm": 0.6948102116584778,
1771
- "learning_rate": 1.761057257062876e-05,
1772
- "loss": 0.8301,
1773
- "step": 2520
1774
- },
1775
- {
1776
- "epoch": 0.6384455239417072,
1777
- "grad_norm": 0.5919877290725708,
1778
- "learning_rate": 1.740039778097772e-05,
1779
- "loss": 0.7821,
1780
- "step": 2530
1781
- },
1782
- {
1783
- "epoch": 0.6409690240363384,
1784
- "grad_norm": 0.6569110751152039,
1785
- "learning_rate": 1.7190813149909274e-05,
1786
- "loss": 0.8213,
1787
- "step": 2540
1788
- },
1789
- {
1790
- "epoch": 0.6434925241309697,
1791
- "grad_norm": 0.677099347114563,
1792
- "learning_rate": 1.6981834953034344e-05,
1793
- "loss": 0.8278,
1794
  "step": 2550
1795
  },
1796
  {
1797
- "epoch": 0.6460160242256009,
1798
- "grad_norm": 0.7233052253723145,
1799
- "learning_rate": 1.677347941887028e-05,
1800
- "loss": 0.7919,
1801
- "step": 2560
1802
- },
1803
- {
1804
- "epoch": 0.6485395243202322,
1805
- "grad_norm": 0.7088631987571716,
1806
- "learning_rate": 1.656576272758061e-05,
1807
- "loss": 0.8444,
1808
- "step": 2570
1809
- },
1810
- {
1811
- "epoch": 0.6510630244148634,
1812
- "grad_norm": 0.6909515857696533,
1813
- "learning_rate": 1.6358701009718577e-05,
1814
- "loss": 0.8222,
1815
- "step": 2580
1816
- },
1817
- {
1818
- "epoch": 0.6535865245094946,
1819
- "grad_norm": 0.5979318618774414,
1820
- "learning_rate": 1.615231034497444e-05,
1821
- "loss": 0.8376,
1822
- "step": 2590
1823
- },
1824
- {
1825
- "epoch": 0.6561100246041259,
1826
- "grad_norm": 0.7273426055908203,
1827
- "learning_rate": 1.5946606760926865e-05,
1828
- "loss": 0.8037,
1829
  "step": 2600
1830
  },
1831
  {
1832
- "epoch": 0.6586335246987571,
1833
- "grad_norm": 0.719450056552887,
1834
- "learning_rate": 1.574160623179816e-05,
1835
- "loss": 0.8268,
1836
- "step": 2610
1837
- },
1838
- {
1839
- "epoch": 0.6611570247933884,
1840
- "grad_norm": 0.7163055539131165,
1841
- "learning_rate": 1.553732467721392e-05,
1842
- "loss": 0.7853,
1843
- "step": 2620
1844
- },
1845
- {
1846
- "epoch": 0.6636805248880197,
1847
- "grad_norm": 0.6172025799751282,
1848
- "learning_rate": 1.5333777960966616e-05,
1849
- "loss": 0.7926,
1850
- "step": 2630
1851
- },
1852
- {
1853
- "epoch": 0.666204024982651,
1854
- "grad_norm": 0.6272744536399841,
1855
- "learning_rate": 1.5130981889783795e-05,
1856
- "loss": 0.7982,
1857
- "step": 2640
1858
- },
1859
- {
1860
- "epoch": 0.6687275250772822,
1861
- "grad_norm": 0.680596649646759,
1862
- "learning_rate": 1.4928952212100483e-05,
1863
- "loss": 0.8312,
1864
  "step": 2650
1865
  },
1866
  {
1867
- "epoch": 0.6712510251719135,
1868
- "grad_norm": 0.6080834865570068,
1869
- "learning_rate": 1.4727704616836296e-05,
1870
- "loss": 0.8273,
1871
- "step": 2660
1872
- },
1873
- {
1874
- "epoch": 0.6737745252665447,
1875
- "grad_norm": 0.6613759398460388,
1876
- "learning_rate": 1.4527254732177043e-05,
1877
- "loss": 0.8141,
1878
- "step": 2670
1879
- },
1880
- {
1881
- "epoch": 0.676298025361176,
1882
- "grad_norm": 0.6180728077888489,
1883
- "learning_rate": 1.4327618124361114e-05,
1884
- "loss": 0.8231,
1885
- "step": 2680
1886
- },
1887
- {
1888
- "epoch": 0.6788215254558072,
1889
- "grad_norm": 0.6406080722808838,
1890
- "learning_rate": 1.412881029647065e-05,
1891
- "loss": 0.7876,
1892
- "step": 2690
1893
- },
1894
- {
1895
- "epoch": 0.6813450255504384,
1896
- "grad_norm": 0.6109746098518372,
1897
- "learning_rate": 1.3930846687227664e-05,
1898
- "loss": 0.7957,
1899
  "step": 2700
1900
  },
1901
  {
1902
- "epoch": 0.6838685256450697,
1903
- "grad_norm": 0.6827517747879028,
1904
- "learning_rate": 1.3733742669795049e-05,
1905
- "loss": 0.8428,
1906
- "step": 2710
1907
- },
1908
- {
1909
- "epoch": 0.6863920257397009,
1910
- "grad_norm": 0.7277110815048218,
1911
- "learning_rate": 1.3537513550582853e-05,
1912
- "loss": 0.8326,
1913
- "step": 2720
1914
- },
1915
- {
1916
- "epoch": 0.6889155258343322,
1917
- "grad_norm": 0.597568154335022,
1918
- "learning_rate": 1.3342174568059527e-05,
1919
- "loss": 0.7998,
1920
- "step": 2730
1921
- },
1922
- {
1923
- "epoch": 0.6914390259289634,
1924
- "grad_norm": 0.6378962993621826,
1925
- "learning_rate": 1.3147740891568661e-05,
1926
- "loss": 0.785,
1927
- "step": 2740
1928
- },
1929
- {
1930
- "epoch": 0.6939625260235948,
1931
- "grad_norm": 0.6579405069351196,
1932
- "learning_rate": 1.2954227620150904e-05,
1933
- "loss": 0.8332,
1934
  "step": 2750
1935
  },
1936
  {
1937
- "epoch": 0.696486026118226,
1938
- "grad_norm": 0.6977427005767822,
1939
- "learning_rate": 1.2761649781371479e-05,
1940
- "loss": 0.8095,
1941
- "step": 2760
1942
- },
1943
- {
1944
- "epoch": 0.6990095262128573,
1945
- "grad_norm": 0.6410185098648071,
1946
- "learning_rate": 1.257002233015318e-05,
1947
- "loss": 0.8341,
1948
- "step": 2770
1949
- },
1950
- {
1951
- "epoch": 0.7015330263074885,
1952
- "grad_norm": 0.6869609355926514,
1953
- "learning_rate": 1.2379360147614994e-05,
1954
- "loss": 0.8023,
1955
- "step": 2780
1956
- },
1957
- {
1958
- "epoch": 0.7040565264021197,
1959
- "grad_norm": 0.6658973097801208,
1960
- "learning_rate": 1.2189678039916532e-05,
1961
- "loss": 0.7755,
1962
- "step": 2790
1963
- },
1964
- {
1965
- "epoch": 0.706580026496751,
1966
- "grad_norm": 0.6188139915466309,
1967
- "learning_rate": 1.2000990737108225e-05,
1968
- "loss": 0.796,
1969
  "step": 2800
1970
  },
1971
  {
1972
- "epoch": 0.7091035265913822,
1973
- "grad_norm": 0.7432144284248352,
1974
- "learning_rate": 1.1813312891987392e-05,
1975
- "loss": 0.8381,
1976
- "step": 2810
1977
- },
1978
- {
1979
- "epoch": 0.7116270266860135,
1980
- "grad_norm": 0.6776263117790222,
1981
- "learning_rate": 1.1626659078960424e-05,
1982
- "loss": 0.8087,
1983
- "step": 2820
1984
- },
1985
- {
1986
- "epoch": 0.7141505267806447,
1987
- "grad_norm": 0.6468738913536072,
1988
- "learning_rate": 1.1441043792910936e-05,
1989
- "loss": 0.8032,
1990
- "step": 2830
1991
- },
1992
- {
1993
- "epoch": 0.716674026875276,
1994
- "grad_norm": 0.7177358865737915,
1995
- "learning_rate": 1.1256481448074179e-05,
1996
- "loss": 0.8039,
1997
- "step": 2840
1998
- },
1999
- {
2000
- "epoch": 0.7191975269699072,
2001
- "grad_norm": 0.6401441693305969,
2002
- "learning_rate": 1.1072986376917638e-05,
2003
- "loss": 0.8135,
2004
  "step": 2850
2005
  },
2006
  {
2007
- "epoch": 0.7217210270645386,
2008
- "grad_norm": 0.6511224508285522,
2009
- "learning_rate": 1.0890572829028087e-05,
2010
- "loss": 0.8496,
2011
- "step": 2860
2012
- },
2013
- {
2014
- "epoch": 0.7242445271591698,
2015
- "grad_norm": 0.632625162601471,
2016
- "learning_rate": 1.0709254970004937e-05,
2017
- "loss": 0.7964,
2018
- "step": 2870
2019
- },
2020
- {
2021
- "epoch": 0.7267680272538011,
2022
- "grad_norm": 0.5535660982131958,
2023
- "learning_rate": 1.0529046880360263e-05,
2024
- "loss": 0.7932,
2025
- "step": 2880
2026
- },
2027
- {
2028
- "epoch": 0.7292915273484323,
2029
- "grad_norm": 0.5996463298797607,
2030
- "learning_rate": 1.034996255442529e-05,
2031
- "loss": 0.8437,
2032
- "step": 2890
2033
- },
2034
- {
2035
- "epoch": 0.7318150274430635,
2036
- "grad_norm": 0.6257640719413757,
2037
- "learning_rate": 1.0172015899263712e-05,
2038
- "loss": 0.8069,
2039
  "step": 2900
2040
  },
2041
  {
2042
- "epoch": 0.7343385275376948,
2043
- "grad_norm": 0.6533858776092529,
2044
- "learning_rate": 9.995220733591639e-06,
2045
- "loss": 0.7921,
2046
- "step": 2910
2047
- },
2048
- {
2049
- "epoch": 0.736862027632326,
2050
- "grad_norm": 0.6002010107040405,
2051
- "learning_rate": 9.819590786704572e-06,
2052
- "loss": 0.8307,
2053
- "step": 2920
2054
- },
2055
- {
2056
- "epoch": 0.7393855277269573,
2057
- "grad_norm": 0.6418666243553162,
2058
- "learning_rate": 9.645139697411149e-06,
2059
- "loss": 0.8036,
2060
- "step": 2930
2061
- },
2062
- {
2063
- "epoch": 0.7419090278215885,
2064
- "grad_norm": 0.6554102897644043,
2065
- "learning_rate": 9.471881012974071e-06,
2066
- "loss": 0.8285,
2067
- "step": 2940
2068
- },
2069
- {
2070
- "epoch": 0.7444325279162198,
2071
- "grad_norm": 0.6879960894584656,
2072
- "learning_rate": 9.299828188058013e-06,
2073
- "loss": 0.8154,
2074
  "step": 2950
2075
  },
2076
  {
2077
- "epoch": 0.746956028010851,
2078
- "grad_norm": 0.6418633460998535,
2079
- "learning_rate": 9.128994583684838e-06,
2080
- "loss": 0.7945,
2081
- "step": 2960
2082
- },
2083
- {
2084
- "epoch": 0.7494795281054824,
2085
- "grad_norm": 0.6467211246490479,
2086
- "learning_rate": 8.959393466195972e-06,
2087
- "loss": 0.8464,
2088
- "step": 2970
2089
- },
2090
- {
2091
- "epoch": 0.7520030282001136,
2092
- "grad_norm": 0.6477042436599731,
2093
- "learning_rate": 8.791038006222233e-06,
2094
- "loss": 0.8235,
2095
- "step": 2980
2096
- },
2097
- {
2098
- "epoch": 0.7545265282947448,
2099
- "grad_norm": 0.6426742672920227,
2100
- "learning_rate": 8.623941277660994e-06,
2101
- "loss": 0.8001,
2102
- "step": 2990
2103
- },
2104
- {
2105
- "epoch": 0.7570500283893761,
2106
- "grad_norm": 0.7026243805885315,
2107
- "learning_rate": 8.458116256660981e-06,
2108
- "loss": 0.842,
2109
  "step": 3000
2110
  },
2111
  {
2112
- "epoch": 0.7595735284840073,
2113
- "grad_norm": 0.6429437398910522,
2114
- "learning_rate": 8.293575820614508e-06,
2115
- "loss": 0.8143,
2116
- "step": 3010
2117
- },
2118
- {
2119
- "epoch": 0.7620970285786386,
2120
- "grad_norm": 0.654498815536499,
2121
- "learning_rate": 8.130332747157542e-06,
2122
- "loss": 0.7697,
2123
- "step": 3020
2124
- },
2125
- {
2126
- "epoch": 0.7646205286732698,
2127
- "grad_norm": 0.8270076513290405,
2128
- "learning_rate": 7.968399713177366e-06,
2129
- "loss": 0.825,
2130
- "step": 3030
2131
- },
2132
- {
2133
- "epoch": 0.7671440287679011,
2134
- "grad_norm": 0.6423079967498779,
2135
- "learning_rate": 7.807789293828204e-06,
2136
- "loss": 0.8366,
2137
- "step": 3040
2138
- },
2139
- {
2140
- "epoch": 0.7696675288625323,
2141
- "grad_norm": 0.662451446056366,
2142
- "learning_rate": 7.648513961554607e-06,
2143
- "loss": 0.7695,
2144
  "step": 3050
2145
  },
2146
  {
2147
- "epoch": 0.7721910289571636,
2148
- "grad_norm": 0.5953843593597412,
2149
- "learning_rate": 7.4905860851229605e-06,
2150
- "loss": 0.8296,
2151
- "step": 3060
2152
- },
2153
- {
2154
- "epoch": 0.7747145290517948,
2155
- "grad_norm": 0.7210749387741089,
2156
- "learning_rate": 7.334017928660902e-06,
2157
- "loss": 0.8201,
2158
- "step": 3070
2159
- },
2160
- {
2161
- "epoch": 0.7772380291464261,
2162
- "grad_norm": 0.6214151382446289,
2163
- "learning_rate": 7.1788216507049865e-06,
2164
- "loss": 0.8034,
2165
- "step": 3080
2166
- },
2167
- {
2168
- "epoch": 0.7797615292410573,
2169
- "grad_norm": 0.6791695356369019,
2170
- "learning_rate": 7.0250093032564494e-06,
2171
- "loss": 0.7624,
2172
- "step": 3090
2173
- },
2174
- {
2175
- "epoch": 0.7822850293356886,
2176
- "grad_norm": 0.6388612985610962,
2177
- "learning_rate": 6.872592830845339e-06,
2178
- "loss": 0.8004,
2179
  "step": 3100
2180
  },
2181
  {
2182
- "epoch": 0.7848085294303199,
2183
- "grad_norm": 0.5958021283149719,
2184
- "learning_rate": 6.72158406960289e-06,
2185
- "loss": 0.8275,
2186
- "step": 3110
2187
- },
2188
- {
2189
- "epoch": 0.7873320295249511,
2190
- "grad_norm": 0.572040855884552,
2191
- "learning_rate": 6.571994746342439e-06,
2192
- "loss": 0.8078,
2193
- "step": 3120
2194
- },
2195
- {
2196
- "epoch": 0.7898555296195824,
2197
- "grad_norm": 0.6328415274620056,
2198
- "learning_rate": 6.4238364776486785e-06,
2199
- "loss": 0.7883,
2200
- "step": 3130
2201
- },
2202
- {
2203
- "epoch": 0.7923790297142136,
2204
- "grad_norm": 0.6552072763442993,
2205
- "learning_rate": 6.277120768975644e-06,
2206
- "loss": 0.8398,
2207
- "step": 3140
2208
- },
2209
- {
2210
- "epoch": 0.7949025298088449,
2211
- "grad_norm": 0.7182049751281738,
2212
- "learning_rate": 6.131859013753155e-06,
2213
- "loss": 0.7919,
2214
  "step": 3150
2215
  },
2216
  {
2217
- "epoch": 0.7974260299034761,
2218
- "grad_norm": 0.7126038074493408,
2219
- "learning_rate": 5.988062492502117e-06,
2220
- "loss": 0.7782,
2221
- "step": 3160
2222
- },
2223
- {
2224
- "epoch": 0.7999495299981074,
2225
- "grad_norm": 0.6005820631980896,
2226
- "learning_rate": 5.8457423719584435e-06,
2227
- "loss": 0.7979,
2228
- "step": 3170
2229
- },
2230
- {
2231
- "epoch": 0.8024730300927386,
2232
- "grad_norm": 0.6624283790588379,
2233
- "learning_rate": 5.704909704205949e-06,
2234
- "loss": 0.8297,
2235
- "step": 3180
2236
- },
2237
- {
2238
- "epoch": 0.8049965301873698,
2239
- "grad_norm": 0.6289507150650024,
2240
- "learning_rate": 5.565575425818054e-06,
2241
- "loss": 0.8147,
2242
- "step": 3190
2243
- },
2244
- {
2245
- "epoch": 0.8075200302820011,
2246
- "grad_norm": 0.6975149512290955,
2247
- "learning_rate": 5.427750357008468e-06,
2248
- "loss": 0.7733,
2249
  "step": 3200
2250
  },
2251
  {
2252
- "epoch": 0.8100435303766323,
2253
- "grad_norm": 0.6802620887756348,
2254
- "learning_rate": 5.291445200790982e-06,
2255
- "loss": 0.8226,
2256
- "step": 3210
2257
- },
2258
- {
2259
- "epoch": 0.8125670304712637,
2260
- "grad_norm": 0.6158818602561951,
2261
- "learning_rate": 5.156670542148267e-06,
2262
- "loss": 0.8282,
2263
- "step": 3220
2264
- },
2265
- {
2266
- "epoch": 0.8150905305658949,
2267
- "grad_norm": 0.7228125333786011,
2268
- "learning_rate": 5.023436847209887e-06,
2269
- "loss": 0.816,
2270
- "step": 3230
2271
- },
2272
- {
2273
- "epoch": 0.8176140306605262,
2274
- "grad_norm": 0.6515725255012512,
2275
- "learning_rate": 4.891754462439557e-06,
2276
- "loss": 0.775,
2277
- "step": 3240
2278
- },
2279
- {
2280
- "epoch": 0.8201375307551574,
2281
- "grad_norm": 0.6829689741134644,
2282
- "learning_rate": 4.761633613831645e-06,
2283
- "loss": 0.8156,
2284
  "step": 3250
2285
  },
2286
  {
2287
- "epoch": 0.8226610308497887,
2288
- "grad_norm": 0.7261675596237183,
2289
- "learning_rate": 4.6330844061170914e-06,
2290
- "loss": 0.7862,
2291
- "step": 3260
2292
- },
2293
- {
2294
- "epoch": 0.8251845309444199,
2295
- "grad_norm": 0.6911167502403259,
2296
- "learning_rate": 4.506116821978662e-06,
2297
- "loss": 0.8016,
2298
- "step": 3270
2299
- },
2300
- {
2301
- "epoch": 0.8277080310390512,
2302
- "grad_norm": 0.5780116319656372,
2303
- "learning_rate": 4.380740721275786e-06,
2304
- "loss": 0.824,
2305
- "step": 3280
2306
- },
2307
- {
2308
- "epoch": 0.8302315311336824,
2309
- "grad_norm": 0.6704926490783691,
2310
- "learning_rate": 4.25696584027882e-06,
2311
- "loss": 0.8037,
2312
- "step": 3290
2313
- },
2314
- {
2315
- "epoch": 0.8327550312283136,
2316
- "grad_norm": 0.7162071466445923,
2317
- "learning_rate": 4.134801790913006e-06,
2318
- "loss": 0.7651,
2319
  "step": 3300
2320
  },
2321
  {
2322
- "epoch": 0.8352785313229449,
2323
- "grad_norm": 0.7350740432739258,
2324
- "learning_rate": 4.014258060012005e-06,
2325
- "loss": 0.8278,
2326
- "step": 3310
2327
- },
2328
- {
2329
- "epoch": 0.8378020314175761,
2330
- "grad_norm": 0.6031658053398132,
2331
- "learning_rate": 3.895344008581222e-06,
2332
- "loss": 0.7945,
2333
- "step": 3320
2334
- },
2335
- {
2336
- "epoch": 0.8403255315122075,
2337
- "grad_norm": 0.6996452212333679,
2338
- "learning_rate": 3.7780688710708223e-06,
2339
- "loss": 0.7821,
2340
- "step": 3330
2341
- },
2342
- {
2343
- "epoch": 0.8428490316068387,
2344
- "grad_norm": 0.6655017733573914,
2345
- "learning_rate": 3.6624417546586574e-06,
2346
- "loss": 0.7526,
2347
- "step": 3340
2348
- },
2349
- {
2350
- "epoch": 0.84537253170147,
2351
- "grad_norm": 0.7387165427207947,
2352
- "learning_rate": 3.548471638542991e-06,
2353
- "loss": 0.8259,
2354
  "step": 3350
2355
  },
2356
  {
2357
- "epoch": 0.8478960317961012,
2358
- "grad_norm": 0.6410266757011414,
2359
- "learning_rate": 3.436167373245247e-06,
2360
- "loss": 0.8054,
2361
- "step": 3360
2362
- },
2363
- {
2364
- "epoch": 0.8504195318907325,
2365
- "grad_norm": 0.6522373557090759,
2366
- "learning_rate": 3.325537679922672e-06,
2367
- "loss": 0.8168,
2368
- "step": 3370
2369
- },
2370
- {
2371
- "epoch": 0.8529430319853637,
2372
- "grad_norm": 0.7458412647247314,
2373
- "learning_rate": 3.2165911496911173e-06,
2374
- "loss": 0.7892,
2375
- "step": 3380
2376
- },
2377
- {
2378
- "epoch": 0.8554665320799949,
2379
- "grad_norm": 0.6441506743431091,
2380
- "learning_rate": 3.1093362429578414e-06,
2381
- "loss": 0.8105,
2382
- "step": 3390
2383
- },
2384
- {
2385
- "epoch": 0.8579900321746262,
2386
- "grad_norm": 0.5970674753189087,
2387
- "learning_rate": 3.0037812887645483e-06,
2388
- "loss": 0.8326,
2389
  "step": 3400
2390
  },
2391
  {
2392
- "epoch": 0.8605135322692574,
2393
- "grad_norm": 0.6173757314682007,
2394
- "learning_rate": 2.8999344841405373e-06,
2395
- "loss": 0.7956,
2396
- "step": 3410
2397
  },
2398
  {
2399
- "epoch": 0.8630370323638887,
2400
- "grad_norm": 0.6268020868301392,
2401
- "learning_rate": 2.7978038934662024e-06,
2402
- "loss": 0.7859,
2403
- "step": 3420
2404
  },
2405
  {
2406
- "epoch": 0.8655605324585199,
2407
- "grad_norm": 0.6534834504127502,
2408
- "learning_rate": 2.697397447846725e-06,
2409
- "loss": 0.8041,
2410
- "step": 3430
2411
  },
2412
  {
2413
- "epoch": 0.8680840325531513,
2414
- "grad_norm": 0.6108519434928894,
2415
- "learning_rate": 2.5987229444962237e-06,
2416
- "loss": 0.823,
2417
- "step": 3440
2418
  },
2419
  {
2420
- "epoch": 0.8706075326477825,
2421
- "grad_norm": 0.6347935795783997,
2422
- "learning_rate": 2.501788046132203e-06,
2423
- "loss": 0.831,
2424
- "step": 3450
2425
  },
2426
  {
2427
- "epoch": 0.8731310327424138,
2428
- "grad_norm": 0.6183903813362122,
2429
- "learning_rate": 2.4066002803805386e-06,
2430
- "loss": 0.7974,
2431
- "step": 3460
2432
  },
2433
  {
2434
- "epoch": 0.875654532837045,
2435
- "grad_norm": 0.6723082065582275,
2436
- "learning_rate": 2.313167039190861e-06,
2437
- "loss": 0.8058,
2438
- "step": 3470
2439
  },
2440
  {
2441
- "epoch": 0.8781780329316763,
2442
- "grad_norm": 0.6427431702613831,
2443
- "learning_rate": 2.2214955782625752e-06,
2444
- "loss": 0.805,
2445
- "step": 3480
2446
  },
2447
  {
2448
- "epoch": 0.8807015330263075,
2449
- "grad_norm": 0.7344009280204773,
2450
- "learning_rate": 2.1315930164813507e-06,
2451
- "loss": 0.8366,
2452
- "step": 3490
2453
  },
2454
  {
2455
- "epoch": 0.8832250331209387,
2456
- "grad_norm": 0.6524431109428406,
2457
- "learning_rate": 2.0434663353663536e-06,
2458
- "loss": 0.8022,
2459
- "step": 3500
2460
  },
2461
  {
2462
- "epoch": 0.88574853321557,
2463
- "grad_norm": 0.6769471168518066,
2464
- "learning_rate": 1.9571223785280314e-06,
2465
- "loss": 0.8062,
2466
- "step": 3510
2467
  },
2468
  {
2469
- "epoch": 0.8882720333102012,
2470
- "grad_norm": 0.6867194771766663,
2471
- "learning_rate": 1.8725678511367001e-06,
2472
- "loss": 0.8171,
2473
- "step": 3520
2474
  },
2475
  {
2476
- "epoch": 0.8907955334048325,
2477
- "grad_norm": 0.6660215854644775,
2478
- "learning_rate": 1.789809319401825e-06,
2479
- "loss": 0.8169,
2480
- "step": 3530
2481
  },
2482
  {
2483
- "epoch": 0.8933190334994637,
2484
- "grad_norm": 0.6402613520622253,
2485
- "learning_rate": 1.7088532100621224e-06,
2486
- "loss": 0.7813,
2487
- "step": 3540
2488
  },
2489
  {
2490
- "epoch": 0.895842533594095,
2491
- "grad_norm": 0.6413708925247192,
2492
- "learning_rate": 1.629705809886467e-06,
2493
- "loss": 0.7837,
2494
- "step": 3550
2495
  },
2496
  {
2497
- "epoch": 0.8983660336887263,
2498
- "grad_norm": 0.6048439741134644,
2499
- "learning_rate": 1.5523732651857082e-06,
2500
- "loss": 0.7984,
2501
- "step": 3560
2502
  },
2503
  {
2504
- "epoch": 0.9008895337833576,
2505
- "grad_norm": 0.6774916052818298,
2506
- "learning_rate": 1.4768615813353398e-06,
2507
- "loss": 0.8033,
2508
- "step": 3570
2509
  },
2510
  {
2511
- "epoch": 0.9034130338779888,
2512
- "grad_norm": 0.6154995560646057,
2513
- "learning_rate": 1.4031766223091603e-06,
2514
- "loss": 0.8015,
2515
- "step": 3580
2516
  },
2517
  {
2518
- "epoch": 0.90593653397262,
2519
- "grad_norm": 0.6018934845924377,
2520
- "learning_rate": 1.3313241102239054e-06,
2521
- "loss": 0.7761,
2522
- "step": 3590
2523
  },
2524
  {
2525
- "epoch": 0.9084600340672513,
2526
- "grad_norm": 0.658366322517395,
2527
- "learning_rate": 1.261309624894863e-06,
2528
- "loss": 0.8173,
2529
- "step": 3600
2530
  },
2531
  {
2532
- "epoch": 0.9109835341618825,
2533
- "grad_norm": 0.6167306900024414,
2534
- "learning_rate": 1.1931386034025882e-06,
2535
- "loss": 0.8024,
2536
- "step": 3610
2537
  },
2538
  {
2539
- "epoch": 0.9135070342565138,
2540
- "grad_norm": 0.5509990453720093,
2541
- "learning_rate": 1.1268163396706583e-06,
2542
- "loss": 0.8128,
2543
- "step": 3620
2544
  },
2545
  {
2546
- "epoch": 0.916030534351145,
2547
- "grad_norm": 0.6154832243919373,
2548
- "learning_rate": 1.0623479840545874e-06,
2549
- "loss": 0.7569,
2550
- "step": 3630
2551
  },
2552
  {
2553
- "epoch": 0.9185540344457763,
2554
- "grad_norm": 0.679389238357544,
2555
- "learning_rate": 9.997385429418555e-07,
2556
- "loss": 0.8276,
2557
- "step": 3640
2558
  },
2559
  {
2560
- "epoch": 0.9210775345404075,
2561
- "grad_norm": 0.662276566028595,
2562
- "learning_rate": 9.389928783631207e-07,
2563
- "loss": 0.8304,
2564
- "step": 3650
2565
  },
2566
  {
2567
- "epoch": 0.9236010346350388,
2568
- "grad_norm": 0.6233845949172974,
2569
- "learning_rate": 8.801157076146705e-07,
2570
- "loss": 0.7851,
2571
- "step": 3660
2572
  },
2573
  {
2574
- "epoch": 0.92612453472967,
2575
- "grad_norm": 0.7036879658699036,
2576
- "learning_rate": 8.231116028920765e-07,
2577
- "loss": 0.793,
2578
- "step": 3670
2579
  },
2580
  {
2581
- "epoch": 0.9286480348243014,
2582
- "grad_norm": 0.6103026270866394,
2583
- "learning_rate": 7.679849909351472e-07,
2584
- "loss": 0.7818,
2585
- "step": 3680
2586
  },
2587
  {
2588
- "epoch": 0.9311715349189326,
2589
- "grad_norm": 0.6900059580802917,
2590
- "learning_rate": 7.147401526841485e-07,
2591
- "loss": 0.773,
2592
- "step": 3690
2593
  },
2594
  {
2595
- "epoch": 0.9336950350135638,
2596
- "grad_norm": 0.681058943271637,
2597
- "learning_rate": 6.633812229473791e-07,
2598
- "loss": 0.8357,
2599
- "step": 3700
2600
  },
2601
  {
2602
- "epoch": 0.9362185351081951,
2603
- "grad_norm": 0.7187952995300293,
2604
- "learning_rate": 6.139121900800515e-07,
2605
- "loss": 0.7779,
2606
- "step": 3710
2607
  },
2608
  {
2609
- "epoch": 0.9387420352028263,
2610
- "grad_norm": 0.6179840564727783,
2611
- "learning_rate": 5.663368956745963e-07,
2612
- "loss": 0.7871,
2613
- "step": 3720
2614
  },
2615
  {
2616
- "epoch": 0.9412655352974576,
2617
- "grad_norm": 0.6663089394569397,
2618
- "learning_rate": 5.206590342623164e-07,
2619
- "loss": 0.7901,
2620
- "step": 3730
2621
  },
2622
  {
2623
- "epoch": 0.9437890353920888,
2624
- "grad_norm": 0.6079100370407104,
2625
- "learning_rate": 4.768821530264977e-07,
2626
- "loss": 0.8226,
2627
- "step": 3740
2628
  },
2629
  {
2630
- "epoch": 0.9463125354867201,
2631
- "grad_norm": 0.68614262342453,
2632
- "learning_rate": 4.350096515269325e-07,
2633
- "loss": 0.8185,
2634
- "step": 3750
2635
  },
2636
  {
2637
- "epoch": 0.9488360355813513,
2638
- "grad_norm": 0.6491347551345825,
2639
- "learning_rate": 3.950447814359409e-07,
2640
- "loss": 0.817,
2641
- "step": 3760
2642
  },
2643
  {
2644
- "epoch": 0.9513595356759826,
2645
- "grad_norm": 0.6513685584068298,
2646
- "learning_rate": 3.5699064628583745e-07,
2647
- "loss": 0.7997,
2648
- "step": 3770
2649
  },
2650
  {
2651
- "epoch": 0.9538830357706138,
2652
- "grad_norm": 0.6080814003944397,
2653
- "learning_rate": 3.2085020122793186e-07,
2654
- "loss": 0.7956,
2655
- "step": 3780
2656
  },
2657
  {
2658
- "epoch": 0.956406535865245,
2659
- "grad_norm": 0.6476254463195801,
2660
- "learning_rate": 2.8662625280304613e-07,
2661
- "loss": 0.7888,
2662
- "step": 3790
2663
  },
2664
  {
2665
- "epoch": 0.9589300359598764,
2666
- "grad_norm": 0.6439909934997559,
2667
- "learning_rate": 2.5432145872355816e-07,
2668
- "loss": 0.7847,
2669
- "step": 3800
2670
  },
2671
  {
2672
- "epoch": 0.9614535360545076,
2673
- "grad_norm": 0.6744981408119202,
2674
- "learning_rate": 2.2393832766701706e-07,
2675
- "loss": 0.8093,
2676
- "step": 3810
2677
  },
2678
  {
2679
- "epoch": 0.9639770361491389,
2680
- "grad_norm": 0.5795860886573792,
2681
- "learning_rate": 1.9547921908133483e-07,
2682
- "loss": 0.8082,
2683
- "step": 3820
2684
  },
2685
  {
2686
- "epoch": 0.9665005362437701,
2687
- "grad_norm": 0.6693094968795776,
2688
- "learning_rate": 1.689463430015442e-07,
2689
- "loss": 0.7857,
2690
- "step": 3830
2691
  },
2692
  {
2693
- "epoch": 0.9690240363384014,
2694
- "grad_norm": 0.645203173160553,
2695
- "learning_rate": 1.443417598781971e-07,
2696
- "loss": 0.8056,
2697
- "step": 3840
2698
  },
2699
  {
2700
- "epoch": 0.9715475364330326,
2701
- "grad_norm": 0.6820341348648071,
2702
- "learning_rate": 1.2166738041733684e-07,
2703
- "loss": 0.802,
2704
- "step": 3850
2705
  },
2706
  {
2707
- "epoch": 0.9740710365276639,
2708
- "grad_norm": 0.6292694807052612,
2709
- "learning_rate": 1.0092496543212814e-07,
2710
- "loss": 0.7937,
2711
- "step": 3860
2712
  },
2713
  {
2714
- "epoch": 0.9765945366222951,
2715
- "grad_norm": 0.6253132224082947,
2716
- "learning_rate": 8.211612570611926e-08,
2717
- "loss": 0.7846,
2718
- "step": 3870
2719
  },
2720
  {
2721
- "epoch": 0.9791180367169264,
2722
- "grad_norm": 0.6571831107139587,
2723
- "learning_rate": 6.524232186815305e-08,
2724
- "loss": 0.785,
2725
- "step": 3880
2726
  },
2727
  {
2728
- "epoch": 0.9816415368115576,
2729
- "grad_norm": 0.6356094479560852,
2730
- "learning_rate": 5.03048642789411e-08,
2731
- "loss": 0.7789,
2732
- "step": 3890
2733
  },
2734
  {
2735
- "epoch": 0.9841650369061888,
2736
- "grad_norm": 0.8404703140258789,
2737
- "learning_rate": 3.730491292930072e-08,
2738
- "loss": 0.7954,
2739
- "step": 3900
2740
  },
2741
  {
2742
- "epoch": 0.9866885370008202,
2743
- "grad_norm": 0.7891058325767517,
2744
- "learning_rate": 2.624347735007693e-08,
2745
- "loss": 0.8129,
2746
- "step": 3910
2747
  },
2748
  {
2749
- "epoch": 0.9892120370954514,
2750
- "grad_norm": 0.6858798265457153,
2751
- "learning_rate": 1.7121416533749658e-08,
2752
- "loss": 0.8076,
2753
- "step": 3920
2754
  },
2755
  {
2756
- "epoch": 0.9917355371900827,
2757
- "grad_norm": 0.6489024758338928,
2758
- "learning_rate": 9.939438867723194e-09,
2759
- "loss": 0.8087,
2760
- "step": 3930
2761
  },
2762
  {
2763
- "epoch": 0.9942590372847139,
2764
- "grad_norm": 0.6204003691673279,
2765
- "learning_rate": 4.6981020793118725e-09,
2766
- "loss": 0.8162,
2767
- "step": 3940
2768
  },
2769
  {
2770
- "epoch": 0.9967825373793452,
2771
- "grad_norm": 0.6356140971183777,
2772
- "learning_rate": 1.3978131924385906e-09,
2773
- "loss": 0.7862,
2774
- "step": 3950
2775
  },
2776
  {
2777
- "epoch": 0.9993060374739764,
2778
- "grad_norm": 0.6472454071044922,
2779
- "learning_rate": 3.88284960184393e-11,
2780
- "loss": 0.8188,
2781
- "step": 3960
2782
  }
2783
  ],
2784
- "logging_steps": 10,
2785
- "max_steps": 3962,
2786
  "num_input_tokens_seen": 0,
2787
- "num_train_epochs": 1,
2788
- "save_steps": 1000,
2789
  "stateful_callbacks": {
2790
  "TrainerControl": {
2791
  "args": {
@@ -2798,8 +894,8 @@
2798
  "attributes": {}
2799
  }
2800
  },
2801
- "total_flos": 2.0380844918675866e+18,
2802
- "train_batch_size": 2,
2803
  "trial_name": null,
2804
  "trial_params": null
2805
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500.0,
6
+ "global_step": 6237,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.02405002405002405,
13
+ "grad_norm": 0.4139963388442993,
14
+ "learning_rate": 0.00019996828714700116,
15
+ "loss": 1.5971,
16
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
+ "epoch": 0.0481000481000481,
20
+ "grad_norm": 0.3423018157482147,
21
+ "learning_rate": 0.00019987316870210547,
22
+ "loss": 1.274,
23
+ "step": 100
24
  },
25
  {
26
+ "epoch": 0.07215007215007214,
27
+ "grad_norm": 0.3551710247993469,
28
+ "learning_rate": 0.0001997147049948582,
29
+ "loss": 1.2519,
30
+ "step": 150
31
  },
32
  {
33
+ "epoch": 0.0962000962000962,
34
+ "grad_norm": 0.32329073548316956,
35
+ "learning_rate": 0.0001994929965319844,
36
+ "loss": 1.2382,
37
+ "step": 200
38
  },
39
  {
40
+ "epoch": 0.12025012025012025,
41
+ "grad_norm": 0.48585018515586853,
42
+ "learning_rate": 0.0001992081839336419,
43
+ "loss": 1.2293,
44
+ "step": 250
45
  },
46
  {
47
+ "epoch": 0.1443001443001443,
48
+ "grad_norm": 0.40136224031448364,
49
+ "learning_rate": 0.00019886044784423197,
50
+ "loss": 1.2214,
51
+ "step": 300
52
  },
53
  {
54
+ "epoch": 0.16835016835016836,
55
+ "grad_norm": 0.574002206325531,
56
+ "learning_rate": 0.00019845000881782432,
57
+ "loss": 1.2184,
58
+ "step": 350
59
  },
60
  {
61
+ "epoch": 0.1924001924001924,
62
+ "grad_norm": 0.4179827570915222,
63
+ "learning_rate": 0.00019797712717826914,
64
+ "loss": 1.2064,
65
+ "step": 400
66
  },
67
  {
68
+ "epoch": 0.21645021645021645,
69
+ "grad_norm": 0.33033809065818787,
70
+ "learning_rate": 0.00019744210285408488,
71
+ "loss": 1.2055,
72
+ "step": 450
73
  },
74
  {
75
+ "epoch": 0.2405002405002405,
76
+ "grad_norm": 0.2719138562679291,
77
+ "learning_rate": 0.0001968452751882264,
78
+ "loss": 1.2077,
79
+ "step": 500
80
  },
81
  {
82
+ "epoch": 0.26455026455026454,
83
+ "grad_norm": 0.29797521233558655,
84
+ "learning_rate": 0.00019618702272285434,
85
+ "loss": 1.2096,
86
+ "step": 550
87
  },
88
  {
89
+ "epoch": 0.2886002886002886,
90
+ "grad_norm": 0.3336372673511505,
91
+ "learning_rate": 0.00019546776295924212,
92
+ "loss": 1.2072,
93
+ "step": 600
94
  },
95
  {
96
+ "epoch": 0.3126503126503126,
97
+ "grad_norm": 0.26755037903785706,
98
+ "learning_rate": 0.0001946879520929728,
99
+ "loss": 1.1974,
100
+ "step": 650
101
  },
102
  {
103
+ "epoch": 0.3367003367003367,
104
+ "grad_norm": 0.36268576979637146,
105
+ "learning_rate": 0.00019384808472459368,
106
+ "loss": 1.2045,
107
+ "step": 700
108
  },
109
  {
110
+ "epoch": 0.36075036075036077,
111
+ "grad_norm": 0.3121575713157654,
112
+ "learning_rate": 0.0001929486935459127,
113
+ "loss": 1.1889,
114
+ "step": 750
115
  },
116
  {
117
+ "epoch": 0.3848003848003848,
118
+ "grad_norm": 0.3159404993057251,
119
+ "learning_rate": 0.00019199034900213452,
120
+ "loss": 1.1921,
121
+ "step": 800
122
  },
123
  {
124
+ "epoch": 0.40885040885040885,
125
+ "grad_norm": 0.7236579060554504,
126
+ "learning_rate": 0.000190973658930052,
127
+ "loss": 1.194,
128
+ "step": 850
129
  },
130
  {
131
+ "epoch": 0.4329004329004329,
132
+ "grad_norm": 0.24907168745994568,
133
+ "learning_rate": 0.00018989926817252113,
134
+ "loss": 1.191,
135
+ "step": 900
136
  },
137
  {
138
+ "epoch": 0.45695045695045694,
139
+ "grad_norm": 0.24481187760829926,
140
+ "learning_rate": 0.00018876785816946505,
141
+ "loss": 1.1857,
142
+ "step": 950
143
  },
144
  {
145
+ "epoch": 0.481000481000481,
146
+ "grad_norm": 0.2668200731277466,
147
+ "learning_rate": 0.00018758014652566597,
148
+ "loss": 1.1957,
149
+ "step": 1000
150
  },
151
  {
152
+ "epoch": 0.5050505050505051,
153
+ "grad_norm": 0.2687171399593353,
154
+ "learning_rate": 0.0001863368865556191,
155
+ "loss": 1.1864,
156
+ "step": 1050
157
  },
158
  {
159
+ "epoch": 0.5291005291005291,
160
+ "grad_norm": 0.23915782570838928,
161
+ "learning_rate": 0.0001850388668057379,
162
+ "loss": 1.184,
163
+ "step": 1100
164
  },
165
  {
166
+ "epoch": 0.5531505531505532,
167
+ "grad_norm": 0.37159469723701477,
168
+ "learning_rate": 0.0001836869105542127,
169
+ "loss": 1.1849,
170
+ "step": 1150
171
  },
172
  {
173
+ "epoch": 0.5772005772005772,
174
+ "grad_norm": 0.2752649784088135,
175
+ "learning_rate": 0.0001822818752888408,
176
+ "loss": 1.1843,
177
+ "step": 1200
178
  },
179
  {
180
+ "epoch": 0.6012506012506013,
181
+ "grad_norm": 0.19733025133609772,
182
+ "learning_rate": 0.00018082465216315882,
183
+ "loss": 1.1766,
184
+ "step": 1250
185
  },
186
  {
187
+ "epoch": 0.6253006253006252,
188
+ "grad_norm": 0.2180165797472,
189
+ "learning_rate": 0.00017931616543122214,
190
+ "loss": 1.1865,
191
+ "step": 1300
192
  },
193
  {
194
+ "epoch": 0.6493506493506493,
195
+ "grad_norm": 0.25025510787963867,
196
+ "learning_rate": 0.00017775737186139038,
197
+ "loss": 1.1723,
198
+ "step": 1350
199
  },
200
  {
201
+ "epoch": 0.6734006734006734,
202
+ "grad_norm": 0.2865007817745209,
203
+ "learning_rate": 0.00017614926012949028,
204
+ "loss": 1.172,
205
+ "step": 1400
206
  },
207
  {
208
+ "epoch": 0.6974506974506974,
209
+ "grad_norm": 0.3406023681163788,
210
+ "learning_rate": 0.00017449285019174098,
211
+ "loss": 1.1795,
212
+ "step": 1450
213
  },
214
  {
215
+ "epoch": 0.7215007215007215,
216
+ "grad_norm": 0.19766800105571747,
217
+ "learning_rate": 0.00017278919263783978,
218
+ "loss": 1.1784,
219
+ "step": 1500
220
  },
221
  {
222
+ "epoch": 0.7455507455507455,
223
+ "grad_norm": 0.1965962052345276,
224
+ "learning_rate": 0.00017103936802461797,
225
+ "loss": 1.1754,
226
+ "step": 1550
227
  },
228
  {
229
+ "epoch": 0.7696007696007696,
230
+ "grad_norm": 0.2381555736064911,
231
+ "learning_rate": 0.00016924448619069023,
232
+ "loss": 1.1671,
233
+ "step": 1600
234
  },
235
  {
236
+ "epoch": 0.7936507936507936,
237
+ "grad_norm": 0.20156389474868774,
238
+ "learning_rate": 0.00016740568555253155,
239
+ "loss": 1.1738,
240
+ "step": 1650
241
  },
242
  {
243
+ "epoch": 0.8177008177008177,
244
+ "grad_norm": 0.18294361233711243,
245
+ "learning_rate": 0.00016552413238242857,
246
+ "loss": 1.1727,
247
+ "step": 1700
248
  },
249
  {
250
+ "epoch": 0.8417508417508418,
251
+ "grad_norm": 0.2975623309612274,
252
+ "learning_rate": 0.00016360102006876317,
253
+ "loss": 1.1677,
254
+ "step": 1750
255
  },
256
  {
257
+ "epoch": 0.8658008658008658,
258
+ "grad_norm": 0.1871371865272522,
259
+ "learning_rate": 0.0001616375683590974,
260
+ "loss": 1.1689,
261
+ "step": 1800
262
  },
263
  {
264
+ "epoch": 0.8898508898508899,
265
+ "grad_norm": 0.21457934379577637,
266
+ "learning_rate": 0.00015963502258654005,
267
+ "loss": 1.1605,
268
+ "step": 1850
269
  },
270
  {
271
+ "epoch": 0.9139009139009139,
272
+ "grad_norm": 0.20261706411838531,
273
+ "learning_rate": 0.0001575946528798853,
274
+ "loss": 1.1627,
275
+ "step": 1900
276
  },
277
  {
278
+ "epoch": 0.937950937950938,
279
+ "grad_norm": 0.17685186862945557,
280
+ "learning_rate": 0.0001555177533580245,
281
+ "loss": 1.1627,
282
+ "step": 1950
283
  },
284
  {
285
+ "epoch": 0.962000962000962,
286
+ "grad_norm": 0.212468221783638,
287
+ "learning_rate": 0.00015340564130914233,
288
+ "loss": 1.161,
289
+ "step": 2000
290
  },
291
  {
292
+ "epoch": 0.9860509860509861,
293
+ "grad_norm": 0.175174742937088,
294
+ "learning_rate": 0.00015125965635521724,
295
+ "loss": 1.1688,
296
+ "step": 2050
297
  },
298
  {
299
+ "epoch": 1.0101010101010102,
300
+ "grad_norm": 0.19970253109931946,
301
+ "learning_rate": 0.00014908115960235682,
302
+ "loss": 1.142,
303
+ "step": 2100
304
  },
305
  {
306
+ "epoch": 1.034151034151034,
307
+ "grad_norm": 0.21254608035087585,
308
+ "learning_rate": 0.00014687153277750676,
309
+ "loss": 1.1271,
310
+ "step": 2150
311
  },
312
  {
313
+ "epoch": 1.0582010582010581,
314
+ "grad_norm": 0.1651500016450882,
315
+ "learning_rate": 0.00014463217735208062,
316
+ "loss": 1.121,
317
+ "step": 2200
318
  },
319
  {
320
+ "epoch": 1.0822510822510822,
321
+ "grad_norm": 0.2405405044555664,
322
+ "learning_rate": 0.00014236451365306674,
323
+ "loss": 1.1313,
324
+ "step": 2250
325
  },
326
  {
327
+ "epoch": 1.1063011063011063,
328
+ "grad_norm": 0.17223596572875977,
329
+ "learning_rate": 0.00014006997996217593,
330
+ "loss": 1.1344,
331
  "step": 2300
332
  },
333
  {
334
+ "epoch": 1.1303511303511304,
335
+ "grad_norm": 0.1969347894191742,
336
+ "learning_rate": 0.00013775003160360096,
337
+ "loss": 1.1176,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  "step": 2350
339
  },
340
  {
341
+ "epoch": 1.1544011544011543,
342
+ "grad_norm": 0.187143936753273,
343
+ "learning_rate": 0.00013540614002096701,
344
+ "loss": 1.1322,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  "step": 2400
346
  },
347
  {
348
+ "epoch": 1.1784511784511784,
349
+ "grad_norm": 0.1838238537311554,
350
+ "learning_rate": 0.00013303979184405826,
351
+ "loss": 1.1293,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  "step": 2450
353
  },
354
  {
355
+ "epoch": 1.2025012025012025,
356
+ "grad_norm": 0.17928341031074524,
357
+ "learning_rate": 0.00013065248794591223,
358
+ "loss": 1.1268,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  "step": 2500
360
  },
361
  {
362
+ "epoch": 1.2265512265512266,
363
+ "grad_norm": 0.2683047950267792,
364
+ "learning_rate": 0.00012824574249088063,
365
+ "loss": 1.1234,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  "step": 2550
367
  },
368
  {
369
+ "epoch": 1.2506012506012505,
370
+ "grad_norm": 0.18034860491752625,
371
+ "learning_rate": 0.0001258210819742599,
372
+ "loss": 1.125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  "step": 2600
374
  },
375
  {
376
+ "epoch": 1.2746512746512746,
377
+ "grad_norm": 0.26357391476631165,
378
+ "learning_rate": 0.00012338004425410074,
379
+ "loss": 1.1217,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  "step": 2650
381
  },
382
  {
383
+ "epoch": 1.2987012987012987,
384
+ "grad_norm": 0.17828579246997833,
385
+ "learning_rate": 0.00012092417757581085,
386
+ "loss": 1.1262,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  "step": 2700
388
  },
389
  {
390
+ "epoch": 1.3227513227513228,
391
+ "grad_norm": 0.20247310400009155,
392
+ "learning_rate": 0.00011845503959016928,
393
+ "loss": 1.1246,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  "step": 2750
395
  },
396
  {
397
+ "epoch": 1.3468013468013469,
398
+ "grad_norm": 0.17381271719932556,
399
+ "learning_rate": 0.0001159741963653755,
400
+ "loss": 1.1181,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  "step": 2800
402
  },
403
  {
404
+ "epoch": 1.370851370851371,
405
+ "grad_norm": 0.19958114624023438,
406
+ "learning_rate": 0.00011348322139375948,
407
+ "loss": 1.1307,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  "step": 2850
409
  },
410
  {
411
+ "epoch": 1.3949013949013949,
412
+ "grad_norm": 0.21912401914596558,
413
+ "learning_rate": 0.00011098369459378328,
414
+ "loss": 1.1264,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  "step": 2900
416
  },
417
  {
418
+ "epoch": 1.418951418951419,
419
+ "grad_norm": 0.1694297194480896,
420
+ "learning_rate": 0.00010847720130796631,
421
+ "loss": 1.1256,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  "step": 2950
423
  },
424
  {
425
+ "epoch": 1.443001443001443,
426
+ "grad_norm": 0.13446395099163055,
427
+ "learning_rate": 0.00010596533129737092,
428
+ "loss": 1.1258,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  "step": 3000
430
  },
431
  {
432
+ "epoch": 1.467051467051467,
433
+ "grad_norm": 0.140371173620224,
434
+ "learning_rate": 0.00010344967773328507,
435
+ "loss": 1.1191,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  "step": 3050
437
  },
438
  {
439
+ "epoch": 1.491101491101491,
440
+ "grad_norm": 0.18016813695430756,
441
+ "learning_rate": 0.00010093183618674224,
442
+ "loss": 1.114,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  "step": 3100
444
  },
445
  {
446
+ "epoch": 1.5151515151515151,
447
+ "grad_norm": 0.17306862771511078,
448
+ "learning_rate": 9.84134036165192e-05,
449
+ "loss": 1.1149,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  "step": 3150
451
  },
452
  {
453
+ "epoch": 1.5392015392015392,
454
+ "grad_norm": 0.14116255939006805,
455
+ "learning_rate": 9.589597735625377e-05,
456
+ "loss": 1.123,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  "step": 3200
458
  },
459
  {
460
+ "epoch": 1.5632515632515633,
461
+ "grad_norm": 0.16819800436496735,
462
+ "learning_rate": 9.338115410132441e-05,
463
+ "loss": 1.1203,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  "step": 3250
465
  },
466
  {
467
+ "epoch": 1.5873015873015874,
468
+ "grad_norm": 0.21958529949188232,
469
+ "learning_rate": 9.087052889613518e-05,
470
+ "loss": 1.1226,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  "step": 3300
472
  },
473
  {
474
+ "epoch": 1.6113516113516113,
475
+ "grad_norm": 0.15786272287368774,
476
+ "learning_rate": 8.836569412244745e-05,
477
+ "loss": 1.1212,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  "step": 3350
479
  },
480
  {
481
+ "epoch": 1.6354016354016354,
482
+ "grad_norm": 0.17366796731948853,
483
+ "learning_rate": 8.586823848940047e-05,
484
+ "loss": 1.1129,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  "step": 3400
486
  },
487
  {
488
+ "epoch": 1.6594516594516593,
489
+ "grad_norm": 0.21448016166687012,
490
+ "learning_rate": 8.337974602586152e-05,
491
+ "loss": 1.1216,
492
+ "step": 3450
493
  },
494
  {
495
+ "epoch": 1.6835016835016834,
496
+ "grad_norm": 0.17243099212646484,
497
+ "learning_rate": 8.090179507574427e-05,
498
+ "loss": 1.1096,
499
+ "step": 3500
500
  },
501
  {
502
+ "epoch": 1.7075517075517075,
503
+ "grad_norm": 0.1429734081029892,
504
+ "learning_rate": 7.843595729693316e-05,
505
+ "loss": 1.1071,
506
+ "step": 3550
507
  },
508
  {
509
+ "epoch": 1.7316017316017316,
510
+ "grad_norm": 0.15200386941432953,
511
+ "learning_rate": 7.598379666444808e-05,
512
+ "loss": 1.1158,
513
+ "step": 3600
514
  },
515
  {
516
+ "epoch": 1.7556517556517557,
517
+ "grad_norm": 0.1442406326532364,
518
+ "learning_rate": 7.354686847848242e-05,
519
+ "loss": 1.112,
520
+ "step": 3650
521
  },
522
  {
523
+ "epoch": 1.7797017797017798,
524
+ "grad_norm": 0.17678239941596985,
525
+ "learning_rate": 7.11267183779428e-05,
526
+ "loss": 1.1118,
527
+ "step": 3700
528
  },
529
  {
530
+ "epoch": 1.8037518037518039,
531
+ "grad_norm": 0.147593155503273,
532
+ "learning_rate": 6.872488136011667e-05,
533
+ "loss": 1.1165,
534
+ "step": 3750
535
  },
536
  {
537
+ "epoch": 1.8278018278018278,
538
+ "grad_norm": 0.1334652155637741,
539
+ "learning_rate": 6.634288080708952e-05,
540
+ "loss": 1.1135,
541
+ "step": 3800
542
  },
543
  {
544
+ "epoch": 1.8518518518518519,
545
+ "grad_norm": 0.14890378713607788,
546
+ "learning_rate": 6.398222751952899e-05,
547
+ "loss": 1.1086,
548
+ "step": 3850
549
  },
550
  {
551
+ "epoch": 1.8759018759018757,
552
+ "grad_norm": 0.1334807574748993,
553
+ "learning_rate": 6.164441875844882e-05,
554
+ "loss": 1.1144,
555
+ "step": 3900
556
  },
557
  {
558
+ "epoch": 1.8999518999518998,
559
+ "grad_norm": 0.12897680699825287,
560
+ "learning_rate": 5.933093729556062e-05,
561
+ "loss": 1.1116,
562
+ "step": 3950
563
  },
564
  {
565
+ "epoch": 1.924001924001924,
566
+ "grad_norm": 0.17530564963817596,
567
+ "learning_rate": 5.7043250472815356e-05,
568
+ "loss": 1.1039,
569
+ "step": 4000
570
  },
571
  {
572
+ "epoch": 1.948051948051948,
573
+ "grad_norm": 0.15966495871543884,
574
+ "learning_rate": 5.478280927173145e-05,
575
+ "loss": 1.101,
576
+ "step": 4050
577
  },
578
  {
579
+ "epoch": 1.9721019721019721,
580
+ "grad_norm": 0.18890446424484253,
581
+ "learning_rate": 5.255104739309924e-05,
582
+ "loss": 1.1077,
583
+ "step": 4100
584
  },
585
  {
586
+ "epoch": 1.9961519961519962,
587
+ "grad_norm": 0.1547369807958603,
588
+ "learning_rate": 5.0349380347646494e-05,
589
+ "loss": 1.103,
590
+ "step": 4150
591
  },
592
  {
593
+ "epoch": 2.0202020202020203,
594
+ "grad_norm": 0.13888758420944214,
595
+ "learning_rate": 4.8179204558240444e-05,
596
+ "loss": 1.0826,
597
+ "step": 4200
598
  },
599
  {
600
+ "epoch": 2.0442520442520444,
601
+ "grad_norm": 0.11266086250543594,
602
+ "learning_rate": 4.6041896474197e-05,
603
+ "loss": 1.071,
604
+ "step": 4250
605
  },
606
  {
607
+ "epoch": 2.068302068302068,
608
+ "grad_norm": 0.14245671033859253,
609
+ "learning_rate": 4.393881169825779e-05,
610
+ "loss": 1.0759,
611
+ "step": 4300
612
  },
613
  {
614
+ "epoch": 2.092352092352092,
615
+ "grad_norm": 0.1226249411702156,
616
+ "learning_rate": 4.187128412678969e-05,
617
+ "loss": 1.0742,
618
+ "step": 4350
619
  },
620
  {
621
+ "epoch": 2.1164021164021163,
622
+ "grad_norm": 0.12307476997375488,
623
+ "learning_rate": 3.984062510375155e-05,
624
+ "loss": 1.0721,
625
+ "step": 4400
626
  },
627
  {
628
+ "epoch": 2.1404521404521404,
629
+ "grad_norm": 0.12813834846019745,
630
+ "learning_rate": 3.7848122588965144e-05,
631
+ "loss": 1.0726,
632
+ "step": 4450
633
  },
634
  {
635
+ "epoch": 2.1645021645021645,
636
+ "grad_norm": 0.13432885706424713,
637
+ "learning_rate": 3.5895040341217543e-05,
638
+ "loss": 1.0745,
639
+ "step": 4500
640
  },
641
  {
642
+ "epoch": 2.1885521885521886,
643
+ "grad_norm": 0.11649097502231598,
644
+ "learning_rate": 3.398261711671309e-05,
645
+ "loss": 1.079,
646
+ "step": 4550
647
  },
648
  {
649
+ "epoch": 2.2126022126022127,
650
+ "grad_norm": 0.11140163242816925,
651
+ "learning_rate": 3.211206588338358e-05,
652
+ "loss": 1.0748,
653
+ "step": 4600
654
  },
655
  {
656
+ "epoch": 2.236652236652237,
657
+ "grad_norm": 0.10978424549102783,
658
+ "learning_rate": 3.028457305155483e-05,
659
+ "loss": 1.0726,
660
+ "step": 4650
661
  },
662
  {
663
+ "epoch": 2.260702260702261,
664
+ "grad_norm": 0.11395589262247086,
665
+ "learning_rate": 2.8501297721457422e-05,
666
+ "loss": 1.0656,
667
+ "step": 4700
668
  },
669
  {
670
+ "epoch": 2.284752284752285,
671
+ "grad_norm": 0.10599405318498611,
672
+ "learning_rate": 2.6763370948059353e-05,
673
+ "loss": 1.0765,
674
+ "step": 4750
675
  },
676
  {
677
+ "epoch": 2.3088023088023086,
678
+ "grad_norm": 0.11157254874706268,
679
+ "learning_rate": 2.5071895023686442e-05,
680
+ "loss": 1.0726,
681
+ "step": 4800
682
  },
683
  {
684
+ "epoch": 2.3328523328523327,
685
+ "grad_norm": 0.1390163153409958,
686
+ "learning_rate": 2.342794277888547e-05,
687
+ "loss": 1.0731,
688
+ "step": 4850
689
  },
690
  {
691
+ "epoch": 2.356902356902357,
692
+ "grad_norm": 0.1519329994916916,
693
+ "learning_rate": 2.1832556901973965e-05,
694
+ "loss": 1.0704,
695
+ "step": 4900
696
  },
697
  {
698
+ "epoch": 2.380952380952381,
699
+ "grad_norm": 0.1278182566165924,
700
+ "learning_rate": 2.0286749277707782e-05,
701
+ "loss": 1.0661,
702
+ "step": 4950
703
  },
704
  {
705
+ "epoch": 2.405002405002405,
706
+ "grad_norm": 0.10508263111114502,
707
+ "learning_rate": 1.879150034548588e-05,
708
+ "loss": 1.0758,
709
+ "step": 5000
710
  },
711
  {
712
+ "epoch": 2.429052429052429,
713
+ "grad_norm": 0.09690719097852707,
714
+ "learning_rate": 1.7347758477500044e-05,
715
+ "loss": 1.0644,
716
+ "step": 5050
717
  },
718
  {
719
+ "epoch": 2.4531024531024532,
720
+ "grad_norm": 0.10174595564603806,
721
+ "learning_rate": 1.5956439377222798e-05,
722
+ "loss": 1.0726,
723
+ "step": 5100
724
  },
725
  {
726
+ "epoch": 2.4771524771524773,
727
+ "grad_norm": 0.10294167697429657,
728
+ "learning_rate": 1.4618425498616162e-05,
729
+ "loss": 1.0655,
730
+ "step": 5150
731
  },
732
  {
733
+ "epoch": 2.501202501202501,
734
+ "grad_norm": 0.11103129386901855,
735
+ "learning_rate": 1.3334565486428996e-05,
736
+ "loss": 1.0651,
737
+ "step": 5200
738
  },
739
  {
740
+ "epoch": 2.525252525252525,
741
+ "grad_norm": 0.10614852607250214,
742
+ "learning_rate": 1.2105673637938053e-05,
743
+ "loss": 1.0701,
744
+ "step": 5250
745
  },
746
  {
747
+ "epoch": 2.549302549302549,
748
+ "grad_norm": 0.09437720477581024,
749
+ "learning_rate": 1.0932529386474188e-05,
750
+ "loss": 1.0673,
751
+ "step": 5300
752
  },
753
  {
754
+ "epoch": 2.5733525733525733,
755
+ "grad_norm": 0.0965106412768364,
756
+ "learning_rate": 9.815876807061264e-06,
757
+ "loss": 1.0769,
758
+ "step": 5350
759
  },
760
  {
761
+ "epoch": 2.5974025974025974,
762
+ "grad_norm": 0.09335634112358093,
763
+ "learning_rate": 8.756424144481312e-06,
764
+ "loss": 1.0646,
765
+ "step": 5400
766
  },
767
  {
768
+ "epoch": 2.6214526214526215,
769
+ "grad_norm": 0.09890544414520264,
770
+ "learning_rate": 7.75484336406529e-06,
771
+ "loss": 1.0757,
772
+ "step": 5450
773
  },
774
  {
775
+ "epoch": 2.6455026455026456,
776
+ "grad_norm": 0.09670912474393845,
777
+ "learning_rate": 6.8117697254943106e-06,
778
+ "loss": 1.0668,
779
+ "step": 5500
780
  },
781
  {
782
+ "epoch": 2.6695526695526697,
783
+ "grad_norm": 0.09898468106985092,
784
+ "learning_rate": 5.927801379881714e-06,
785
+ "loss": 1.0745,
786
+ "step": 5550
787
  },
788
  {
789
+ "epoch": 2.6936026936026938,
790
+ "grad_norm": 0.08697386831045151,
791
+ "learning_rate": 5.103498990391509e-06,
792
+ "loss": 1.0653,
793
+ "step": 5600
794
  },
795
  {
796
+ "epoch": 2.717652717652718,
797
+ "grad_norm": 0.09457134455442429,
798
+ "learning_rate": 4.339385376633775e-06,
799
+ "loss": 1.0678,
800
+ "step": 5650
801
  },
802
  {
803
+ "epoch": 2.741702741702742,
804
+ "grad_norm": 0.09092475473880768,
805
+ "learning_rate": 3.6359451830626723e-06,
806
+ "loss": 1.0635,
807
+ "step": 5700
808
  },
809
  {
810
+ "epoch": 2.7657527657527656,
811
+ "grad_norm": 0.08736653625965118,
812
+ "learning_rate": 2.993624571587239e-06,
813
+ "loss": 1.0639,
814
+ "step": 5750
815
  },
816
  {
817
+ "epoch": 2.7898027898027897,
818
+ "grad_norm": 0.09138292819261551,
819
+ "learning_rate": 2.4128309385900717e-06,
820
+ "loss": 1.065,
821
+ "step": 5800
822
  },
823
  {
824
+ "epoch": 2.813852813852814,
825
+ "grad_norm": 0.08842656016349792,
826
+ "learning_rate": 1.8939326565333037e-06,
827
+ "loss": 1.0636,
828
+ "step": 5850
829
  },
830
  {
831
+ "epoch": 2.837902837902838,
832
+ "grad_norm": 0.08870802819728851,
833
+ "learning_rate": 1.437258840315714e-06,
834
+ "loss": 1.0706,
835
+ "step": 5900
836
  },
837
  {
838
+ "epoch": 2.861952861952862,
839
+ "grad_norm": 0.08659425377845764,
840
+ "learning_rate": 1.0430991385293575e-06,
841
+ "loss": 1.0673,
842
+ "step": 5950
843
  },
844
  {
845
+ "epoch": 2.886002886002886,
846
+ "grad_norm": 0.08142086863517761,
847
+ "learning_rate": 7.117035497478553e-07,
848
+ "loss": 1.0697,
849
+ "step": 6000
850
  },
851
  {
852
+ "epoch": 2.91005291005291,
853
+ "grad_norm": 0.080448217689991,
854
+ "learning_rate": 4.432822639630407e-07,
855
+ "loss": 1.0655,
856
+ "step": 6050
857
  },
858
  {
859
+ "epoch": 2.934102934102934,
860
+ "grad_norm": 0.08980288356542587,
861
+ "learning_rate": 2.380055292704575e-07,
862
+ "loss": 1.0701,
863
+ "step": 6100
864
  },
865
  {
866
+ "epoch": 2.958152958152958,
867
+ "grad_norm": 0.08309097588062286,
868
+ "learning_rate": 9.600354388833443e-08,
869
+ "loss": 1.0684,
870
+ "step": 6150
871
  },
872
  {
873
+ "epoch": 2.982202982202982,
874
+ "grad_norm": 0.08456841111183167,
875
+ "learning_rate": 1.7366373578442397e-08,
876
+ "loss": 1.0684,
877
+ "step": 6200
878
  }
879
  ],
880
+ "logging_steps": 50,
881
+ "max_steps": 6237,
882
  "num_input_tokens_seen": 0,
883
+ "num_train_epochs": 3,
884
+ "save_steps": 500,
885
  "stateful_callbacks": {
886
  "TrainerControl": {
887
  "args": {
 
894
  "attributes": {}
895
  }
896
  },
897
+ "total_flos": 2.056700790948663e+20,
898
+ "train_batch_size": 4,
899
  "trial_name": null,
900
  "trial_params": null
901
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ba6c8e40dc2d34ebe219f16f05b9e60b23b1741dcfccb9613485fc1e913f881
3
- size 7032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cb09fa3cec0d925b5877a57afba4d17f256716f468a4f84dfa477dd700225e0
3
+ size 6968
zero_to_fp32.py CHANGED
@@ -10,7 +10,10 @@
10
  # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
  # application.
12
  #
13
- # example: python zero_to_fp32.py . pytorch_model.bin
 
 
 
14
 
15
  import argparse
16
  import torch
@@ -18,6 +21,8 @@ import glob
18
  import math
19
  import os
20
  import re
 
 
21
  from collections import OrderedDict
22
  from dataclasses import dataclass
23
 
@@ -139,7 +144,6 @@ def parse_model_states(files):
139
 
140
 
141
  def parse_optim_states(files, ds_checkpoint_dir):
142
-
143
  total_files = len(files)
144
  state_dicts = []
145
  for f in files:
@@ -420,12 +424,10 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero
420
  offset = 0
421
  total_numel = 0
422
  total_params = 0
423
- for name, shape in param_shapes.items():
424
-
425
  unpartitioned_numel = shape.numel()
426
  total_numel += unpartitioned_numel
427
  total_params += 1
428
-
429
  partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
430
 
431
  if debug:
@@ -521,21 +523,75 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_f
521
  return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
522
 
523
 
524
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
 
 
 
 
 
525
  """
526
  Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
527
  loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
528
 
529
  Args:
530
  - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
531
- - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
 
 
532
  - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
533
  - ``exclude_frozen_parameters``: exclude frozen parameters
534
  """
535
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
537
- print(f"Saving fp32 state dict to {output_file}")
538
- torch.save(state_dict, output_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
 
541
  def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
@@ -578,15 +634,27 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
578
 
579
 
580
  if __name__ == "__main__":
581
-
582
  parser = argparse.ArgumentParser()
583
  parser.add_argument("checkpoint_dir",
584
  type=str,
585
  help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
 
 
 
 
586
  parser.add_argument(
587
- "output_file",
588
  type=str,
589
- help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
 
 
 
 
 
 
 
 
 
590
  parser.add_argument("-t",
591
  "--tag",
592
  type=str,
@@ -599,6 +667,8 @@ if __name__ == "__main__":
599
  debug = args.debug
600
 
601
  convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
602
- args.output_file,
 
 
603
  tag=args.tag,
604
  exclude_frozen_parameters=args.exclude_frozen_parameters)
 
10
  # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
  # application.
12
  #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
 
18
  import argparse
19
  import torch
 
21
  import math
22
  import os
23
  import re
24
+ import json
25
+ from tqdm import tqdm
26
  from collections import OrderedDict
27
  from dataclasses import dataclass
28
 
 
144
 
145
 
146
  def parse_optim_states(files, ds_checkpoint_dir):
 
147
  total_files = len(files)
148
  state_dicts = []
149
  for f in files:
 
424
  offset = 0
425
  total_numel = 0
426
  total_params = 0
427
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
 
428
  unpartitioned_numel = shape.numel()
429
  total_numel += unpartitioned_numel
430
  total_params += 1
 
431
  partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
432
 
433
  if debug:
 
523
  return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
524
 
525
 
526
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) written to ``output_dir``.

    The weights are saved either as ``pytorch_model.bin`` (loadable with ``torch.load(file)`` +
    ``load_state_dict()``) or as ``model.safetensors``. When the state dict is split into several
    shards, a ``*.index.json`` file mapping every tensor name to its shard file is written next to them.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files. It is created if it does not exist.
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. Pass ``None`` to always write a single file.
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Raises:
        ImportError: if ``safe_serialization`` is requested without ``safetensors`` installed, or
            ``max_shard_size`` is set without ``huggingface_hub`` installed.
    """
    # Dependency pre-check: fail before the (slow) checkpoint reconstruction, not after it.
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if max_shard_size is not None:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to state_dict
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)

    # Shard the model if it is too big.
    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
    if max_shard_size is not None:
        # "{suffix}" is filled in by huggingface_hub with the shard numbering when sharding is needed.
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
        state_dict_split = split_torch_state_dict_into_shards(state_dict,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        # Minimal stand-in exposing the two attributes used below, so the save loop is shared.
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Make sure the target directory exists; otherwise the first write below fails with
    # "No such file or directory" after all the conversion work has already been done.
    os.makedirs(output_dir, exist_ok=True)

    # Save the model
    filename_to_tensors = state_dict_split.filename_to_tensors.items()
    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
        shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
        output_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard, output_path, metadata={"format": "pt"})
        else:
            torch.save(shard, output_path)

    # Save index if sharded
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }
        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
        save_index_file = os.path.join(output_dir, save_index_file)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)
595
 
596
 
597
  def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
 
634
 
635
 
636
  if __name__ == "__main__":
 
637
  parser = argparse.ArgumentParser()
638
  parser.add_argument("checkpoint_dir",
639
  type=str,
640
  help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
641
+ parser.add_argument("output_dir",
642
+ type=str,
643
+ help="directory to the pytorch fp32 state_dict output files"
644
+ "(e.g. path/checkpoint-12-output/)")
645
  parser.add_argument(
646
+ "--max_shard_size",
647
  type=str,
648
+ default="5GB",
649
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
650
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
651
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
652
+ "without CPU OOM issues.")
653
+ parser.add_argument(
654
+ "--safe_serialization",
655
+ default=False,
656
+ action='store_true',
657
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
658
  parser.add_argument("-t",
659
  "--tag",
660
  type=str,
 
667
  debug = args.debug
668
 
669
  convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
670
+ args.output_dir,
671
+ max_shard_size=args.max_shard_size,
672
+ safe_serialization=args.safe_serialization,
673
  tag=args.tag,
674
  exclude_frozen_parameters=args.exclude_frozen_parameters)