File size: 23,458 Bytes
46bc97a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.984,
  "eval_steps": 100,
  "global_step": 246,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 583.2621688842773,
      "epoch": 0.12,
      "grad_norm": 0.12920165061950684,
      "kl": 0.0007047017415364583,
      "learning_rate": 4e-07,
      "loss": 0.0039,
      "reward": 0.07673611293236414,
      "reward_std": 0.12897611850251753,
      "rewards/accuracy_reward": 0.07100694642091791,
      "rewards/format_reward": 0.0057291668374091385,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 576.849148050944,
      "epoch": 0.24,
      "grad_norm": 0.14590902626514435,
      "kl": 0.0013418197631835938,
      "learning_rate": 8e-07,
      "loss": 0.0005,
      "reward": 0.0871527800646921,
      "reward_std": 0.13713855588187773,
      "rewards/accuracy_reward": 0.081250002173086,
      "rewards/format_reward": 0.005902777938172221,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 595.0527969360352,
      "epoch": 0.36,
      "grad_norm": 0.13242916762828827,
      "kl": 0.0022638956705729168,
      "learning_rate": 1.2e-06,
      "loss": 0.0035,
      "reward": 0.08645833590999245,
      "reward_std": 0.12982427552342415,
      "rewards/accuracy_reward": 0.07795139101023475,
      "rewards/format_reward": 0.008506944651405017,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 589.0404724121094,
      "epoch": 0.48,
      "grad_norm": 0.19261103868484497,
      "kl": 0.006444295247395833,
      "learning_rate": 1.6e-06,
      "loss": 0.0083,
      "reward": 0.12013889234513045,
      "reward_std": 0.17057897535463173,
      "rewards/accuracy_reward": 0.10694444783342381,
      "rewards/format_reward": 0.013194444729015231,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 600.2750193277994,
      "epoch": 0.6,
      "grad_norm": 0.18090881407260895,
      "kl": 0.00933685302734375,
      "learning_rate": 2e-06,
      "loss": 0.011,
      "reward": 0.12621528124436737,
      "reward_std": 0.1751370935390393,
      "rewards/accuracy_reward": 0.11458333623595536,
      "rewards/format_reward": 0.011631944697971146,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 577.3231079101563,
      "epoch": 0.72,
      "grad_norm": 0.14372943341732025,
      "kl": 0.009203084309895833,
      "learning_rate": 1.9974751105436262e-06,
      "loss": 0.0063,
      "reward": 0.13611111496575176,
      "reward_std": 0.17103372573231657,
      "rewards/accuracy_reward": 0.1302083367947489,
      "rewards/format_reward": 0.005902777938172221,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 564.1434224446615,
      "epoch": 0.84,
      "grad_norm": 0.15495018661022186,
      "kl": 0.009105428059895834,
      "learning_rate": 1.98991319230804e-06,
      "loss": 0.0107,
      "reward": 0.1550347256163756,
      "reward_std": 0.19651179468880098,
      "rewards/accuracy_reward": 0.14583333718279998,
      "rewards/format_reward": 0.009201389101023475,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 600.4140803019205,
      "epoch": 0.96,
      "grad_norm": 0.17615483701229095,
      "kl": 0.012515767415364584,
      "learning_rate": 1.9773524313084854e-06,
      "loss": 0.0119,
      "reward": 0.15885417150954406,
      "reward_std": 0.2019161203255256,
      "rewards/accuracy_reward": 0.14166667039195696,
      "rewards/format_reward": 0.017187500388051072,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 631.5385569254557,
      "epoch": 1.096,
      "grad_norm": 0.16588236391544342,
      "kl": 0.014839680989583333,
      "learning_rate": 1.959856256610988e-06,
      "loss": 0.0096,
      "reward": 0.18350694837669532,
      "reward_std": 0.2348036120335261,
      "rewards/accuracy_reward": 0.15277778275000553,
      "rewards/format_reward": 0.030729167473812897,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 593.8875162760417,
      "epoch": 1.216,
      "grad_norm": 0.18021586537361145,
      "kl": 0.016481526692708335,
      "learning_rate": 1.9375130200295876e-06,
      "loss": 0.0189,
      "reward": 0.19756944961845874,
      "reward_std": 0.24355731457471846,
      "rewards/accuracy_reward": 0.16006944881131252,
      "rewards/format_reward": 0.03750000107102096,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 587.4776229858398,
      "epoch": 1.336,
      "grad_norm": 0.1994011402130127,
      "kl": 0.0164459228515625,
      "learning_rate": 1.9104355499692162e-06,
      "loss": 0.0176,
      "reward": 0.2189236176510652,
      "reward_std": 0.28772813665370145,
      "rewards/accuracy_reward": 0.14583333767950535,
      "rewards/format_reward": 0.07309028026647865,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 575.6356079101563,
      "epoch": 1.456,
      "grad_norm": 0.23497304320335388,
      "kl": 0.02615966796875,
      "learning_rate": 1.8787605816671951e-06,
      "loss": 0.0222,
      "reward": 0.33281251018246016,
      "reward_std": 0.3787623425324758,
      "rewards/accuracy_reward": 0.13923611460874477,
      "rewards/format_reward": 0.19357639501492183,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 581.5133870442709,
      "epoch": 1.576,
      "grad_norm": 0.2583252191543579,
      "kl": 0.039479573567708336,
      "learning_rate": 1.8426480667105175e-06,
      "loss": 0.0407,
      "reward": 0.4626736263434092,
      "reward_std": 0.45475957343975704,
      "rewards/accuracy_reward": 0.13229167129223546,
      "rewards/format_reward": 0.33038195346792537,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 575.3401219685873,
      "epoch": 1.696,
      "grad_norm": 0.2477940022945404,
      "kl": 0.04772135416666667,
      "learning_rate": 1.8022803653156982e-06,
      "loss": 0.037,
      "reward": 0.6152777954936027,
      "reward_std": 0.4820722574989001,
      "rewards/accuracy_reward": 0.11440972487131755,
      "rewards/format_reward": 0.5008680661519368,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 592.2732818603515,
      "epoch": 1.8159999999999998,
      "grad_norm": 0.240928515791893,
      "kl": 0.045609537760416666,
      "learning_rate": 1.7578613254499968e-06,
      "loss": 0.0367,
      "reward": 0.6859375188748041,
      "reward_std": 0.48161858022212983,
      "rewards/accuracy_reward": 0.1130208361428231,
      "rewards/format_reward": 0.5729166840513548,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 570.3640808105469,
      "epoch": 1.936,
      "grad_norm": 0.30649441480636597,
      "kl": 0.05423177083333333,
      "learning_rate": 1.7096152534442513e-06,
      "loss": 0.0334,
      "reward": 0.7704861332972844,
      "reward_std": 0.4483942608038584,
      "rewards/accuracy_reward": 0.10572916980211934,
      "rewards/format_reward": 0.6647569671273231,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.8760559082032,
      "epoch": 2.072,
      "grad_norm": 0.25217875838279724,
      "kl": 0.054768880208333336,
      "learning_rate": 1.6577857812954991e-06,
      "loss": 0.0375,
      "reward": 0.8114583571751912,
      "reward_std": 0.4467007691661517,
      "rewards/accuracy_reward": 0.12256944736776253,
      "rewards/format_reward": 0.6888889064391454,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 565.7517557779948,
      "epoch": 2.192,
      "grad_norm": 0.27332058548927307,
      "kl": 0.0546630859375,
      "learning_rate": 1.6026346363792564e-06,
      "loss": 0.0329,
      "reward": 0.8633680770794551,
      "reward_std": 0.47069497853517533,
      "rewards/accuracy_reward": 0.15156250428408385,
      "rewards/format_reward": 0.711805577079455,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 564.6762344360352,
      "epoch": 2.312,
      "grad_norm": 0.22232523560523987,
      "kl": 0.05868123372395833,
      "learning_rate": 1.5444403197841344e-06,
      "loss": 0.0211,
      "reward": 0.8944444666306178,
      "reward_std": 0.42260901977618537,
      "rewards/accuracy_reward": 0.1461805594774584,
      "rewards/format_reward": 0.7482639104127884,
      "step": 95
    },
    {
      "epoch": 2.432,
      "grad_norm": 0.21494214236736298,
      "learning_rate": 1.4834966999429178e-06,
      "loss": 0.0271,
      "step": 100
    },
    {
      "epoch": 2.432,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 528.4134756234976,
      "eval_kl": 0.060819185697115384,
      "eval_loss": 0.013302656821906567,
      "eval_reward": 0.97275644999284,
      "eval_reward_std": 0.405555764069924,
      "eval_rewards/accuracy_reward": 0.16025641531898424,
      "eval_rewards/format_reward": 0.8125000275098361,
      "eval_runtime": 164.4929,
      "eval_samples_per_second": 0.602,
      "eval_steps_per_second": 0.018,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 562.4533172607422,
      "epoch": 2.552,
      "grad_norm": 0.24463680386543274,
      "kl": 0.06415608723958334,
      "learning_rate": 1.4201115286619464e-06,
      "loss": 0.0246,
      "reward": 0.9512153029441833,
      "reward_std": 0.4153951602677504,
      "rewards/accuracy_reward": 0.16093750478078922,
      "rewards/format_reward": 0.7902777964870135,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 558.5052256266276,
      "epoch": 2.672,
      "grad_norm": 0.22716720402240753,
      "kl": 0.066162109375,
      "learning_rate": 1.3546048870425354e-06,
      "loss": 0.0242,
      "reward": 0.9326389094193777,
      "reward_std": 0.4124096731344859,
      "rewards/accuracy_reward": 0.15625000453243654,
      "rewards/format_reward": 0.7763889074325562,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 547.4661656697591,
      "epoch": 2.792,
      "grad_norm": 0.2325204312801361,
      "kl": 0.07105712890625,
      "learning_rate": 1.2873075691421806e-06,
      "loss": 0.0198,
      "reward": 0.9222222457329432,
      "reward_std": 0.4309779698650042,
      "rewards/accuracy_reward": 0.16579861616094907,
      "rewards/format_reward": 0.7564236293236415,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 547.4175537109375,
      "epoch": 2.912,
      "grad_norm": 0.23773518204689026,
      "kl": 0.06711832682291667,
      "learning_rate": 1.218559411537699e-06,
      "loss": 0.0211,
      "reward": 0.894270858168602,
      "reward_std": 0.41917893588542937,
      "rewards/accuracy_reward": 0.15920139361793798,
      "rewards/format_reward": 0.7350694666306178,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 544.4187637329102,
      "epoch": 3.048,
      "grad_norm": 0.25982293486595154,
      "kl": 0.07174072265625,
      "learning_rate": 1.1487075772256517e-06,
      "loss": 0.029,
      "reward": 0.8729166895151138,
      "reward_std": 0.4223095287879308,
      "rewards/accuracy_reward": 0.16006944874922435,
      "rewards/format_reward": 0.7128472457329432,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 520.4824803670248,
      "epoch": 3.168,
      "grad_norm": 0.24558736383914948,
      "kl": 0.08404541015625,
      "learning_rate": 1.0781048025259646e-06,
      "loss": 0.0261,
      "reward": 0.8881944636503856,
      "reward_std": 0.42243550966183346,
      "rewards/accuracy_reward": 0.16284722611308097,
      "rewards/format_reward": 0.725347238779068,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 537.1248448689779,
      "epoch": 3.288,
      "grad_norm": 0.29106763005256653,
      "kl": 0.07824300130208334,
      "learning_rate": 1.0071076158414974e-06,
      "loss": 0.0249,
      "reward": 0.9265625198682149,
      "reward_std": 0.41332067002852757,
      "rewards/accuracy_reward": 0.17378472685813903,
      "rewards/format_reward": 0.7527777969837188,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 550.350016784668,
      "epoch": 3.408,
      "grad_norm": 0.2443486452102661,
      "kl": 0.071923828125,
      "learning_rate": 9.360745372684345e-07,
      "loss": 0.0295,
      "reward": 0.8960069666306177,
      "reward_std": 0.4272393837571144,
      "rewards/accuracy_reward": 0.17343750571211178,
      "rewards/format_reward": 0.7225694636503855,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 543.1215454101563,
      "epoch": 3.528,
      "grad_norm": 0.24748581647872925,
      "kl": 0.07239176432291666,
      "learning_rate": 8.653642681490607e-07,
      "loss": 0.0247,
      "reward": 0.9074653029441834,
      "reward_std": 0.4173097605506579,
      "rewards/accuracy_reward": 0.1642361162851254,
      "rewards/format_reward": 0.7432291815678279,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 539.6696355183919,
      "epoch": 3.648,
      "grad_norm": 0.2420652210712433,
      "kl": 0.0806884765625,
      "learning_rate": 7.953338797092901e-07,
      "loss": 0.0247,
      "reward": 0.9553819715976715,
      "reward_std": 0.39353689054648083,
      "rewards/accuracy_reward": 0.15902778361923992,
      "rewards/format_reward": 0.7963541895151138,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 542.0953282674153,
      "epoch": 3.768,
      "grad_norm": 0.22927226126194,
      "kl": 0.08297119140625,
      "learning_rate": 7.263370099279171e-07,
      "loss": 0.0375,
      "reward": 0.9421875288089117,
      "reward_std": 0.3737917934854825,
      "rewards/accuracy_reward": 0.14618056000520785,
      "rewards/format_reward": 0.7960069606701533,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 560.2560948689778,
      "epoch": 3.888,
      "grad_norm": 0.2442265897989273,
      "kl": 0.08262532552083333,
      "learning_rate": 6.587220777430095e-07,
      "loss": 0.0438,
      "reward": 0.9187500218550364,
      "reward_std": 0.3985011622309685,
      "rewards/accuracy_reward": 0.1730902827034394,
      "rewards/format_reward": 0.7456597417593003,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 549.7503621419271,
      "epoch": 4.024,
      "grad_norm": 0.36080020666122437,
      "kl": 0.09295247395833334,
      "learning_rate": 5.928305236133016e-07,
      "loss": 0.0348,
      "reward": 0.8918403009573619,
      "reward_std": 0.42846539815266926,
      "rewards/accuracy_reward": 0.16701389451821644,
      "rewards/format_reward": 0.7248264094193776,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 527.6090423583985,
      "epoch": 4.144,
      "grad_norm": 0.2888505756855011,
      "kl": 0.0927978515625,
      "learning_rate": 5.289950853193652e-07,
      "loss": 0.0569,
      "reward": 0.9263889143864313,
      "reward_std": 0.4129257212082545,
      "rewards/accuracy_reward": 0.17309028257926304,
      "rewards/format_reward": 0.75329862733682,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 521.2873443603515,
      "epoch": 4.264,
      "grad_norm": 0.2377632111310959,
      "kl": 0.09527180989583334,
      "learning_rate": 4.6753811771138365e-07,
      "loss": 0.0401,
      "reward": 0.9312500258286794,
      "reward_std": 0.3798152153690656,
      "rewards/accuracy_reward": 0.16128472660978635,
      "rewards/format_reward": 0.7699652989705403,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 545.0975886027019,
      "epoch": 4.384,
      "grad_norm": 0.2456796020269394,
      "kl": 0.093505859375,
      "learning_rate": 4.0876996488842475e-07,
      "loss": 0.0539,
      "reward": 0.9411458532015483,
      "reward_std": 0.40113388895988467,
      "rewards/accuracy_reward": 0.17621528282761573,
      "rewards/format_reward": 0.7649305760860443,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 552.6007125854492,
      "epoch": 4.504,
      "grad_norm": 0.24820686876773834,
      "kl": 0.09252522786458334,
      "learning_rate": 3.529873930293545e-07,
      "loss": 0.0572,
      "reward": 0.9163194666306178,
      "reward_std": 0.38814649879932406,
      "rewards/accuracy_reward": 0.15885417160267631,
      "rewards/format_reward": 0.7574652969837189,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 533.9510559082031,
      "epoch": 4.624,
      "grad_norm": 0.2605077922344208,
      "kl": 0.09794921875,
      "learning_rate": 3.0047209178924635e-07,
      "loss": 0.0484,
      "reward": 0.9178819686174393,
      "reward_std": 0.4066275705893834,
      "rewards/accuracy_reward": 0.16649306093653043,
      "rewards/format_reward": 0.7513889104127884,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 535.4948069254557,
      "epoch": 4.744,
      "grad_norm": 0.24027736485004425,
      "kl": 0.10040690104166666,
      "learning_rate": 2.514892518288988e-07,
      "loss": 0.0508,
      "reward": 0.9135416855414709,
      "reward_std": 0.3980190739035606,
      "rewards/accuracy_reward": 0.1682291696468989,
      "rewards/format_reward": 0.7453125208616257,
      "step": 195
    },
    {
      "epoch": 4.864,
      "grad_norm": 0.24247297644615173,
      "learning_rate": 2.0628622566063058e-07,
      "loss": 0.0556,
      "step": 200
    },
    {
      "epoch": 4.864,
      "eval_clip_ratio": 0.0,
      "eval_completion_length": 497.66507427509015,
      "eval_kl": 0.10235126201923077,
      "eval_loss": 0.09290527552366257,
      "eval_reward": 0.8525641239606417,
      "eval_reward_std": 0.4311282199162703,
      "eval_rewards/accuracy_reward": 0.1314102616161108,
      "eval_rewards/format_reward": 0.7211538690787095,
      "eval_runtime": 164.8378,
      "eval_samples_per_second": 0.601,
      "eval_steps_per_second": 0.018,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 527.5610371907552,
      "epoch": 4.984,
      "grad_norm": 0.24340181052684784,
      "kl": 0.103265380859375,
      "learning_rate": 1.6509127857277782e-07,
      "loss": 0.0592,
      "reward": 0.9018229390184085,
      "reward_std": 0.38689753947158656,
      "rewards/accuracy_reward": 0.16232639336958526,
      "rewards/format_reward": 0.7394965469837189,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 535.6948079427083,
      "epoch": 5.12,
      "grad_norm": 0.2896415889263153,
      "kl": 0.10675455729166666,
      "learning_rate": 1.2811243594045694e-07,
      "loss": 0.059,
      "reward": 0.8942708512147267,
      "reward_std": 0.4014328221480052,
      "rewards/accuracy_reward": 0.15416667043852308,
      "rewards/format_reward": 0.7401041835546494,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 525.1632136027018,
      "epoch": 5.24,
      "grad_norm": 0.2585560977458954,
      "kl": 0.10707194010416667,
      "learning_rate": 9.55364327434105e-08,
      "loss": 0.0576,
      "reward": 0.8967014074325561,
      "reward_std": 0.3964859182635943,
      "rewards/accuracy_reward": 0.1552083384245634,
      "rewards/format_reward": 0.7414930721124013,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 518.6449793497721,
      "epoch": 5.36,
      "grad_norm": 0.2638151943683624,
      "kl": 0.110107421875,
      "learning_rate": 6.75277705956443e-08,
      "loss": 0.0621,
      "reward": 0.910069465637207,
      "reward_std": 0.38575134972731273,
      "rewards/accuracy_reward": 0.1703125045945247,
      "rewards/format_reward": 0.739756965637207,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 519.6583480834961,
      "epoch": 5.48,
      "grad_norm": 0.2671918570995331,
      "kl": 0.10970865885416667,
      "learning_rate": 4.422788704864633e-08,
      "loss": 0.0555,
      "reward": 0.9017361313104629,
      "reward_std": 0.39265564555923144,
      "rewards/accuracy_reward": 0.16128472667187452,
      "rewards/format_reward": 0.7404514094193776,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 518.6614771525066,
      "epoch": 5.6,
      "grad_norm": 0.2516481876373291,
      "kl": 0.10393880208333334,
      "learning_rate": 2.575444136302185e-08,
      "loss": 0.0585,
      "reward": 0.9203125298023224,
      "reward_std": 0.3854361062248548,
      "rewards/accuracy_reward": 0.16527778220673403,
      "rewards/format_reward": 0.7550347407658895,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 523.0328257242838,
      "epoch": 5.72,
      "grad_norm": 0.24842773377895355,
      "kl": 0.10460611979166666,
      "learning_rate": 1.220072035523989e-08,
      "loss": 0.0615,
      "reward": 0.9104166885217031,
      "reward_std": 0.4020949920018514,
      "rewards/accuracy_reward": 0.16076389284183581,
      "rewards/format_reward": 0.7496527930100759,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 548.0658167521159,
      "epoch": 5.84,
      "grad_norm": 0.2960110008716583,
      "kl": 0.10340983072916667,
      "learning_rate": 3.6351673198347087e-09,
      "loss": 0.0572,
      "reward": 0.894097242752711,
      "reward_std": 0.39487800349791846,
      "rewards/accuracy_reward": 0.15312500384946665,
      "rewards/format_reward": 0.740972242752711,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 524.3946365356445,
      "epoch": 5.96,
      "grad_norm": 0.28276899456977844,
      "kl": 0.1115234375,
      "learning_rate": 1.0103640590064522e-10,
      "loss": 0.0614,
      "reward": 0.8968750178813935,
      "reward_std": 0.40502374321222306,
      "rewards/accuracy_reward": 0.15798611640930177,
      "rewards/format_reward": 0.7388889054457347,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 484.8463541666667,
      "epoch": 5.984,
      "kl": 0.11258951822916667,
      "reward": 0.8923611293236414,
      "reward_std": 0.40955925981203717,
      "rewards/accuracy_reward": 0.1605902835726738,
      "rewards/format_reward": 0.731770858168602,
      "step": 246,
      "total_flos": 0.0,
      "train_loss": 0.03362821891328426,
      "train_runtime": 77437.8148,
      "train_samples_per_second": 0.62,
      "train_steps_per_second": 0.003
    }
  ],
  "logging_steps": 5,
  "max_steps": 246,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 12,
  "trial_name": null,
  "trial_params": null
}