File size: 25,085 Bytes
7fdf253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9667896678966788,
  "eval_steps": 40,
  "global_step": 201,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07380073800738007,
      "grad_norm": 80.02477445251274,
      "learning_rate": 5e-07,
      "logits/chosen": -2.7569785118103027,
      "logits/rejected": -2.715679883956909,
      "logps/chosen": -343.655517578125,
      "logps/rejected": -244.0912628173828,
      "loss": 0.687,
      "rewards/accuracies": 0.34375,
      "rewards/chosen": 0.02633141539990902,
      "rewards/margins": 0.006850541569292545,
      "rewards/rejected": 0.0194808728992939,
      "step": 5
    },
    {
      "epoch": 0.14760147601476015,
      "grad_norm": 65.98268514011825,
      "learning_rate": 1e-06,
      "logits/chosen": -2.676809787750244,
      "logits/rejected": -2.666592836380005,
      "logps/chosen": -296.428955078125,
      "logps/rejected": -247.4902801513672,
      "loss": 0.6147,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 0.8011910319328308,
      "rewards/margins": 0.2567104995250702,
      "rewards/rejected": 0.5444804430007935,
      "step": 10
    },
    {
      "epoch": 0.22140221402214022,
      "grad_norm": 48.66427015180346,
      "learning_rate": 9.983100718730718e-07,
      "logits/chosen": -2.416226863861084,
      "logits/rejected": -2.3806653022766113,
      "logps/chosen": -316.8359069824219,
      "logps/rejected": -258.2687683105469,
      "loss": 0.6095,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 2.1471664905548096,
      "rewards/margins": 0.7412694692611694,
      "rewards/rejected": 1.4058969020843506,
      "step": 15
    },
    {
      "epoch": 0.2952029520295203,
      "grad_norm": 50.05057195236849,
      "learning_rate": 9.932517109205849e-07,
      "logits/chosen": -2.1923749446868896,
      "logits/rejected": -2.1478309631347656,
      "logps/chosen": -294.5142517089844,
      "logps/rejected": -243.7734375,
      "loss": 0.556,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 2.3299460411071777,
      "rewards/margins": 1.3834998607635498,
      "rewards/rejected": 0.9464457631111145,
      "step": 20
    },
    {
      "epoch": 0.36900369003690037,
      "grad_norm": 41.37624373189553,
      "learning_rate": 9.848591102083375e-07,
      "logits/chosen": -2.0363731384277344,
      "logits/rejected": -2.030383348464966,
      "logps/chosen": -282.7300720214844,
      "logps/rejected": -221.184326171875,
      "loss": 0.4963,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": 2.7908506393432617,
      "rewards/margins": 1.624943494796753,
      "rewards/rejected": 1.1659072637557983,
      "step": 25
    },
    {
      "epoch": 0.44280442804428044,
      "grad_norm": 43.83501071765918,
      "learning_rate": 9.731890013043367e-07,
      "logits/chosen": -2.0403037071228027,
      "logits/rejected": -1.9934555292129517,
      "logps/chosen": -325.14227294921875,
      "logps/rejected": -214.34542846679688,
      "loss": 0.4972,
      "rewards/accuracies": 0.768750011920929,
      "rewards/chosen": 2.984839916229248,
      "rewards/margins": 1.5722445249557495,
      "rewards/rejected": 1.412595510482788,
      "step": 30
    },
    {
      "epoch": 0.5166051660516605,
      "grad_norm": 47.665657648113644,
      "learning_rate": 9.583202707897073e-07,
      "logits/chosen": -2.0699315071105957,
      "logits/rejected": -2.042548418045044,
      "logps/chosen": -318.35357666015625,
      "logps/rejected": -221.4462432861328,
      "loss": 0.5431,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": 3.141150951385498,
      "rewards/margins": 1.8329731225967407,
      "rewards/rejected": 1.3081778287887573,
      "step": 35
    },
    {
      "epoch": 0.5904059040590406,
      "grad_norm": 42.1852532770112,
      "learning_rate": 9.403534270080829e-07,
      "logits/chosen": -2.1574552059173584,
      "logits/rejected": -2.105395555496216,
      "logps/chosen": -282.8706359863281,
      "logps/rejected": -239.42562866210938,
      "loss": 0.563,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": 2.571629524230957,
      "rewards/margins": 1.7009865045547485,
      "rewards/rejected": 0.8706433176994324,
      "step": 40
    },
    {
      "epoch": 0.5904059040590406,
      "eval_logits/chosen": -2.179224967956543,
      "eval_logits/rejected": -2.15881085395813,
      "eval_logps/chosen": -304.3701171875,
      "eval_logps/rejected": -235.69309997558594,
      "eval_loss": 0.4594477713108063,
      "eval_rewards/accuracies": 0.8185483813285828,
      "eval_rewards/chosen": 2.485563278198242,
      "eval_rewards/margins": 1.8135225772857666,
      "eval_rewards/rejected": 0.6720407009124756,
      "eval_runtime": 131.0305,
      "eval_samples_per_second": 14.661,
      "eval_steps_per_second": 0.237,
      "step": 40
    },
    {
      "epoch": 0.6642066420664207,
      "grad_norm": 38.916777514219696,
      "learning_rate": 9.19409920658098e-07,
      "logits/chosen": -2.225562572479248,
      "logits/rejected": -2.181002378463745,
      "logps/chosen": -276.44537353515625,
      "logps/rejected": -232.626220703125,
      "loss": 0.5076,
      "rewards/accuracies": 0.78125,
      "rewards/chosen": 2.278400182723999,
      "rewards/margins": 1.4811707735061646,
      "rewards/rejected": 0.7972294092178345,
      "step": 45
    },
    {
      "epoch": 0.7380073800738007,
      "grad_norm": 50.710250280321,
      "learning_rate": 8.956313238215823e-07,
      "logits/chosen": -2.2307848930358887,
      "logits/rejected": -2.1967437267303467,
      "logps/chosen": -313.6961364746094,
      "logps/rejected": -241.0548858642578,
      "loss": 0.5239,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 2.592728853225708,
      "rewards/margins": 1.948052167892456,
      "rewards/rejected": 0.6446765661239624,
      "step": 50
    },
    {
      "epoch": 0.8118081180811808,
      "grad_norm": 39.063704669645155,
      "learning_rate": 8.691783729769873e-07,
      "logits/chosen": -2.139880895614624,
      "logits/rejected": -2.139148712158203,
      "logps/chosen": -299.7575988769531,
      "logps/rejected": -245.935546875,
      "loss": 0.5018,
      "rewards/accuracies": 0.8062499761581421,
      "rewards/chosen": 2.354003429412842,
      "rewards/margins": 1.81247878074646,
      "rewards/rejected": 0.5415242910385132,
      "step": 55
    },
    {
      "epoch": 0.8856088560885609,
      "grad_norm": 33.2796085112328,
      "learning_rate": 8.402298824670029e-07,
      "logits/chosen": -2.0772578716278076,
      "logits/rejected": -2.054955005645752,
      "logps/chosen": -295.5028991699219,
      "logps/rejected": -244.0660858154297,
      "loss": 0.4817,
      "rewards/accuracies": 0.793749988079071,
      "rewards/chosen": 2.2515780925750732,
      "rewards/margins": 1.5721994638442993,
      "rewards/rejected": 0.6793786883354187,
      "step": 60
    },
    {
      "epoch": 0.959409594095941,
      "grad_norm": 36.14638757212613,
      "learning_rate": 8.089815357650089e-07,
      "logits/chosen": -2.0140891075134277,
      "logits/rejected": -1.9471585750579834,
      "logps/chosen": -302.58148193359375,
      "logps/rejected": -237.9540252685547,
      "loss": 0.4943,
      "rewards/accuracies": 0.8187500238418579,
      "rewards/chosen": 2.2041029930114746,
      "rewards/margins": 2.0399723052978516,
      "rewards/rejected": 0.16413061320781708,
      "step": 65
    },
    {
      "epoch": 1.033210332103321,
      "grad_norm": 21.302121663013374,
      "learning_rate": 7.756445627110522e-07,
      "logits/chosen": -2.040945053100586,
      "logits/rejected": -2.0241832733154297,
      "logps/chosen": -312.1359558105469,
      "logps/rejected": -239.3393096923828,
      "loss": 0.3303,
      "rewards/accuracies": 0.800000011920929,
      "rewards/chosen": 2.603544235229492,
      "rewards/margins": 2.4756617546081543,
      "rewards/rejected": 0.12788262963294983,
      "step": 70
    },
    {
      "epoch": 1.1070110701107012,
      "grad_norm": 20.556094388092646,
      "learning_rate": 7.404443116588547e-07,
      "logits/chosen": -2.104165554046631,
      "logits/rejected": -2.059689521789551,
      "logps/chosen": -294.634765625,
      "logps/rejected": -238.32437133789062,
      "loss": 0.129,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 3.189236879348755,
      "rewards/margins": 3.7732062339782715,
      "rewards/rejected": -0.5839694142341614,
      "step": 75
    },
    {
      "epoch": 1.1808118081180812,
      "grad_norm": 20.50931148538785,
      "learning_rate": 7.036187261857288e-07,
      "logits/chosen": -2.146726608276367,
      "logits/rejected": -2.1075119972229004,
      "logps/chosen": -297.4272155761719,
      "logps/rejected": -262.4473876953125,
      "loss": 0.154,
      "rewards/accuracies": 0.96875,
      "rewards/chosen": 3.2519805431365967,
      "rewards/margins": 3.6943678855895996,
      "rewards/rejected": -0.44238725304603577,
      "step": 80
    },
    {
      "epoch": 1.1808118081180812,
      "eval_logits/chosen": -2.13566517829895,
      "eval_logits/rejected": -2.110398054122925,
      "eval_logps/chosen": -301.3644104003906,
      "eval_logps/rejected": -238.48484802246094,
      "eval_loss": 0.46015238761901855,
      "eval_rewards/accuracies": 0.8427419066429138,
      "eval_rewards/chosen": 2.7861340045928955,
      "eval_rewards/margins": 2.3932666778564453,
      "eval_rewards/rejected": 0.3928670585155487,
      "eval_runtime": 129.5743,
      "eval_samples_per_second": 14.825,
      "eval_steps_per_second": 0.239,
      "step": 80
    },
    {
      "epoch": 1.2546125461254611,
      "grad_norm": 24.33309810818949,
      "learning_rate": 6.654167366624008e-07,
      "logits/chosen": -2.142047882080078,
      "logits/rejected": -2.1115987300872803,
      "logps/chosen": -289.6197204589844,
      "logps/rejected": -245.8259735107422,
      "loss": 0.1699,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 3.555595874786377,
      "rewards/margins": 4.105128288269043,
      "rewards/rejected": -0.5495321750640869,
      "step": 85
    },
    {
      "epoch": 1.3284132841328413,
      "grad_norm": 23.507286919588484,
      "learning_rate": 6.260965775552713e-07,
      "logits/chosen": -2.1702046394348145,
      "logits/rejected": -2.1256089210510254,
      "logps/chosen": -299.5054626464844,
      "logps/rejected": -242.0937042236328,
      "loss": 0.159,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 3.9076132774353027,
      "rewards/margins": 4.560946464538574,
      "rewards/rejected": -0.6533328890800476,
      "step": 90
    },
    {
      "epoch": 1.4022140221402215,
      "grad_norm": 15.516195820704533,
      "learning_rate": 5.859240418356614e-07,
      "logits/chosen": -2.1203560829162598,
      "logits/rejected": -2.07737398147583,
      "logps/chosen": -270.5323791503906,
      "logps/rejected": -282.30242919921875,
      "loss": 0.1745,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 3.4920401573181152,
      "rewards/margins": 4.871523380279541,
      "rewards/rejected": -1.3794825077056885,
      "step": 95
    },
    {
      "epoch": 1.4760147601476015,
      "grad_norm": 15.962268006534465,
      "learning_rate": 5.451706842957421e-07,
      "logits/chosen": -2.0756678581237793,
      "logits/rejected": -2.0366768836975098,
      "logps/chosen": -285.35400390625,
      "logps/rejected": -261.02069091796875,
      "loss": 0.1518,
      "rewards/accuracies": 0.96875,
      "rewards/chosen": 3.5108916759490967,
      "rewards/margins": 4.940871715545654,
      "rewards/rejected": -1.4299800395965576,
      "step": 100
    },
    {
      "epoch": 1.5498154981549814,
      "grad_norm": 25.320702801914457,
      "learning_rate": 5.041119859162068e-07,
      "logits/chosen": -2.1494388580322266,
      "logits/rejected": -2.1103031635284424,
      "logps/chosen": -291.79193115234375,
      "logps/rejected": -242.1620635986328,
      "loss": 0.1927,
      "rewards/accuracies": 0.9125000238418579,
      "rewards/chosen": 3.214129686355591,
      "rewards/margins": 4.194614410400391,
      "rewards/rejected": -0.980484664440155,
      "step": 105
    },
    {
      "epoch": 1.6236162361623616,
      "grad_norm": 24.127332932431226,
      "learning_rate": 4.630254916940423e-07,
      "logits/chosen": -2.174290180206299,
      "logits/rejected": -2.179755926132202,
      "logps/chosen": -279.0810546875,
      "logps/rejected": -252.66488647460938,
      "loss": 0.1829,
      "rewards/accuracies": 0.9437500238418579,
      "rewards/chosen": 3.157012701034546,
      "rewards/margins": 4.443808078765869,
      "rewards/rejected": -1.2867956161499023,
      "step": 110
    },
    {
      "epoch": 1.6974169741697418,
      "grad_norm": 20.11391135642748,
      "learning_rate": 4.2218893451814e-07,
      "logits/chosen": -2.2010812759399414,
      "logits/rejected": -2.164829730987549,
      "logps/chosen": -289.4188232421875,
      "logps/rejected": -246.65945434570312,
      "loss": 0.1934,
      "rewards/accuracies": 0.949999988079071,
      "rewards/chosen": 3.433326244354248,
      "rewards/margins": 4.391345977783203,
      "rewards/rejected": -0.9580191373825073,
      "step": 115
    },
    {
      "epoch": 1.7712177121771218,
      "grad_norm": 20.706343509306766,
      "learning_rate": 3.8187835777481375e-07,
      "logits/chosen": -2.176086187362671,
      "logits/rejected": -2.1578235626220703,
      "logps/chosen": -281.7149353027344,
      "logps/rejected": -265.0261535644531,
      "loss": 0.2027,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 3.2026546001434326,
      "rewards/margins": 4.289515495300293,
      "rewards/rejected": -1.0868606567382812,
      "step": 120
    },
    {
      "epoch": 1.7712177121771218,
      "eval_logits/chosen": -2.1726152896881104,
      "eval_logits/rejected": -2.146054983139038,
      "eval_logps/chosen": -304.15960693359375,
      "eval_logps/rejected": -246.97988891601562,
      "eval_loss": 0.48685166239738464,
      "eval_rewards/accuracies": 0.8548387289047241,
      "eval_rewards/chosen": 2.5066120624542236,
      "eval_rewards/margins": 2.9632484912872314,
      "eval_rewards/rejected": -0.4566364884376526,
      "eval_runtime": 129.7757,
      "eval_samples_per_second": 14.802,
      "eval_steps_per_second": 0.239,
      "step": 120
    },
    {
      "epoch": 1.8450184501845017,
      "grad_norm": 20.871306894670933,
      "learning_rate": 3.423662493738687e-07,
      "logits/chosen": -2.180792808532715,
      "logits/rejected": -2.159304141998291,
      "logps/chosen": -301.1511535644531,
      "logps/rejected": -255.13919067382812,
      "loss": 0.1609,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 3.2595107555389404,
      "rewards/margins": 4.297440528869629,
      "rewards/rejected": -1.037929654121399,
      "step": 125
    },
    {
      "epoch": 1.918819188191882,
      "grad_norm": 27.947861559843737,
      "learning_rate": 3.039196998086687e-07,
      "logits/chosen": -2.136273145675659,
      "logits/rejected": -2.1014552116394043,
      "logps/chosen": -286.9736022949219,
      "logps/rejected": -244.7154083251953,
      "loss": 0.1847,
      "rewards/accuracies": 0.9375,
      "rewards/chosen": 3.395556926727295,
      "rewards/margins": 4.3099188804626465,
      "rewards/rejected": -0.9143617749214172,
      "step": 130
    },
    {
      "epoch": 1.992619926199262,
      "grad_norm": 20.821197239752305,
      "learning_rate": 2.667985967011878e-07,
      "logits/chosen": -2.1088356971740723,
      "logits/rejected": -2.0703465938568115,
      "logps/chosen": -286.96917724609375,
      "logps/rejected": -256.48016357421875,
      "loss": 0.1724,
      "rewards/accuracies": 0.918749988079071,
      "rewards/chosen": 3.350585460662842,
      "rewards/margins": 4.244786262512207,
      "rewards/rejected": -0.8942006826400757,
      "step": 135
    },
    {
      "epoch": 2.066420664206642,
      "grad_norm": 8.245558323252546,
      "learning_rate": 2.3125386803640183e-07,
      "logits/chosen": -2.1218690872192383,
      "logits/rejected": -2.0660667419433594,
      "logps/chosen": -284.4044494628906,
      "logps/rejected": -270.7417907714844,
      "loss": 0.0938,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 3.351060152053833,
      "rewards/margins": 4.853818416595459,
      "rewards/rejected": -1.5027587413787842,
      "step": 140
    },
    {
      "epoch": 2.140221402214022,
      "grad_norm": 14.140599014287302,
      "learning_rate": 1.9752578596124952e-07,
      "logits/chosen": -2.093632936477661,
      "logits/rejected": -2.0502517223358154,
      "logps/chosen": -288.5584716796875,
      "logps/rejected": -256.74652099609375,
      "loss": 0.0775,
      "rewards/accuracies": 0.981249988079071,
      "rewards/chosen": 3.4150993824005127,
      "rewards/margins": 4.966043949127197,
      "rewards/rejected": -1.5509445667266846,
      "step": 145
    },
    {
      "epoch": 2.2140221402214024,
      "grad_norm": 7.605905759499919,
      "learning_rate": 1.6584234261399532e-07,
      "logits/chosen": -2.0875797271728516,
      "logits/rejected": -2.0646932125091553,
      "logps/chosen": -295.5018310546875,
      "logps/rejected": -290.001708984375,
      "loss": 0.0579,
      "rewards/accuracies": 0.981249988079071,
      "rewards/chosen": 3.694823741912842,
      "rewards/margins": 5.430555820465088,
      "rewards/rejected": -1.7357313632965088,
      "step": 150
    },
    {
      "epoch": 2.2878228782287824,
      "grad_norm": 20.873090027101682,
      "learning_rate": 1.3641770896292082e-07,
      "logits/chosen": -2.0764248371124268,
      "logits/rejected": -2.060342311859131,
      "logps/chosen": -278.5547790527344,
      "logps/rejected": -249.08203125,
      "loss": 0.0718,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 3.3755805492401123,
      "rewards/margins": 5.271130084991455,
      "rewards/rejected": -1.8955495357513428,
      "step": 155
    },
    {
      "epoch": 2.3616236162361623,
      "grad_norm": 12.7807011486128,
      "learning_rate": 1.0945078707215221e-07,
      "logits/chosen": -2.073279857635498,
      "logits/rejected": -2.0515029430389404,
      "logps/chosen": -279.70892333984375,
      "logps/rejected": -263.677734375,
      "loss": 0.0725,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 3.862575054168701,
      "rewards/margins": 5.486065864562988,
      "rewards/rejected": -1.623490571975708,
      "step": 160
    },
    {
      "epoch": 2.3616236162361623,
      "eval_logits/chosen": -2.0765814781188965,
      "eval_logits/rejected": -2.042445182800293,
      "eval_logps/chosen": -301.5458984375,
      "eval_logps/rejected": -246.53857421875,
      "eval_loss": 0.48189839720726013,
      "eval_rewards/accuracies": 0.8629032373428345,
      "eval_rewards/chosen": 2.7679829597473145,
      "eval_rewards/margins": 3.1804890632629395,
      "eval_rewards/rejected": -0.412506103515625,
      "eval_runtime": 129.9118,
      "eval_samples_per_second": 14.787,
      "eval_steps_per_second": 0.239,
      "step": 160
    },
    {
      "epoch": 2.4354243542435423,
      "grad_norm": 16.62994387557585,
      "learning_rate": 8.512386558088919e-08,
      "logits/chosen": -2.0903940200805664,
      "logits/rejected": -2.0252914428710938,
      "logps/chosen": -286.7425842285156,
      "logps/rejected": -249.64614868164062,
      "loss": 0.0707,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 3.8004047870635986,
      "rewards/margins": 5.124575614929199,
      "rewards/rejected": -1.3241703510284424,
      "step": 165
    },
    {
      "epoch": 2.5092250922509223,
      "grad_norm": 13.149398258549308,
      "learning_rate": 6.360138748461013e-08,
      "logits/chosen": -2.078819751739502,
      "logits/rejected": -2.0325751304626465,
      "logps/chosen": -279.3172912597656,
      "logps/rejected": -262.2966003417969,
      "loss": 0.0712,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 3.741664409637451,
      "rewards/margins": 5.292626857757568,
      "rewards/rejected": -1.5509625673294067,
      "step": 170
    },
    {
      "epoch": 2.5830258302583027,
      "grad_norm": 15.477600906013183,
      "learning_rate": 4.5028838547699346e-08,
      "logits/chosen": -2.058854818344116,
      "logits/rejected": -2.045734167098999,
      "logps/chosen": -293.87738037109375,
      "logps/rejected": -277.49139404296875,
      "loss": 0.0756,
      "rewards/accuracies": 0.9750000238418579,
      "rewards/chosen": 3.888404369354248,
      "rewards/margins": 5.4004316329956055,
      "rewards/rejected": -1.512027382850647,
      "step": 175
    },
    {
      "epoch": 2.6568265682656826,
      "grad_norm": 10.486814550692278,
      "learning_rate": 2.9531763861505964e-08,
      "logits/chosen": -2.057389497756958,
      "logits/rejected": -2.0072054862976074,
      "logps/chosen": -284.025634765625,
      "logps/rejected": -249.7481231689453,
      "loss": 0.0701,
      "rewards/accuracies": 0.96875,
      "rewards/chosen": 3.7061257362365723,
      "rewards/margins": 5.206698417663574,
      "rewards/rejected": -1.500572919845581,
      "step": 180
    },
    {
      "epoch": 2.7306273062730626,
      "grad_norm": 16.604175060639175,
      "learning_rate": 1.7214919195619125e-08,
      "logits/chosen": -2.0375514030456543,
      "logits/rejected": -2.0372228622436523,
      "logps/chosen": -293.4367980957031,
      "logps/rejected": -243.2362823486328,
      "loss": 0.0833,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 3.817143201828003,
      "rewards/margins": 5.346969127655029,
      "rewards/rejected": -1.5298258066177368,
      "step": 185
    },
    {
      "epoch": 2.804428044280443,
      "grad_norm": 11.656202399163227,
      "learning_rate": 8.161562878982398e-09,
      "logits/chosen": -2.064812183380127,
      "logits/rejected": -2.0154833793640137,
      "logps/chosen": -295.53033447265625,
      "logps/rejected": -259.0420837402344,
      "loss": 0.0933,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 3.9195189476013184,
      "rewards/margins": 5.322437286376953,
      "rewards/rejected": -1.4029181003570557,
      "step": 190
    },
    {
      "epoch": 2.878228782287823,
      "grad_norm": 12.21325697905649,
      "learning_rate": 2.432892997526026e-09,
      "logits/chosen": -2.0528626441955566,
      "logits/rejected": -2.0427441596984863,
      "logps/chosen": -290.7054443359375,
      "logps/rejected": -244.73696899414062,
      "loss": 0.0959,
      "rewards/accuracies": 0.981249988079071,
      "rewards/chosen": 3.4299838542938232,
      "rewards/margins": 5.134265899658203,
      "rewards/rejected": -1.7042820453643799,
      "step": 195
    },
    {
      "epoch": 2.952029520295203,
      "grad_norm": 13.794636154783172,
      "learning_rate": 6.763371270035457e-11,
      "logits/chosen": -2.0266225337982178,
      "logits/rejected": -2.011596441268921,
      "logps/chosen": -275.36798095703125,
      "logps/rejected": -242.58694458007812,
      "loss": 0.0505,
      "rewards/accuracies": 0.987500011920929,
      "rewards/chosen": 3.6280651092529297,
      "rewards/margins": 5.206905364990234,
      "rewards/rejected": -1.5788400173187256,
      "step": 200
    },
    {
      "epoch": 2.952029520295203,
      "eval_logits/chosen": -2.054385185241699,
      "eval_logits/rejected": -2.0193707942962646,
      "eval_logps/chosen": -301.7057800292969,
      "eval_logps/rejected": -247.90260314941406,
      "eval_loss": 0.48475462198257446,
      "eval_rewards/accuracies": 0.8548387289047241,
      "eval_rewards/chosen": 2.75199556350708,
      "eval_rewards/margins": 3.300902843475342,
      "eval_rewards/rejected": -0.5489078760147095,
      "eval_runtime": 129.5144,
      "eval_samples_per_second": 14.832,
      "eval_steps_per_second": 0.239,
      "step": 200
    },
    {
      "epoch": 2.9667896678966788,
      "step": 201,
      "total_flos": 2369906314051584.0,
      "train_loss": 0.26609369445202957,
      "train_runtime": 7643.0309,
      "train_samples_per_second": 6.784,
      "train_steps_per_second": 0.026
    }
  ],
  "logging_steps": 5,
  "max_steps": 201,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 40,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2369906314051584.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}