chenggong1995 commited on
Commit
4ac5d1e
·
verified ·
1 Parent(s): 9f98dbd

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/lfitou3x)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/299z2w0y)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.4856032096632307,
4
- "train_runtime": 39327.4213,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.763,
7
  "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.3046215271270145,
4
+ "train_runtime": 39674.2598,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.756,
7
  "train_steps_per_second": 0.006
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41bdcf73917ea10381300720c2d8d4641f4824d0ce6dd5530bcda28a0015459e
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:854567b961c7917403b178e4175694cad2584d65934dc4a5ab8586789d620207
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c27708362adb3a3ae7f1063abcca0a3a82cbb2398d03c982ed1ae0dc8aab877
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd4d1eece6c6dda70828314d370c5ff174f5125da6bb7145eac618deb24560fb
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27d8d15a889995ed97ee9e4515d2b3f40a6715e5a9e5b5ab1df85a79e83b8157
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81be49c5316cad8f273d0e521fad4c2096739044571d0c5a4ad20f9c890408ff
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a163d9f25ad6d36f214a676d9b504dbc0c08d03b4642e5659d93a78e2b4ffb82
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68a49f902ce8187358fdd05fdd999a321034f8694dc6291a93dbad48c58660ff
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.4856032096632307,
4
- "train_runtime": 39327.4213,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.763,
7
  "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.3046215271270145,
4
+ "train_runtime": 39674.2598,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.756,
7
  "train_steps_per_second": 0.006
8
  }
trainer_state.json CHANGED
@@ -12,7 +12,7 @@
12
  "clip_ratio": 0.0,
13
  "completion_length": 445.9810485839844,
14
  "epoch": 0.017057569296375266,
15
- "grad_norm": 0.44629430770874023,
16
  "kl": 0.0,
17
  "learning_rate": 1.25e-07,
18
  "loss": -0.0061,
@@ -24,678 +24,678 @@
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
- "completion_length": 450.8755760192871,
28
  "epoch": 0.08528784648187633,
29
- "grad_norm": 0.5371240377426147,
30
- "kl": 5.666911602020264e-05,
31
  "learning_rate": 6.25e-07,
32
- "loss": -0.0072,
33
- "reward": 0.26227679662406445,
34
- "reward_std": 0.3667756309732795,
35
- "rewards/accuracy_reward": 0.1936384029686451,
36
- "rewards/format_reward": 0.0686383958091028,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
- "completion_length": 440.6763626098633,
42
  "epoch": 0.17057569296375266,
43
- "grad_norm": 0.5557937026023865,
44
- "kl": 0.003149533271789551,
45
  "learning_rate": 1.25e-06,
46
- "loss": 0.0071,
47
- "reward": 0.30558037012815475,
48
- "reward_std": 0.4056690149009228,
49
- "rewards/accuracy_reward": 0.19308036658912897,
50
- "rewards/format_reward": 0.11250000540167093,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
- "completion_length": 456.58997573852537,
56
  "epoch": 0.255863539445629,
57
- "grad_norm": 4.744609832763672,
58
- "kl": 0.0615509033203125,
59
  "learning_rate": 1.875e-06,
60
- "loss": 0.0307,
61
- "reward": 0.674107177555561,
62
- "reward_std": 0.5331576444208622,
63
- "rewards/accuracy_reward": 0.27075893972069026,
64
- "rewards/format_reward": 0.4033482324331999,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
- "completion_length": 481.85694046020507,
70
  "epoch": 0.3411513859275053,
71
- "grad_norm": 0.46378934383392334,
72
- "kl": 0.06910400390625,
73
  "learning_rate": 2.5e-06,
74
- "loss": 0.0287,
75
- "reward": 1.0837054058909417,
76
- "reward_std": 0.5833071276545525,
77
- "rewards/accuracy_reward": 0.4551339499652386,
78
- "rewards/format_reward": 0.6285714603960514,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
- "completion_length": 518.9658714294434,
84
  "epoch": 0.42643923240938164,
85
- "grad_norm": 3.33089017868042,
86
- "kl": 0.056927490234375,
87
  "learning_rate": 2.999828909426247e-06,
88
- "loss": 0.0487,
89
- "reward": 1.3687500655651093,
90
- "reward_std": 0.50418995693326,
91
- "rewards/accuracy_reward": 0.581250024586916,
92
- "rewards/format_reward": 0.7875000387430191,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
- "completion_length": 517.6306053161621,
98
  "epoch": 0.511727078891258,
99
- "grad_norm": 7.978788375854492,
100
- "kl": 0.0954803466796875,
101
  "learning_rate": 2.9938448364256362e-06,
102
- "loss": 0.0525,
103
- "reward": 1.4741072177886962,
104
- "reward_std": 0.43093371838331224,
105
- "rewards/accuracy_reward": 0.5995536006987094,
106
- "rewards/format_reward": 0.8745536118745804,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
- "completion_length": 505.315869140625,
112
  "epoch": 0.5970149253731343,
113
- "grad_norm": 175585.21875,
114
- "kl": 64.5933837890625,
115
  "learning_rate": 2.979345224048116e-06,
116
- "loss": 5.1443,
117
- "reward": 1.4098214894533156,
118
- "reward_std": 0.4775668144226074,
119
- "rewards/accuracy_reward": 0.5758928783237934,
120
- "rewards/format_reward": 0.833928607404232,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
- "completion_length": 498.5922119140625,
126
  "epoch": 0.6823027718550106,
127
- "grad_norm": 2.6241941452026367,
128
- "kl": 0.74183349609375,
129
  "learning_rate": 2.956412726139078e-06,
130
- "loss": 0.1161,
131
- "reward": 1.2595982730388642,
132
- "reward_std": 0.5859433703124524,
133
- "rewards/accuracy_reward": 0.5636160977184772,
134
- "rewards/format_reward": 0.695982177555561,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
- "completion_length": 519.4406471252441,
140
  "epoch": 0.767590618336887,
141
- "grad_norm": 52.20960998535156,
142
- "kl": 0.151666259765625,
143
  "learning_rate": 2.925178067512904e-06,
144
- "loss": 0.1406,
145
- "reward": 1.215401840209961,
146
- "reward_std": 0.6168530285358429,
147
- "rewards/accuracy_reward": 0.5671875238418579,
148
- "rewards/format_reward": 0.648214316368103,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
- "completion_length": 566.0794883728028,
154
  "epoch": 0.8528784648187633,
155
- "grad_norm": 9.431961059570312,
156
- "kl": 0.23187255859375,
157
  "learning_rate": 2.88581929876693e-06,
158
- "loss": 0.2278,
159
- "reward": 1.245089340209961,
160
- "reward_std": 0.6514639720320702,
161
- "rewards/accuracy_reward": 0.5736607365310192,
162
- "rewards/format_reward": 0.6714285984635353,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
- "completion_length": 614.7377502441407,
168
  "epoch": 0.9381663113006397,
169
- "grad_norm": 28.97673988342285,
170
- "kl": 3.216015625,
171
  "learning_rate": 2.8385607813186967e-06,
172
- "loss": 0.434,
173
- "reward": 1.1569196969270705,
174
- "reward_std": 0.7305632084608078,
175
- "rewards/accuracy_reward": 0.5412946678698063,
176
- "rewards/format_reward": 0.615625025331974,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
- "completion_length": 559.5640083312989,
182
  "epoch": 1.0341151385927505,
183
- "grad_norm": 21.23921775817871,
184
- "kl": 6.149072265625,
185
  "learning_rate": 2.7836719084521715e-06,
186
- "loss": 0.6512,
187
- "reward": 1.2779018491506577,
188
- "reward_std": 0.6680191181600094,
189
- "rewards/accuracy_reward": 0.5743303805589676,
190
- "rewards/format_reward": 0.7035714641213417,
191
  "step": 60
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
- "completion_length": 550.1716812133789,
196
  "epoch": 1.1194029850746268,
197
- "grad_norm": 22.82264518737793,
198
- "kl": 2.95859375,
199
  "learning_rate": 2.7214655696635407e-06,
200
- "loss": 0.4399,
201
- "reward": 1.30558041036129,
202
- "reward_std": 0.6594597332179546,
203
- "rewards/accuracy_reward": 0.5687500216066838,
204
- "rewards/format_reward": 0.7368303894996643,
205
  "step": 65
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
- "completion_length": 569.2410942077637,
210
  "epoch": 1.2046908315565032,
211
- "grad_norm": 22.326448440551758,
212
- "kl": 2.64033203125,
213
  "learning_rate": 2.652296367060421e-06,
214
- "loss": 0.4324,
215
- "reward": 1.283482199907303,
216
- "reward_std": 0.6751217097043991,
217
- "rewards/accuracy_reward": 0.5662946663796902,
218
- "rewards/format_reward": 0.7171875342726708,
219
  "step": 70
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
- "completion_length": 589.6875289916992,
224
  "epoch": 1.2899786780383795,
225
- "grad_norm": 11.764822959899902,
226
- "kl": 2.82216796875,
227
  "learning_rate": 2.5765585939817676e-06,
228
- "loss": 0.454,
229
- "reward": 1.2718750685453415,
230
- "reward_std": 0.6881907656788826,
231
- "rewards/accuracy_reward": 0.5497768148779869,
232
- "rewards/format_reward": 0.7220982432365417,
233
  "step": 75
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
- "completion_length": 588.3623039245606,
238
  "epoch": 1.375266524520256,
239
- "grad_norm": 16.239723205566406,
240
- "kl": 1.092626953125,
241
  "learning_rate": 2.4946839873611927e-06,
242
- "loss": 0.3543,
243
- "reward": 1.2966518431901932,
244
- "reward_std": 0.7087490603327751,
245
- "rewards/accuracy_reward": 0.5763393104076385,
246
- "rewards/format_reward": 0.7203125327825546,
247
  "step": 80
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
- "completion_length": 554.1701164245605,
252
  "epoch": 1.4605543710021323,
253
- "grad_norm": 225.1170196533203,
254
- "kl": 3.343701171875,
255
  "learning_rate": 2.4071392666461563e-06,
256
- "loss": 0.4732,
257
- "reward": 1.348883980512619,
258
- "reward_std": 0.6181702017784119,
259
- "rewards/accuracy_reward": 0.5640625275671483,
260
- "rewards/format_reward": 0.7848214700818061,
261
  "step": 85
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
- "completion_length": 557.5998046875,
266
  "epoch": 1.5458422174840085,
267
- "grad_norm": 35.329673767089844,
268
- "kl": 3.09814453125,
269
  "learning_rate": 2.314423473302218e-06,
270
- "loss": 0.4767,
271
- "reward": 1.3281250685453414,
272
- "reward_std": 0.6315318033099174,
273
- "rewards/accuracy_reward": 0.5370535977184773,
274
- "rewards/format_reward": 0.7910714611411095,
275
  "step": 90
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
- "completion_length": 565.7868560791015,
280
  "epoch": 1.6311300639658848,
281
- "grad_norm": 64.45903778076172,
282
- "kl": 2.35947265625,
283
  "learning_rate": 2.2170651260682927e-06,
284
- "loss": 0.424,
285
- "reward": 1.3314732700586318,
286
- "reward_std": 0.6345132827758789,
287
- "rewards/accuracy_reward": 0.5468750260770321,
288
- "rewards/format_reward": 0.7845982551574707,
289
  "step": 95
290
  },
291
  {
292
  "epoch": 1.716417910447761,
293
- "grad_norm": 31.969350814819336,
294
  "learning_rate": 2.1156192081791355e-06,
295
- "loss": 2.0191,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 1.716417910447761,
300
  "eval_clip_ratio": 0.0,
301
- "eval_completion_length": 573.7114408937887,
302
- "eval_kl": 6.245007987220447,
303
- "eval_loss": 0.679384171962738,
304
- "eval_reward": 1.3160087325321599,
305
- "eval_reward_std": 0.6340273570138425,
306
- "eval_rewards/accuracy_reward": 0.5342024423824713,
307
- "eval_rewards/format_reward": 0.7818062878645267,
308
- "eval_runtime": 4416.6844,
309
- "eval_samples_per_second": 1.132,
310
  "eval_steps_per_second": 0.01,
311
  "step": 100
312
  },
313
  {
314
  "clip_ratio": 0.0,
315
- "completion_length": 576.5287105560303,
316
  "epoch": 1.8017057569296375,
317
- "grad_norm": 38.427772521972656,
318
- "kl": 14.87158203125,
319
  "learning_rate": 2.010664003729149e-06,
320
- "loss": 0.5592,
321
- "reward": 1.361272382736206,
322
- "reward_std": 0.6285464949905872,
323
- "rewards/accuracy_reward": 0.5775669913738966,
324
- "rewards/format_reward": 0.7837053917348384,
325
  "step": 105
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
- "completion_length": 571.036637878418,
330
  "epoch": 1.886993603411514,
331
- "grad_norm": 13.744680404663086,
332
- "kl": 4.437109375,
333
  "learning_rate": 1.9027978012115653e-06,
334
- "loss": 0.5515,
335
- "reward": 1.3685268431901931,
336
- "reward_std": 0.6233541399240494,
337
- "rewards/accuracy_reward": 0.5756696701049805,
338
- "rewards/format_reward": 0.7928571805357933,
339
  "step": 110
340
  },
341
  {
342
  "clip_ratio": 0.0,
343
- "completion_length": 561.5263648986817,
344
  "epoch": 1.9722814498933903,
345
- "grad_norm": 11.377424240112305,
346
- "kl": 5.195703125,
347
  "learning_rate": 1.7926354830241926e-06,
348
- "loss": 0.5897,
349
- "reward": 1.3955357789993286,
350
- "reward_std": 0.5930619619786739,
351
- "rewards/accuracy_reward": 0.5919643118977547,
352
- "rewards/format_reward": 0.8035714611411094,
353
  "step": 115
354
  },
355
  {
356
  "clip_ratio": 0.0,
357
- "completion_length": 543.7195793151856,
358
  "epoch": 2.068230277185501,
359
- "grad_norm": 21.994367599487305,
360
- "kl": 3.9482421875,
361
  "learning_rate": 1.6808050203829845e-06,
362
- "loss": 0.4929,
363
- "reward": 1.4200893491506577,
364
- "reward_std": 0.5751285634934902,
365
- "rewards/accuracy_reward": 0.594196455925703,
366
- "rewards/format_reward": 0.8258928969502449,
367
  "step": 120
368
  },
369
  {
370
  "clip_ratio": 0.0,
371
- "completion_length": 528.3288215637207,
372
  "epoch": 2.1535181236673773,
373
- "grad_norm": 23.414892196655273,
374
- "kl": 3.4708984375,
375
  "learning_rate": 1.5679438936238768e-06,
376
- "loss": 0.4547,
377
- "reward": 1.4633929193019868,
378
- "reward_std": 0.5353646248579025,
379
- "rewards/accuracy_reward": 0.6140625290572643,
380
- "rewards/format_reward": 0.8493303969502449,
381
  "step": 125
382
  },
383
  {
384
  "clip_ratio": 0.0,
385
- "completion_length": 532.4109573364258,
386
  "epoch": 2.2388059701492535,
387
- "grad_norm": 5.716637134552002,
388
- "kl": 3.59921875,
389
  "learning_rate": 1.454695458298667e-06,
390
- "loss": 0.4406,
391
- "reward": 1.4633929222822188,
392
- "reward_std": 0.5372500255703926,
393
- "rewards/accuracy_reward": 0.6140625238418579,
394
- "rewards/format_reward": 0.8493303999304771,
395
  "step": 130
396
  },
397
  {
398
  "clip_ratio": 0.0,
399
- "completion_length": 529.8064964294433,
400
  "epoch": 2.3240938166311302,
401
- "grad_norm": 11.957624435424805,
402
- "kl": 3.2958984375,
403
  "learning_rate": 1.341705277779715e-06,
404
- "loss": 0.4071,
405
- "reward": 1.4368304163217545,
406
- "reward_std": 0.5057652793824673,
407
- "rewards/accuracy_reward": 0.5850446701049805,
408
- "rewards/format_reward": 0.8517857536673545,
409
  "step": 135
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
- "completion_length": 513.0975715637207,
414
  "epoch": 2.4093816631130065,
415
- "grad_norm": 409.32232666015625,
416
- "kl": 3.7796875,
417
  "learning_rate": 1.2296174432791415e-06,
418
- "loss": 0.452,
419
- "reward": 1.4669643461704254,
420
- "reward_std": 0.5007161863148213,
421
- "rewards/accuracy_reward": 0.6037946708500386,
422
- "rewards/format_reward": 0.8631696820259094,
423
  "step": 140
424
  },
425
  {
426
  "clip_ratio": 0.0,
427
- "completion_length": 513.7160926818848,
428
  "epoch": 2.4946695095948828,
429
- "grad_norm": 4.6824445724487305,
430
- "kl": 3.0923828125,
431
  "learning_rate": 1.1190709022599545e-06,
432
- "loss": 0.3687,
433
- "reward": 1.501339355111122,
434
- "reward_std": 0.501213763654232,
435
- "rewards/accuracy_reward": 0.6316964566707611,
436
- "rewards/format_reward": 0.8696429014205933,
437
  "step": 145
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
- "completion_length": 503.14399871826174,
442
  "epoch": 2.579957356076759,
443
- "grad_norm": 5.0907368659973145,
444
- "kl": 2.658984375,
445
  "learning_rate": 1.0106958161686963e-06,
446
- "loss": 0.3415,
447
- "reward": 1.4787947177886962,
448
- "reward_std": 0.47073035016655923,
449
- "rewards/accuracy_reward": 0.6015625268220901,
450
- "rewards/format_reward": 0.8772321864962578,
451
  "step": 150
452
  },
453
  {
454
  "clip_ratio": 0.0,
455
- "completion_length": 493.3506935119629,
456
  "epoch": 2.6652452025586353,
457
- "grad_norm": 5.612578868865967,
458
- "kl": 3.1716796875,
459
  "learning_rate": 9.051099682520474e-07,
460
- "loss": 0.3894,
461
- "reward": 1.5064732819795608,
462
- "reward_std": 0.45394266620278356,
463
- "rewards/accuracy_reward": 0.6234375290572644,
464
- "rewards/format_reward": 0.8830357596278191,
465
  "step": 155
466
  },
467
  {
468
  "clip_ratio": 0.0,
469
- "completion_length": 491.6685501098633,
470
  "epoch": 2.750533049040512,
471
- "grad_norm": 9.828149795532227,
472
- "kl": 2.4236328125,
473
  "learning_rate": 8.029152419343472e-07,
474
- "loss": 0.3284,
475
- "reward": 1.5341518610715865,
476
- "reward_std": 0.44977339953184126,
477
- "rewards/accuracy_reward": 0.6435268133878708,
478
- "rewards/format_reward": 0.8906250447034836,
479
  "step": 160
480
  },
481
  {
482
  "clip_ratio": 0.0,
483
- "completion_length": 485.81029205322267,
484
  "epoch": 2.835820895522388,
485
- "grad_norm": 4.9597859382629395,
486
- "kl": 2.07568359375,
487
  "learning_rate": 7.046941898307347e-07,
488
- "loss": 0.3018,
489
- "reward": 1.5156250655651093,
490
- "reward_std": 0.4370568677783012,
491
- "rewards/accuracy_reward": 0.6238839596509933,
492
- "rewards/format_reward": 0.8917411118745804,
493
  "step": 165
494
  },
495
  {
496
  "clip_ratio": 0.0,
497
- "completion_length": 497.5314971923828,
498
  "epoch": 2.9211087420042645,
499
- "grad_norm": 84.52203369140625,
500
- "kl": 3.3498046875,
501
  "learning_rate": 6.11006712953975e-07,
502
- "loss": 0.4164,
503
- "reward": 1.5022322148084641,
504
- "reward_std": 0.4752186842262745,
505
- "rewards/accuracy_reward": 0.6212053842842579,
506
- "rewards/format_reward": 0.8810268267989159,
507
  "step": 170
508
  },
509
  {
510
  "clip_ratio": 0.0,
511
- "completion_length": 495.55774307250977,
512
  "epoch": 3.0170575692963753,
513
- "grad_norm": 16.004627227783203,
514
- "kl": 3.1123046875,
515
  "learning_rate": 5.223868690448817e-07,
516
- "loss": 0.4053,
517
- "reward": 1.4607143461704255,
518
- "reward_std": 0.49991785958409307,
519
- "rewards/accuracy_reward": 0.5937500283122062,
520
- "rewards/format_reward": 0.8669643327593803,
521
  "step": 175
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
- "completion_length": 504.62011337280273,
526
  "epoch": 3.1023454157782515,
527
- "grad_norm": 4.859574794769287,
528
- "kl": 3.499609375,
529
  "learning_rate": 4.3933982822017883e-07,
530
- "loss": 0.4582,
531
- "reward": 1.4575893461704255,
532
- "reward_std": 0.5223984435200691,
533
- "rewards/accuracy_reward": 0.6051339566707611,
534
- "rewards/format_reward": 0.8524553969502449,
535
  "step": 180
536
  },
537
  {
538
  "clip_ratio": 0.0,
539
- "completion_length": 505.25716705322264,
540
  "epoch": 3.1876332622601278,
541
- "grad_norm": 6.827749729156494,
542
- "kl": 3.01005859375,
543
  "learning_rate": 3.6233899329188115e-07,
544
- "loss": 0.3983,
545
- "reward": 1.4602679193019867,
546
- "reward_std": 0.49594502747058866,
547
- "rewards/accuracy_reward": 0.5908482372760773,
548
- "rewards/format_reward": 0.8694196835160255,
549
  "step": 185
550
  },
551
  {
552
  "clip_ratio": 0.0,
553
- "completion_length": 499.8402046203613,
554
  "epoch": 3.272921108742004,
555
- "grad_norm": 5.723031520843506,
556
- "kl": 2.642578125,
557
  "learning_rate": 2.9182330117358096e-07,
558
- "loss": 0.381,
559
- "reward": 1.4939732760190965,
560
- "reward_std": 0.4914128452539444,
561
- "rewards/accuracy_reward": 0.615401814877987,
562
- "rewards/format_reward": 0.8785714715719223,
563
  "step": 190
564
  },
565
  {
566
  "clip_ratio": 0.0,
567
- "completion_length": 495.40426177978514,
568
  "epoch": 3.3582089552238807,
569
- "grad_norm": 7.8606390953063965,
570
- "kl": 3.060546875,
571
  "learning_rate": 2.281947207567473e-07,
572
- "loss": 0.3953,
573
- "reward": 1.468303632736206,
574
- "reward_std": 0.4849529266357422,
575
- "rewards/accuracy_reward": 0.5866071686148644,
576
- "rewards/format_reward": 0.8816964745521545,
577
  "step": 195
578
  },
579
  {
580
  "epoch": 3.443496801705757,
581
- "grad_norm": 4.874986171722412,
582
  "learning_rate": 1.718159615201853e-07,
583
- "loss": 0.3222,
584
  "step": 200
585
  },
586
  {
587
  "epoch": 3.443496801705757,
588
  "eval_clip_ratio": 0.0,
589
- "eval_completion_length": 488.8427760700067,
590
- "eval_kl": 2.4132263378594248,
591
- "eval_loss": 0.32682499289512634,
592
- "eval_reward": 1.4757246209409671,
593
- "eval_reward_std": 0.4513232411858373,
594
- "eval_rewards/accuracy_reward": 0.5826677580039722,
595
- "eval_rewards/format_reward": 0.8930568643652212,
596
- "eval_runtime": 4221.4152,
597
- "eval_samples_per_second": 1.184,
598
  "eval_steps_per_second": 0.011,
599
  "step": 200
600
  },
601
  {
602
  "clip_ratio": 0.0,
603
- "completion_length": 489.15783576965333,
604
  "epoch": 3.5287846481876333,
605
- "grad_norm": 5.266751289367676,
606
- "kl": 2.5323974609375,
607
  "learning_rate": 1.2300840593454622e-07,
608
- "loss": 0.3495,
609
- "reward": 1.5162947088479997,
610
- "reward_std": 0.4514313301071525,
611
- "rewards/accuracy_reward": 0.6287946715950966,
612
- "rewards/format_reward": 0.8875000402331352,
613
  "step": 205
614
  },
615
  {
616
  "clip_ratio": 0.0,
617
- "completion_length": 491.2348434448242,
618
  "epoch": 3.6140724946695095,
619
- "grad_norm": 9.579411506652832,
620
- "kl": 2.3810546875,
621
  "learning_rate": 8.20502774480395e-08,
622
- "loss": 0.308,
623
- "reward": 1.5162946969270705,
624
- "reward_std": 0.4419292353093624,
625
- "rewards/accuracy_reward": 0.6263393133878707,
626
- "rewards/format_reward": 0.8899554029107094,
627
  "step": 210
628
  },
629
  {
630
  "clip_ratio": 0.0,
631
- "completion_length": 489.42010803222655,
632
  "epoch": 3.699360341151386,
633
- "grad_norm": 4.795035362243652,
634
- "kl": 2.62646484375,
635
  "learning_rate": 4.917505449659615e-08,
636
- "loss": 0.3449,
637
- "reward": 1.5185268491506576,
638
- "reward_std": 0.45038328543305395,
639
- "rewards/accuracy_reward": 0.6337053865194321,
640
- "rewards/format_reward": 0.8848214730620384,
641
  "step": 215
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
- "completion_length": 495.2564956665039,
646
  "epoch": 3.7846481876332625,
647
- "grad_norm": 6.437369346618652,
648
- "kl": 2.59952392578125,
649
  "learning_rate": 2.4570139579284723e-08,
650
- "loss": 0.3286,
651
- "reward": 1.495535773038864,
652
- "reward_std": 0.46331221498548986,
653
- "rewards/accuracy_reward": 0.6071428850293159,
654
- "rewards/format_reward": 0.8883929014205932,
655
  "step": 220
656
  },
657
  {
658
  "clip_ratio": 0.0,
659
- "completion_length": 485.0676559448242,
660
  "epoch": 3.8699360341151388,
661
- "grad_norm": 4.596541404724121,
662
- "kl": 2.499609375,
663
  "learning_rate": 8.37579098581176e-09,
664
- "loss": 0.3586,
665
- "reward": 1.5156250715255737,
666
- "reward_std": 0.4558119185268879,
667
- "rewards/accuracy_reward": 0.6247768118977547,
668
- "rewards/format_reward": 0.8908482611179351,
669
  "step": 225
670
  },
671
  {
672
  "clip_ratio": 0.0,
673
- "completion_length": 497.7335029602051,
674
  "epoch": 3.955223880597015,
675
- "grad_norm": 5.619830131530762,
676
- "kl": 2.5017578125,
677
  "learning_rate": 6.843232656998933e-10,
678
- "loss": 0.3316,
679
- "reward": 1.4997768551111221,
680
- "reward_std": 0.44890894591808317,
681
- "rewards/accuracy_reward": 0.6104911021888256,
682
- "rewards/format_reward": 0.8892857521772385,
683
  "step": 230
684
  },
685
  {
686
  "clip_ratio": 0.0,
687
- "completion_length": 497.67967987060547,
688
  "epoch": 3.9893390191897655,
689
- "kl": 2.341064453125,
690
- "reward": 1.5100447237491608,
691
- "reward_std": 0.44926475919783115,
692
- "rewards/accuracy_reward": 0.6216518096625805,
693
- "rewards/format_reward": 0.8883928954601288,
694
  "step": 232,
695
  "total_flos": 0.0,
696
- "train_loss": 0.4856032096632307,
697
- "train_runtime": 39327.4213,
698
- "train_samples_per_second": 0.763,
699
  "train_steps_per_second": 0.006
700
  }
701
  ],
 
12
  "clip_ratio": 0.0,
13
  "completion_length": 445.9810485839844,
14
  "epoch": 0.017057569296375266,
15
+ "grad_norm": 0.4470120966434479,
16
  "kl": 0.0,
17
  "learning_rate": 1.25e-07,
18
  "loss": -0.0061,
 
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 457.7648096084595,
28
  "epoch": 0.08528784648187633,
29
+ "grad_norm": 0.46972450613975525,
30
+ "kl": 4.832446575164795e-05,
31
  "learning_rate": 6.25e-07,
32
+ "loss": -0.0007,
33
+ "reward": 0.2547433148138225,
34
+ "reward_std": 0.3450065036304295,
35
+ "rewards/accuracy_reward": 0.19698661542497575,
36
+ "rewards/format_reward": 0.05775669927243143,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
+ "completion_length": 449.00381622314455,
42
  "epoch": 0.17057569296375266,
43
+ "grad_norm": 0.4081119894981384,
44
+ "kl": 0.0028772354125976562,
45
  "learning_rate": 1.25e-06,
46
+ "loss": 0.0138,
47
+ "reward": 0.3084821570664644,
48
+ "reward_std": 0.38891434073448183,
49
+ "rewards/accuracy_reward": 0.2015625100582838,
50
+ "rewards/format_reward": 0.10691964821889996,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
+ "completion_length": 445.6658676147461,
56
  "epoch": 0.255863539445629,
57
+ "grad_norm": 0.8681241273880005,
58
+ "kl": 0.05352325439453125,
59
  "learning_rate": 1.875e-06,
60
+ "loss": 0.0257,
61
+ "reward": 0.6531250305473805,
62
+ "reward_std": 0.5433857575058937,
63
+ "rewards/accuracy_reward": 0.25892858393490314,
64
+ "rewards/format_reward": 0.3941964443773031,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
+ "completion_length": 469.591983795166,
70
  "epoch": 0.3411513859275053,
71
+ "grad_norm": 0.5079839825630188,
72
+ "kl": 0.0329986572265625,
73
  "learning_rate": 2.5e-06,
74
+ "loss": 0.0307,
75
+ "reward": 1.051785758137703,
76
+ "reward_std": 0.5734383672475815,
77
+ "rewards/accuracy_reward": 0.4296875223517418,
78
+ "rewards/format_reward": 0.6220982387661934,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
+ "completion_length": 494.53462371826174,
84
  "epoch": 0.42643923240938164,
85
+ "grad_norm": 0.7165977954864502,
86
+ "kl": 0.0740936279296875,
87
  "learning_rate": 2.999828909426247e-06,
88
+ "loss": 0.046,
89
+ "reward": 1.3908482611179351,
90
+ "reward_std": 0.45604070723056794,
91
+ "rewards/accuracy_reward": 0.5560268141329289,
92
+ "rewards/format_reward": 0.8348214745521545,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
+ "completion_length": 466.7975654602051,
98
  "epoch": 0.511727078891258,
99
+ "grad_norm": 105.83580780029297,
100
+ "kl": 0.15655517578125,
101
  "learning_rate": 2.9938448364256362e-06,
102
+ "loss": 0.0562,
103
+ "reward": 1.4703125596046447,
104
+ "reward_std": 0.39982456117868426,
105
+ "rewards/accuracy_reward": 0.5502232410013675,
106
+ "rewards/format_reward": 0.9200893312692642,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
+ "completion_length": 506.7870819091797,
112
  "epoch": 0.5970149253731343,
113
+ "grad_norm": 185.41656494140625,
114
+ "kl": 0.2352294921875,
115
  "learning_rate": 2.979345224048116e-06,
116
+ "loss": 0.1242,
117
+ "reward": 1.4352679133415223,
118
+ "reward_std": 0.4489331416785717,
119
+ "rewards/accuracy_reward": 0.5587053798139096,
120
+ "rewards/format_reward": 0.8765625402331352,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 534.5277046203613,
126
  "epoch": 0.6823027718550106,
127
+ "grad_norm": 174158.28125,
128
+ "kl": 1967.46162109375,
129
  "learning_rate": 2.956412726139078e-06,
130
+ "loss": 65.1453,
131
+ "reward": 1.3604911327362061,
132
+ "reward_std": 0.5219059348106384,
133
+ "rewards/accuracy_reward": 0.557812524586916,
134
+ "rewards/format_reward": 0.8026786133646965,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
+ "completion_length": 597.9991287231445,
140
  "epoch": 0.767590618336887,
141
+ "grad_norm": 19395.80078125,
142
+ "kl": 781.421875,
143
  "learning_rate": 2.925178067512904e-06,
144
+ "loss": 24.4654,
145
+ "reward": 1.210491119325161,
146
+ "reward_std": 0.614486200362444,
147
+ "rewards/accuracy_reward": 0.5167410962283612,
148
+ "rewards/format_reward": 0.6937500357627868,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
+ "completion_length": 663.8980178833008,
154
  "epoch": 0.8528784648187633,
155
+ "grad_norm": 12.801368713378906,
156
+ "kl": 8.7136962890625,
157
  "learning_rate": 2.88581929876693e-06,
158
+ "loss": 0.4607,
159
+ "reward": 0.9968750447034835,
160
+ "reward_std": 0.6696360170841217,
161
+ "rewards/accuracy_reward": 0.4488839447498322,
162
+ "rewards/format_reward": 0.547991094738245,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
+ "completion_length": 697.5154357910156,
168
  "epoch": 0.9381663113006397,
169
+ "grad_norm": 38.23894119262695,
170
+ "kl": 0.28189697265625,
171
  "learning_rate": 2.8385607813186967e-06,
172
+ "loss": 0.2599,
173
+ "reward": 0.9542411118745804,
174
+ "reward_std": 0.6912495926022529,
175
+ "rewards/accuracy_reward": 0.4694196626543999,
176
+ "rewards/format_reward": 0.4848214492201805,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
+ "completion_length": 649.4566436767578,
182
  "epoch": 1.0341151385927505,
183
+ "grad_norm": 313.91156005859375,
184
+ "kl": 4.05673828125,
185
  "learning_rate": 2.7836719084521715e-06,
186
+ "loss": 0.5276,
187
+ "reward": 1.1421875596046447,
188
+ "reward_std": 0.7461830854415894,
189
+ "rewards/accuracy_reward": 0.529910734295845,
190
+ "rewards/format_reward": 0.6122768133878708,
191
  "step": 60
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
+ "completion_length": 657.656948852539,
196
  "epoch": 1.1194029850746268,
197
+ "grad_norm": 83.17515563964844,
198
+ "kl": 4.4130859375,
199
  "learning_rate": 2.7214655696635407e-06,
200
+ "loss": 0.5723,
201
+ "reward": 1.0674107655882836,
202
+ "reward_std": 0.7840202301740646,
203
+ "rewards/accuracy_reward": 0.4863839507102966,
204
+ "rewards/format_reward": 0.5810268193483352,
205
  "step": 65
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
+ "completion_length": 586.2384178161622,
210
  "epoch": 1.2046908315565032,
211
+ "grad_norm": 15.149043083190918,
212
+ "kl": 2.45390625,
213
  "learning_rate": 2.652296367060421e-06,
214
+ "loss": 0.4104,
215
+ "reward": 1.256919699907303,
216
+ "reward_std": 0.7236043408513069,
217
+ "rewards/accuracy_reward": 0.5531250238418579,
218
+ "rewards/format_reward": 0.703794677555561,
219
  "step": 70
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
+ "completion_length": 574.1111854553222,
224
  "epoch": 1.2899786780383795,
225
+ "grad_norm": 21.18400764465332,
226
+ "kl": 9.9931640625,
227
  "learning_rate": 2.5765585939817676e-06,
228
+ "loss": 0.744,
229
+ "reward": 1.3468750655651092,
230
+ "reward_std": 0.6585553318262101,
231
+ "rewards/accuracy_reward": 0.5841518118977547,
232
+ "rewards/format_reward": 0.7627232521772385,
233
  "step": 75
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
+ "completion_length": 564.7518135070801,
238
  "epoch": 1.375266524520256,
239
+ "grad_norm": 486.4627990722656,
240
+ "kl": 8.6583984375,
241
  "learning_rate": 2.4946839873611927e-06,
242
+ "loss": 0.531,
243
+ "reward": 1.4149554252624512,
244
+ "reward_std": 0.6240757808089257,
245
+ "rewards/accuracy_reward": 0.6238839633762836,
246
+ "rewards/format_reward": 0.7910714626312256,
247
  "step": 80
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
+ "completion_length": 565.4381927490234,
252
  "epoch": 1.4605543710021323,
253
+ "grad_norm": 88.8906021118164,
254
+ "kl": 2.0404296875,
255
  "learning_rate": 2.4071392666461563e-06,
256
+ "loss": 0.1685,
257
+ "reward": 1.3975447028875352,
258
+ "reward_std": 0.6175315536558628,
259
+ "rewards/accuracy_reward": 0.6136160999536514,
260
+ "rewards/format_reward": 0.7839286029338837,
261
  "step": 85
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
+ "completion_length": 569.019669342041,
266
  "epoch": 1.5458422174840085,
267
+ "grad_norm": 17.572298049926758,
268
+ "kl": 1.3623046875,
269
  "learning_rate": 2.314423473302218e-06,
270
+ "loss": 0.0949,
271
+ "reward": 1.3495536297559738,
272
+ "reward_std": 0.6313827067613602,
273
+ "rewards/accuracy_reward": 0.5805803835391998,
274
+ "rewards/format_reward": 0.7689732491970063,
275
  "step": 90
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
+ "completion_length": 589.0908721923828,
280
  "epoch": 1.6311300639658848,
281
+ "grad_norm": 219.19049072265625,
282
+ "kl": 3.03681640625,
283
  "learning_rate": 2.2170651260682927e-06,
284
+ "loss": 0.178,
285
+ "reward": 1.3450893461704254,
286
+ "reward_std": 0.6524915762245656,
287
+ "rewards/accuracy_reward": 0.5834821671247482,
288
+ "rewards/format_reward": 0.7616071730852128,
289
  "step": 95
290
  },
291
  {
292
  "epoch": 1.716417910447761,
293
+ "grad_norm": 9490.1796875,
294
  "learning_rate": 2.1156192081791355e-06,
295
+ "loss": 5.6995,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 1.716417910447761,
300
  "eval_clip_ratio": 0.0,
301
+ "eval_completion_length": 583.028025374245,
302
+ "eval_kl": 8.038014177316294,
303
+ "eval_loss": 0.30782344937324524,
304
+ "eval_reward": 1.30117531782522,
305
+ "eval_reward_std": 0.6690196339695599,
306
+ "eval_rewards/accuracy_reward": 0.538081950558641,
307
+ "eval_rewards/format_reward": 0.7630933717416879,
308
+ "eval_runtime": 4660.7236,
309
+ "eval_samples_per_second": 1.073,
310
  "eval_steps_per_second": 0.01,
311
  "step": 100
312
  },
313
  {
314
  "clip_ratio": 0.0,
315
+ "completion_length": 589.2927730560302,
316
  "epoch": 1.8017057569296375,
317
+ "grad_norm": 36.098472595214844,
318
+ "kl": 46.948193359375,
319
  "learning_rate": 2.010664003729149e-06,
320
+ "loss": 0.191,
321
+ "reward": 1.341964341700077,
322
+ "reward_std": 0.666185948625207,
323
+ "rewards/accuracy_reward": 0.579464315995574,
324
+ "rewards/format_reward": 0.7625000335276126,
325
  "step": 105
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
+ "completion_length": 572.2181030273438,
330
  "epoch": 1.886993603411514,
331
+ "grad_norm": 33.85002517700195,
332
+ "kl": 1.8282470703125,
333
  "learning_rate": 1.9027978012115653e-06,
334
+ "loss": 0.1948,
335
+ "reward": 1.3232143372297287,
336
+ "reward_std": 0.6623021572828293,
337
+ "rewards/accuracy_reward": 0.5607143074274064,
338
+ "rewards/format_reward": 0.7625000298023223,
339
  "step": 110
340
  },
341
  {
342
  "clip_ratio": 0.0,
343
+ "completion_length": 542.2167655944825,
344
  "epoch": 1.9722814498933903,
345
+ "grad_norm": 225.037109375,
346
+ "kl": 0.3978759765625,
347
  "learning_rate": 1.7926354830241926e-06,
348
+ "loss": 0.0266,
349
+ "reward": 1.3252232730388642,
350
+ "reward_std": 0.6835980974137783,
351
+ "rewards/accuracy_reward": 0.5741071701049805,
352
+ "rewards/format_reward": 0.7511161074042321,
353
  "step": 115
354
  },
355
  {
356
  "clip_ratio": 0.0,
357
+ "completion_length": 525.6891525268554,
358
  "epoch": 2.068230277185501,
359
+ "grad_norm": 7.967476844787598,
360
+ "kl": 0.974072265625,
361
  "learning_rate": 1.6808050203829845e-06,
362
+ "loss": 0.0252,
363
+ "reward": 1.304241132736206,
364
+ "reward_std": 0.6886323638260364,
365
+ "rewards/accuracy_reward": 0.5553571663796901,
366
+ "rewards/format_reward": 0.7488839626312256,
367
  "step": 120
368
  },
369
  {
370
  "clip_ratio": 0.0,
371
+ "completion_length": 490.7185485839844,
372
  "epoch": 2.1535181236673773,
373
+ "grad_norm": 2670.3564453125,
374
+ "kl": 3.83818359375,
375
  "learning_rate": 1.5679438936238768e-06,
376
+ "loss": 0.0711,
377
+ "reward": 1.361607202887535,
378
+ "reward_std": 0.621609278768301,
379
+ "rewards/accuracy_reward": 0.5705357417464256,
380
+ "rewards/format_reward": 0.7910714641213417,
381
  "step": 125
382
  },
383
  {
384
  "clip_ratio": 0.0,
385
+ "completion_length": 489.616316986084,
386
  "epoch": 2.2388059701492535,
387
+ "grad_norm": 92.28889465332031,
388
+ "kl": 2.853662109375,
389
  "learning_rate": 1.454695458298667e-06,
390
+ "loss": 0.0984,
391
+ "reward": 1.4082589894533157,
392
+ "reward_std": 0.5909771449863911,
393
+ "rewards/accuracy_reward": 0.5848214507102967,
394
+ "rewards/format_reward": 0.8234375342726707,
395
  "step": 130
396
  },
397
  {
398
  "clip_ratio": 0.0,
399
+ "completion_length": 480.3973449707031,
400
  "epoch": 2.3240938166311302,
401
+ "grad_norm": 636.6416015625,
402
+ "kl": 1.462548828125,
403
  "learning_rate": 1.341705277779715e-06,
404
+ "loss": 0.0403,
405
+ "reward": 1.3687500536441803,
406
+ "reward_std": 0.5565730266273021,
407
+ "rewards/accuracy_reward": 0.5379464507102967,
408
+ "rewards/format_reward": 0.8308036103844643,
409
  "step": 135
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
+ "completion_length": 473.5602897644043,
414
  "epoch": 2.4093816631130065,
415
+ "grad_norm": 175.14183044433594,
416
+ "kl": 1.86015625,
417
  "learning_rate": 1.2296174432791415e-06,
418
+ "loss": 0.0452,
419
+ "reward": 1.3828125596046448,
420
+ "reward_std": 0.5798827618360519,
421
+ "rewards/accuracy_reward": 0.5468750238418579,
422
+ "rewards/format_reward": 0.8359375432133674,
423
  "step": 140
424
  },
425
  {
426
  "clip_ratio": 0.0,
427
+ "completion_length": 460.103589630127,
428
  "epoch": 2.4946695095948828,
429
+ "grad_norm": 2649.130859375,
430
+ "kl": 2.87041015625,
431
  "learning_rate": 1.1190709022599545e-06,
432
+ "loss": 0.0699,
433
+ "reward": 1.3917411416769028,
434
+ "reward_std": 0.5849474132061004,
435
+ "rewards/accuracy_reward": 0.556919663399458,
436
+ "rewards/format_reward": 0.8348214626312256,
437
  "step": 145
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
+ "completion_length": 458.2018081665039,
442
  "epoch": 2.579957356076759,
443
+ "grad_norm": 1305.534423828125,
444
+ "kl": 4.1134765625,
445
  "learning_rate": 1.0106958161686963e-06,
446
+ "loss": 0.1285,
447
+ "reward": 1.340178629755974,
448
+ "reward_std": 0.6000380210578442,
449
+ "rewards/accuracy_reward": 0.5174107395112515,
450
+ "rewards/format_reward": 0.8227678969502449,
451
  "step": 150
452
  },
453
  {
454
  "clip_ratio": 0.0,
455
+ "completion_length": 452.27725372314455,
456
  "epoch": 2.6652452025586353,
457
+ "grad_norm": 481.3914794921875,
458
+ "kl": 2.22734375,
459
  "learning_rate": 9.051099682520474e-07,
460
+ "loss": 0.039,
461
+ "reward": 1.3504464954137803,
462
+ "reward_std": 0.613651292026043,
463
+ "rewards/accuracy_reward": 0.5243303805589676,
464
+ "rewards/format_reward": 0.826116107404232,
465
  "step": 155
466
  },
467
  {
468
  "clip_ratio": 0.0,
469
+ "completion_length": 452.8314926147461,
470
  "epoch": 2.750533049040512,
471
+ "grad_norm": 59.583213806152344,
472
+ "kl": 2.50859375,
473
  "learning_rate": 8.029152419343472e-07,
474
+ "loss": 0.058,
475
+ "reward": 1.3649554133415223,
476
+ "reward_std": 0.6238026581704617,
477
+ "rewards/accuracy_reward": 0.544419664889574,
478
+ "rewards/format_reward": 0.8205357521772385,
479
  "step": 160
480
  },
481
  {
482
  "clip_ratio": 0.0,
483
+ "completion_length": 455.3404197692871,
484
  "epoch": 2.835820895522388,
485
+ "grad_norm": 1071.0487060546875,
486
+ "kl": 2.58671875,
487
  "learning_rate": 7.046941898307347e-07,
488
+ "loss": 0.076,
489
+ "reward": 1.340848270058632,
490
+ "reward_std": 0.6379699215292931,
491
+ "rewards/accuracy_reward": 0.5301339507102967,
492
+ "rewards/format_reward": 0.8107143223285675,
493
  "step": 165
494
  },
495
  {
496
  "clip_ratio": 0.0,
497
+ "completion_length": 461.10560073852537,
498
  "epoch": 2.9211087420042645,
499
+ "grad_norm": 869.7614135742188,
500
+ "kl": 2.18603515625,
501
  "learning_rate": 6.11006712953975e-07,
502
+ "loss": 0.06,
503
+ "reward": 1.3625000655651092,
504
+ "reward_std": 0.605516166985035,
505
+ "rewards/accuracy_reward": 0.5419643118977546,
506
+ "rewards/format_reward": 0.8205357491970062,
507
  "step": 170
508
  },
509
  {
510
  "clip_ratio": 0.0,
511
+ "completion_length": 465.46427154541016,
512
  "epoch": 3.0170575692963753,
513
+ "grad_norm": 37.821475982666016,
514
+ "kl": 3.72734375,
515
  "learning_rate": 5.223868690448817e-07,
516
+ "loss": 0.0881,
517
+ "reward": 1.3287946999073028,
518
+ "reward_std": 0.6144991792738438,
519
+ "rewards/accuracy_reward": 0.5147321581840515,
520
+ "rewards/format_reward": 0.8140625327825546,
521
  "step": 175
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
+ "completion_length": 473.2102905273438,
526
  "epoch": 3.1023454157782515,
527
+ "grad_norm": 2058.541015625,
528
+ "kl": 3.3796875,
529
  "learning_rate": 4.3933982822017883e-07,
530
+ "loss": 0.108,
531
+ "reward": 1.3754464894533158,
532
+ "reward_std": 0.6123204372823239,
533
+ "rewards/accuracy_reward": 0.5506696686148643,
534
+ "rewards/format_reward": 0.8247768208384514,
535
  "step": 180
536
  },
537
  {
538
  "clip_ratio": 0.0,
539
+ "completion_length": 475.0154228210449,
540
  "epoch": 3.1876332622601278,
541
+ "grad_norm": 267681.28125,
542
+ "kl": 63.9951171875,
543
  "learning_rate": 3.6233899329188115e-07,
544
+ "loss": 3.57,
545
+ "reward": 1.3428572058677672,
546
+ "reward_std": 0.6201678223907947,
547
+ "rewards/accuracy_reward": 0.5258928827941418,
548
+ "rewards/format_reward": 0.8169643193483352,
549
  "step": 185
550
  },
551
  {
552
  "clip_ratio": 0.0,
553
+ "completion_length": 471.1038162231445,
554
  "epoch": 3.272921108742004,
555
+ "grad_norm": 2452.4267578125,
556
+ "kl": 32.8134765625,
557
  "learning_rate": 2.9182330117358096e-07,
558
+ "loss": 1.8761,
559
+ "reward": 1.3464286357164383,
560
+ "reward_std": 0.6306888595223427,
561
+ "rewards/accuracy_reward": 0.5354910954833031,
562
+ "rewards/format_reward": 0.810937537252903,
563
  "step": 190
564
  },
565
  {
566
  "clip_ratio": 0.0,
567
+ "completion_length": 461.9361801147461,
568
  "epoch": 3.3582089552238807,
569
+ "grad_norm": 938.4805908203125,
570
+ "kl": 4.35234375,
571
  "learning_rate": 2.281947207567473e-07,
572
+ "loss": 0.131,
573
+ "reward": 1.3305804163217545,
574
+ "reward_std": 0.6262686759233475,
575
+ "rewards/accuracy_reward": 0.5127232395112514,
576
+ "rewards/format_reward": 0.8178571835160255,
577
  "step": 195
578
  },
579
  {
580
  "epoch": 3.443496801705757,
581
+ "grad_norm": 1083.9442138671875,
582
  "learning_rate": 1.718159615201853e-07,
583
+ "loss": 0.0121,
584
  "step": 200
585
  },
586
  {
587
  "epoch": 3.443496801705757,
588
  "eval_clip_ratio": 0.0,
589
+ "eval_completion_length": 467.76792581089006,
590
+ "eval_kl": 2.3580770766773163,
591
+ "eval_loss": 0.06606976687908173,
592
+ "eval_reward": 1.3115872357980893,
593
+ "eval_reward_std": 0.6067914157248915,
594
+ "eval_rewards/accuracy_reward": 0.492383637367346,
595
+ "eval_rewards/format_reward": 0.8192035969073018,
596
+ "eval_runtime": 4249.3378,
597
+ "eval_samples_per_second": 1.177,
598
  "eval_steps_per_second": 0.011,
599
  "step": 200
600
  },
601
  {
602
  "clip_ratio": 0.0,
603
+ "completion_length": 464.8625213623047,
604
  "epoch": 3.5287846481876333,
605
+ "grad_norm": 41.21245193481445,
606
+ "kl": 2.200341796875,
607
  "learning_rate": 1.2300840593454622e-07,
608
+ "loss": 0.076,
609
+ "reward": 1.3582589879631997,
610
+ "reward_std": 0.6224493138492108,
611
+ "rewards/accuracy_reward": 0.5436384178698063,
612
+ "rewards/format_reward": 0.8146205730736256,
613
  "step": 205
614
  },
615
  {
616
  "clip_ratio": 0.0,
617
+ "completion_length": 465.8446632385254,
618
  "epoch": 3.6140724946695095,
619
+ "grad_norm": 418.15625,
620
+ "kl": 2.2728515625,
621
  "learning_rate": 8.20502774480395e-08,
622
+ "loss": 0.0404,
623
+ "reward": 1.377901840209961,
624
+ "reward_std": 0.6090469680726528,
625
+ "rewards/accuracy_reward": 0.5529018096625805,
626
+ "rewards/format_reward": 0.8250000357627869,
627
  "step": 210
628
  },
629
  {
630
  "clip_ratio": 0.0,
631
+ "completion_length": 464.74443969726565,
632
  "epoch": 3.699360341151386,
633
+ "grad_norm": 315.2561340332031,
634
+ "kl": 2.88203125,
635
  "learning_rate": 4.917505449659615e-08,
636
+ "loss": 0.0884,
637
+ "reward": 1.368080422282219,
638
+ "reward_std": 0.6142228864133358,
639
+ "rewards/accuracy_reward": 0.5479910932481289,
640
+ "rewards/format_reward": 0.8200893253087997,
641
  "step": 215
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
+ "completion_length": 467.38953018188477,
646
  "epoch": 3.7846481876332625,
647
+ "grad_norm": 156.9326171875,
648
+ "kl": 2.8296875,
649
  "learning_rate": 2.4570139579284723e-08,
650
+ "loss": 0.0658,
651
+ "reward": 1.3386161386966706,
652
+ "reward_std": 0.6271648786962032,
653
+ "rewards/accuracy_reward": 0.5265625193715096,
654
+ "rewards/format_reward": 0.8120536088943482,
655
  "step": 220
656
  },
657
  {
658
  "clip_ratio": 0.0,
659
+ "completion_length": 462.0622955322266,
660
  "epoch": 3.8699360341151388,
661
+ "grad_norm": 81.24732208251953,
662
+ "kl": 2.6775390625,
663
  "learning_rate": 8.37579098581176e-09,
664
+ "loss": 0.0865,
665
+ "reward": 1.3390625447034836,
666
+ "reward_std": 0.6189317315816879,
667
+ "rewards/accuracy_reward": 0.5323660977184772,
668
+ "rewards/format_reward": 0.8066964671015739,
669
  "step": 225
670
  },
671
  {
672
  "clip_ratio": 0.0,
673
+ "completion_length": 470.36943969726565,
674
  "epoch": 3.955223880597015,
675
+ "grad_norm": 553.80859375,
676
+ "kl": 2.40078125,
677
  "learning_rate": 6.843232656998933e-10,
678
+ "loss": 0.053,
679
+ "reward": 1.3424107700586319,
680
+ "reward_std": 0.6173162661492825,
681
+ "rewards/accuracy_reward": 0.5263393089175225,
682
+ "rewards/format_reward": 0.8160714641213417,
683
  "step": 230
684
  },
685
  {
686
  "clip_ratio": 0.0,
687
+ "completion_length": 486.319974899292,
688
  "epoch": 3.9893390191897655,
689
+ "kl": 2.2841796875,
690
+ "reward": 1.3577009439468384,
691
+ "reward_std": 0.5995111986994743,
692
+ "rewards/accuracy_reward": 0.5401785913854837,
693
+ "rewards/format_reward": 0.8175223655998707,
694
  "step": 232,
695
  "total_flos": 0.0,
696
+ "train_loss": 2.3046215271270145,
697
+ "train_runtime": 39674.2598,
698
+ "train_samples_per_second": 0.756,
699
  "train_steps_per_second": 0.006
700
  }
701
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64c6f6630b8a5a88f7029bab8461886d206fb2c510193f119d0d3600ab77af8c
3
  size 7992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39e274769e16b9d724b7c7e093d76fc3e115e88841b6fcfd63b455cd60b72f8
3
  size 7992