chinmaydk99 committed
Commit 545f519 · verified · 1 Parent(s): 8572b05

Model save

Files changed (6)
  1. README.md +68 -0
  2. all_results.json +8 -0
  3. config.json +1 -1
  4. generation_config.json +14 -0
  5. train_results.json +8 -0
  6. trainer_state.json +1092 -0
README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: chinmaydk99/Qwen2.5-0.5B-Open-R1-Distill-Small
+ library_name: transformers
+ model_name: qwen2.5-0.5b-grpo-math
+ tags:
+ - generated_from_trainer
+ - trl
+ - grpo
+ licence: license
+ ---
+
+ # Model Card for qwen2.5-0.5b-grpo-math
+
+ This model is a fine-tuned version of [chinmaydk99/Qwen2.5-0.5B-Open-R1-Distill-Small](https://huggingface.co/chinmaydk99/Qwen2.5-0.5B-Open-R1-Distill-Small).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="chinmaydk99/qwen2.5-0.5b-grpo-math", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
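+
+ If you would rather load the model and tokenizer directly instead of going through `pipeline`, a minimal sketch along these lines should also work (the prompt string is illustrative, and the chat-template call assumes the tokenizer ships one, as Qwen2.5 tokenizers normally do):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "chinmaydk99/qwen2.5-0.5b-grpo-math"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+
+ # Format the question with the tokenizer's chat template.
+ messages = [{"role": "user", "content": "What is 17 * 24?"}]
+ input_ids = tokenizer.apply_chat_template(
+     messages, add_generation_prompt=True, return_tensors="pt"
+ ).to(model.device)
+
+ # Sampling defaults come from generation_config.json unless overridden here.
+ output_ids = model.generate(input_ids, max_new_tokens=256)
+ print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
+ ```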
+
+ ## Training procedure
+
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+
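+ The training script itself is not part of this commit. As a rough, hypothetical sketch of what a GRPO run with TRL 0.14 looks like (the dataset and the single reward function below are illustrative placeholders; the log in trainer_state.json suggests three reward functions, format_compliance_reward, math_reasoning_reward and solution_quality_reward, were combined):
+
+ ```python
+ from datasets import load_dataset
+ from trl import GRPOConfig, GRPOTrainer
+
+ # Hypothetical reward: assumes plain-text completions and favours boxed answers.
+ def solution_quality_reward(completions, **kwargs):
+     return [1.0 if "\\boxed{" in completion else 0.0 for completion in completions]
+
+ dataset = load_dataset("trl-lib/tldr", split="train")  # placeholder dataset, not the one used here
+
+ # logging_steps=2 and save_steps=25 match the values recorded in trainer_state.json.
+ training_args = GRPOConfig(output_dir="qwen2.5-0.5b-grpo-math", logging_steps=2, save_steps=25)
+ trainer = GRPOTrainer(
+     model="chinmaydk99/Qwen2.5-0.5B-Open-R1-Distill-Small",
+     reward_funcs=solution_quality_reward,
+     args=training_args,
+     train_dataset=dataset,
+ )
+ trainer.train()
+ ```
+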
+ ### Framework versions
+
+ - TRL: 0.14.0
+ - Transformers: 4.48.3
+ - Pytorch: 2.5.1+cu121
+ - Datasets: 3.2.0
+ - Tokenizers: 0.21.0
+
+ ## Citations
+
+ Cite GRPO as:
+
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.00017194549279035223,
+   "train_runtime": 4065.9132,
+   "train_samples": 45000,
+   "train_samples_per_second": 0.59,
+   "train_steps_per_second": 0.037
+ }
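
For scale, these figures are mutually consistent: 0.037 steps/s × 4065.9 s ≈ 150 optimizer steps, and 0.59 samples/s × 4065.9 s ≈ 2,400 prompts, roughly 5% of the 45,000 training samples, which matches the final epoch value of about 0.053 recorded in trainer_state.json below.
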
config.json CHANGED
@@ -23,7 +23,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.3",
- "use_cache": false,
+ "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token_id": 151643,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.1,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.48.3"
+ }
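
These are the decoding defaults that `model.generate()` picks up when the checkpoint is loaded from the Hub; a minimal sketch of inspecting them (the printed values are the ones in the file above):

```python
from transformers import GenerationConfig

# Load the saved decoding defaults independently of the model weights.
gen_config = GenerationConfig.from_pretrained("chinmaydk99/qwen2.5-0.5b-grpo-math")
print(gen_config.temperature, gen_config.top_p, gen_config.top_k)  # 0.7 0.8 20

# Per-call keyword arguments to model.generate() override these defaults.
```
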
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.00017194549279035223,
+   "train_runtime": 4065.9132,
+   "train_samples": 45000,
+   "train_samples_per_second": 0.59,
+   "train_steps_per_second": 0.037
+ }
trainer_state.json ADDED
@@ -0,0 +1,1092 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.05333333333333334,
5
+ "eval_steps": 500,
6
+ "global_step": 150,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 961.2890625,
13
+ "epoch": 0.0007111111111111111,
14
+ "grad_norm": 0.07362582076051967,
15
+ "kl": 0.0,
16
+ "learning_rate": 2e-07,
17
+ "loss": -0.0,
18
+ "reward": 0.09375000186264515,
19
+ "reward_std": 0.15193375945091248,
20
+ "rewards/format_compliance_reward": 0.0,
21
+ "rewards/math_reasoning_reward": 0.0,
22
+ "rewards/solution_quality_reward": 0.09375000186264515,
23
+ "step": 2
24
+ },
25
+ {
26
+ "completion_length": 939.3046875,
27
+ "epoch": 0.0014222222222222223,
28
+ "grad_norm": 0.0907152492976058,
29
+ "kl": 0.0003669261932373047,
30
+ "learning_rate": 4e-07,
31
+ "loss": 0.0,
32
+ "reward": 0.0875000013038516,
33
+ "reward_std": 0.14330127369612455,
34
+ "rewards/format_compliance_reward": 0.0,
35
+ "rewards/math_reasoning_reward": 0.0,
36
+ "rewards/solution_quality_reward": 0.0875000013038516,
37
+ "step": 4
38
+ },
39
+ {
40
+ "completion_length": 966.875,
41
+ "epoch": 0.0021333333333333334,
42
+ "grad_norm": 0.07758448496582468,
43
+ "kl": 0.00034809112548828125,
44
+ "learning_rate": 4.99941324504621e-07,
45
+ "loss": 0.0,
46
+ "reward": 0.08750000176951289,
47
+ "reward_std": 0.12500000186264515,
48
+ "rewards/format_compliance_reward": 0.0,
49
+ "rewards/math_reasoning_reward": 0.0,
50
+ "rewards/solution_quality_reward": 0.08750000176951289,
51
+ "step": 6
52
+ },
53
+ {
54
+ "completion_length": 974.5625,
55
+ "epoch": 0.0028444444444444446,
56
+ "grad_norm": 0.04999161599871498,
57
+ "kl": 0.00039124488830566406,
58
+ "learning_rate": 4.99472085783721e-07,
59
+ "loss": 0.0,
60
+ "reward": 0.031250000931322575,
61
+ "reward_std": 0.037500000558793545,
62
+ "rewards/format_compliance_reward": 0.0,
63
+ "rewards/math_reasoning_reward": 0.0,
64
+ "rewards/solution_quality_reward": 0.031250000931322575,
65
+ "step": 8
66
+ },
67
+ {
68
+ "completion_length": 988.71875,
69
+ "epoch": 0.0035555555555555557,
70
+ "grad_norm": 0.05986623117422814,
71
+ "kl": 0.00038301944732666016,
72
+ "learning_rate": 4.985344892885899e-07,
73
+ "loss": 0.0,
74
+ "reward": 0.06250000093132257,
75
+ "reward_std": 0.0933012729510665,
76
+ "rewards/format_compliance_reward": 0.0,
77
+ "rewards/math_reasoning_reward": 0.0,
78
+ "rewards/solution_quality_reward": 0.06250000093132257,
79
+ "step": 10
80
+ },
81
+ {
82
+ "completion_length": 951.15625,
83
+ "epoch": 0.004266666666666667,
84
+ "grad_norm": 0.0799364307268448,
85
+ "kl": 0.00038933753967285156,
86
+ "learning_rate": 4.971302952586796e-07,
87
+ "loss": 0.0,
88
+ "reward": 0.07500000158324838,
89
+ "reward_std": 0.12886751629412174,
90
+ "rewards/format_compliance_reward": 0.0,
91
+ "rewards/math_reasoning_reward": 0.0,
92
+ "rewards/solution_quality_reward": 0.07500000158324838,
93
+ "step": 12
94
+ },
95
+ {
96
+ "completion_length": 960.3046875,
97
+ "epoch": 0.004977777777777778,
98
+ "grad_norm": 0.03830275509956425,
99
+ "kl": 0.0003789663314819336,
100
+ "learning_rate": 4.952621399215597e-07,
101
+ "loss": 0.0,
102
+ "reward": 0.05000000074505806,
103
+ "reward_std": 0.0894337585195899,
104
+ "rewards/format_compliance_reward": 0.0,
105
+ "rewards/math_reasoning_reward": 0.0,
106
+ "rewards/solution_quality_reward": 0.05000000074505806,
107
+ "step": 14
108
+ },
109
+ {
110
+ "completion_length": 943.65625,
111
+ "epoch": 0.005688888888888889,
112
+ "grad_norm": 0.09243726858362067,
113
+ "kl": 0.00037360191345214844,
114
+ "learning_rate": 4.929335305436764e-07,
115
+ "loss": 0.0,
116
+ "reward": 0.12500000279396772,
117
+ "reward_std": 0.15773503109812737,
118
+ "rewards/format_compliance_reward": 0.0,
119
+ "rewards/math_reasoning_reward": 0.0,
120
+ "rewards/solution_quality_reward": 0.12500000279396772,
121
+ "step": 16
122
+ },
123
+ {
124
+ "completion_length": 991.3359375,
125
+ "epoch": 0.0064,
126
+ "grad_norm": 0.02502555687332951,
127
+ "kl": 0.0003898143768310547,
128
+ "learning_rate": 4.901488388458247e-07,
129
+ "loss": 0.0,
130
+ "reward": 0.0437500006519258,
131
+ "reward_std": 0.06636751536279917,
132
+ "rewards/format_compliance_reward": 0.0,
133
+ "rewards/math_reasoning_reward": 0.0,
134
+ "rewards/solution_quality_reward": 0.0437500006519258,
135
+ "step": 18
136
+ },
137
+ {
138
+ "completion_length": 946.1328125,
139
+ "epoch": 0.0071111111111111115,
140
+ "grad_norm": 0.09101166665795608,
141
+ "kl": 0.0004526376724243164,
142
+ "learning_rate": 4.869132927957006e-07,
143
+ "loss": 0.0,
144
+ "reward": 0.1171875037252903,
145
+ "reward_std": 0.1776762753725052,
146
+ "rewards/format_compliance_reward": 0.0,
147
+ "rewards/math_reasoning_reward": 0.0,
148
+ "rewards/solution_quality_reward": 0.1171875037252903,
149
+ "step": 20
150
+ },
151
+ {
152
+ "completion_length": 935.46875,
153
+ "epoch": 0.007822222222222222,
154
+ "grad_norm": 0.07270975069713734,
155
+ "kl": 0.0004680156707763672,
156
+ "learning_rate": 4.832329667929376e-07,
157
+ "loss": 0.0,
158
+ "reward": 0.08750000223517418,
159
+ "reward_std": 0.11443375889211893,
160
+ "rewards/format_compliance_reward": 0.0,
161
+ "rewards/math_reasoning_reward": 0.0,
162
+ "rewards/solution_quality_reward": 0.08750000223517418,
163
+ "step": 22
164
+ },
165
+ {
166
+ "completion_length": 949.0859375,
167
+ "epoch": 0.008533333333333334,
168
+ "grad_norm": 0.08038949844442059,
169
+ "kl": 0.0006613731384277344,
170
+ "learning_rate": 4.791147702650565e-07,
171
+ "loss": 0.0,
172
+ "reward": 0.11250000260770321,
173
+ "reward_std": 0.12886751629412174,
174
+ "rewards/format_compliance_reward": 0.0,
175
+ "rewards/math_reasoning_reward": 0.0,
176
+ "rewards/solution_quality_reward": 0.11250000260770321,
177
+ "step": 24
178
+ },
179
+ {
180
+ "completion_length": 956.25,
181
+ "epoch": 0.009244444444444444,
182
+ "grad_norm": 0.08202117227401906,
183
+ "kl": 0.0005040168762207031,
184
+ "learning_rate": 4.745664346957361e-07,
185
+ "loss": 0.0,
186
+ "reward": 0.09375000186264515,
187
+ "reward_std": 0.13466878794133663,
188
+ "rewards/format_compliance_reward": 0.0,
189
+ "rewards/math_reasoning_reward": 0.0,
190
+ "rewards/solution_quality_reward": 0.09375000186264515,
191
+ "step": 26
192
+ },
193
+ {
194
+ "completion_length": 975.703125,
195
+ "epoch": 0.009955555555555556,
196
+ "grad_norm": 0.052563124629516914,
197
+ "kl": 0.0009806156158447266,
198
+ "learning_rate": 4.695964991097616e-07,
199
+ "loss": 0.0,
200
+ "reward": 0.056250001303851604,
201
+ "reward_std": 0.07693375833332539,
202
+ "rewards/format_compliance_reward": 0.0,
203
+ "rewards/math_reasoning_reward": 0.0,
204
+ "rewards/solution_quality_reward": 0.056250001303851604,
205
+ "step": 28
206
+ },
207
+ {
208
+ "completion_length": 950.8515625,
209
+ "epoch": 0.010666666666666666,
210
+ "grad_norm": 0.07832441677596065,
211
+ "kl": 0.0008480548858642578,
212
+ "learning_rate": 4.642142940418973e-07,
213
+ "loss": 0.0,
214
+ "reward": 0.15000000316649675,
215
+ "reward_std": 0.20773503184318542,
216
+ "rewards/format_compliance_reward": 0.0,
217
+ "rewards/math_reasoning_reward": 0.0,
218
+ "rewards/solution_quality_reward": 0.15000000316649675,
219
+ "step": 30
220
+ },
221
+ {
222
+ "completion_length": 974.40625,
223
+ "epoch": 0.011377777777777778,
224
+ "grad_norm": 0.07055076490510688,
225
+ "kl": 0.0010929107666015625,
226
+ "learning_rate": 4.5842992401978256e-07,
227
+ "loss": 0.0,
228
+ "reward": 0.11250000167638063,
229
+ "reward_std": 0.15773503109812737,
230
+ "rewards/format_compliance_reward": 0.0,
231
+ "rewards/math_reasoning_reward": 0.0,
232
+ "rewards/solution_quality_reward": 0.11250000167638063,
233
+ "step": 32
234
+ },
235
+ {
236
+ "completion_length": 931.6015625,
237
+ "epoch": 0.012088888888888889,
238
+ "grad_norm": 0.11840439438100717,
239
+ "kl": 0.0014081001281738281,
240
+ "learning_rate": 4.5225424859373684e-07,
241
+ "loss": 0.0001,
242
+ "reward": 0.17812500521540642,
243
+ "reward_std": 0.22999473847448826,
244
+ "rewards/format_compliance_reward": 0.0,
245
+ "rewards/math_reasoning_reward": 0.0,
246
+ "rewards/solution_quality_reward": 0.17812500521540642,
247
+ "step": 34
248
+ },
249
+ {
250
+ "completion_length": 913.5078125,
251
+ "epoch": 0.0128,
252
+ "grad_norm": 0.09388094812115422,
253
+ "kl": 0.0025322437286376953,
254
+ "learning_rate": 4.456988619490889e-07,
255
+ "loss": 0.0001,
256
+ "reward": 0.218750003259629,
257
+ "reward_std": 0.19240381754934788,
258
+ "rewards/format_compliance_reward": 0.0,
259
+ "rewards/math_reasoning_reward": 0.0,
260
+ "rewards/solution_quality_reward": 0.218750003259629,
261
+ "step": 36
262
+ },
263
+ {
264
+ "completion_length": 942.7890625,
265
+ "epoch": 0.013511111111111111,
266
+ "grad_norm": 0.1071443097365241,
267
+ "kl": 0.0012729167938232422,
268
+ "learning_rate": 4.3877607113930516e-07,
269
+ "loss": 0.0001,
270
+ "reward": 0.1937500024214387,
271
+ "reward_std": 0.24523503240197897,
272
+ "rewards/format_compliance_reward": 0.0,
273
+ "rewards/math_reasoning_reward": 0.0,
274
+ "rewards/solution_quality_reward": 0.1937500024214387,
275
+ "step": 38
276
+ },
277
+ {
278
+ "completion_length": 932.3125,
279
+ "epoch": 0.014222222222222223,
280
+ "grad_norm": 0.09884048766059533,
281
+ "kl": 0.0025246143341064453,
282
+ "learning_rate": 4.314988729807827e-07,
283
+ "loss": 0.0001,
284
+ "reward": 0.2312500043772161,
285
+ "reward_std": 0.20966878905892372,
286
+ "rewards/format_compliance_reward": 0.0,
287
+ "rewards/math_reasoning_reward": 0.0,
288
+ "rewards/solution_quality_reward": 0.2312500043772161,
289
+ "step": 40
290
+ },
291
+ {
292
+ "completion_length": 983.9375,
293
+ "epoch": 0.014933333333333333,
294
+ "grad_norm": 0.06377692673539463,
295
+ "kl": 0.0017533302307128906,
296
+ "learning_rate": 4.238809296526846e-07,
297
+ "loss": 0.0001,
298
+ "reward": 0.15000000223517418,
299
+ "reward_std": 0.1933012744411826,
300
+ "rewards/format_compliance_reward": 0.0,
301
+ "rewards/math_reasoning_reward": 0.0,
302
+ "rewards/solution_quality_reward": 0.15000000223517418,
303
+ "step": 42
304
+ },
305
+ {
306
+ "completion_length": 934.265625,
307
+ "epoch": 0.015644444444444443,
308
+ "grad_norm": 0.10797227603988564,
309
+ "kl": 0.0016703605651855469,
310
+ "learning_rate": 4.159365430476261e-07,
311
+ "loss": 0.0001,
312
+ "reward": 0.26250000577419996,
313
+ "reward_std": 0.21830127481371164,
314
+ "rewards/format_compliance_reward": 0.0,
315
+ "rewards/math_reasoning_reward": 0.0,
316
+ "rewards/solution_quality_reward": 0.26250000204890966,
317
+ "step": 44
318
+ },
319
+ {
320
+ "completion_length": 888.1484375,
321
+ "epoch": 0.016355555555555557,
322
+ "grad_norm": 0.11888486637171698,
323
+ "kl": 0.0021381378173828125,
324
+ "learning_rate": 4.076806279213655e-07,
325
+ "loss": 0.0001,
326
+ "reward": 0.3250000048428774,
327
+ "reward_std": 0.2577350325882435,
328
+ "rewards/format_compliance_reward": 0.0,
329
+ "rewards/math_reasoning_reward": 0.0,
330
+ "rewards/solution_quality_reward": 0.3250000048428774,
331
+ "step": 46
332
+ },
333
+ {
334
+ "completion_length": 844.6484375,
335
+ "epoch": 0.017066666666666667,
336
+ "grad_norm": 0.10144889612555925,
337
+ "kl": 0.002949237823486328,
338
+ "learning_rate": 3.991286838919086e-07,
339
+ "loss": 0.0001,
340
+ "reward": 0.3921875115483999,
341
+ "reward_std": 0.2589998487383127,
342
+ "rewards/format_compliance_reward": 0.0,
343
+ "rewards/math_reasoning_reward": 0.0,
344
+ "rewards/solution_quality_reward": 0.3921875115483999,
345
+ "step": 48
346
+ },
347
+ {
348
+ "completion_length": 862.4296875,
349
+ "epoch": 0.017777777777777778,
350
+ "grad_norm": 0.09453500886940876,
351
+ "kl": 0.0031652450561523438,
352
+ "learning_rate": 3.902967663405956e-07,
353
+ "loss": 0.0001,
354
+ "reward": 0.36250000912696123,
355
+ "reward_std": 0.24047006107866764,
356
+ "rewards/format_compliance_reward": 0.0,
357
+ "rewards/math_reasoning_reward": 0.0,
358
+ "rewards/solution_quality_reward": 0.36250000912696123,
359
+ "step": 50
360
+ },
361
+ {
362
+ "completion_length": 905.5,
363
+ "epoch": 0.018488888888888888,
364
+ "grad_norm": 0.10133881191260517,
365
+ "kl": 0.0029811859130859375,
366
+ "learning_rate": 3.8120145626980015e-07,
367
+ "loss": 0.0001,
368
+ "reward": 0.32031250838190317,
369
+ "reward_std": 0.22642100881785154,
370
+ "rewards/format_compliance_reward": 0.0,
371
+ "rewards/math_reasoning_reward": 0.0,
372
+ "rewards/solution_quality_reward": 0.3203125046566129,
373
+ "step": 52
374
+ },
375
+ {
376
+ "completion_length": 920.1953125,
377
+ "epoch": 0.0192,
378
+ "grad_norm": 0.10154509040958454,
379
+ "kl": 0.0032434463500976562,
380
+ "learning_rate": 3.718598291738298e-07,
381
+ "loss": 0.0001,
382
+ "reward": 0.30781250540167093,
383
+ "reward_std": 0.2779353503137827,
384
+ "rewards/format_compliance_reward": 0.0,
385
+ "rewards/math_reasoning_reward": 0.0,
386
+ "rewards/solution_quality_reward": 0.30781250540167093,
387
+ "step": 54
388
+ },
389
+ {
390
+ "completion_length": 893.3515625,
391
+ "epoch": 0.019911111111111112,
392
+ "grad_norm": 0.10046839615716938,
393
+ "kl": 0.0037431716918945312,
394
+ "learning_rate": 3.622894229814698e-07,
395
+ "loss": 0.0001,
396
+ "reward": 0.3812500098720193,
397
+ "reward_std": 0.2846687901765108,
398
+ "rewards/format_compliance_reward": 0.0,
399
+ "rewards/math_reasoning_reward": 0.0,
400
+ "rewards/solution_quality_reward": 0.3812500098720193,
401
+ "step": 56
402
+ },
403
+ {
404
+ "completion_length": 905.0546875,
405
+ "epoch": 0.020622222222222222,
406
+ "grad_norm": 0.0961680067295368,
407
+ "kl": 0.004122734069824219,
408
+ "learning_rate": 3.52508205130354e-07,
409
+ "loss": 0.0002,
410
+ "reward": 0.3875000085681677,
411
+ "reward_std": 0.2616025470197201,
412
+ "rewards/format_compliance_reward": 0.0,
413
+ "rewards/math_reasoning_reward": 0.0,
414
+ "rewards/solution_quality_reward": 0.3875000048428774,
415
+ "step": 58
416
+ },
417
+ {
418
+ "completion_length": 871.7890625,
419
+ "epoch": 0.021333333333333333,
420
+ "grad_norm": 0.08426575372485079,
421
+ "kl": 0.0044574737548828125,
422
+ "learning_rate": 3.4253453883497864e-07,
423
+ "loss": 0.0002,
424
+ "reward": 0.4125000177882612,
425
+ "reward_std": 0.25103630404919386,
426
+ "rewards/format_compliance_reward": 0.0,
427
+ "rewards/math_reasoning_reward": 0.0,
428
+ "rewards/solution_quality_reward": 0.4125000028871,
429
+ "step": 60
430
+ },
431
+ {
432
+ "completion_length": 901.296875,
433
+ "epoch": 0.022044444444444443,
434
+ "grad_norm": 0.09601873424860208,
435
+ "kl": 0.004614830017089844,
436
+ "learning_rate": 3.323871486116851e-07,
437
+ "loss": 0.0002,
438
+ "reward": 0.3750000107102096,
439
+ "reward_std": 0.2943375762552023,
440
+ "rewards/format_compliance_reward": 0.0,
441
+ "rewards/math_reasoning_reward": 0.0,
442
+ "rewards/solution_quality_reward": 0.3750000107102096,
443
+ "step": 62
444
+ },
445
+ {
446
+ "completion_length": 876.109375,
447
+ "epoch": 0.022755555555555557,
448
+ "grad_norm": 0.07468416170263495,
449
+ "kl": 0.004523277282714844,
450
+ "learning_rate": 3.220850851253377e-07,
451
+ "loss": 0.0002,
452
+ "reward": 0.37187500670552254,
453
+ "reward_std": 0.20931097934953868,
454
+ "rewards/format_compliance_reward": 0.0,
455
+ "rewards/math_reasoning_reward": 0.0,
456
+ "rewards/solution_quality_reward": 0.37187500670552254,
457
+ "step": 64
458
+ },
459
+ {
460
+ "completion_length": 876.34375,
461
+ "epoch": 0.023466666666666667,
462
+ "grad_norm": 0.10975736428011727,
463
+ "kl": 0.005245208740234375,
464
+ "learning_rate": 3.1164768942369053e-07,
465
+ "loss": 0.0002,
466
+ "reward": 0.42343751108273864,
467
+ "reward_std": 0.2873450629413128,
468
+ "rewards/format_compliance_reward": 0.0,
469
+ "rewards/math_reasoning_reward": 0.0,
470
+ "rewards/solution_quality_reward": 0.42343750735744834,
471
+ "step": 66
472
+ },
473
+ {
474
+ "completion_length": 846.3671875,
475
+ "epoch": 0.024177777777777777,
476
+ "grad_norm": 0.10665048077601928,
477
+ "kl": 0.0054931640625,
478
+ "learning_rate": 3.010945566265912e-07,
479
+ "loss": 0.0002,
480
+ "reward": 0.4234375124797225,
481
+ "reward_std": 0.2752937898039818,
482
+ "rewards/format_compliance_reward": 0.0,
483
+ "rewards/math_reasoning_reward": 0.0,
484
+ "rewards/solution_quality_reward": 0.42343750689178705,
485
+ "step": 68
486
+ },
487
+ {
488
+ "completion_length": 829.9609375,
489
+ "epoch": 0.024888888888888887,
490
+ "grad_norm": 0.08361928294620831,
491
+ "kl": 0.005539894104003906,
492
+ "learning_rate": 2.9044549913819124e-07,
493
+ "loss": 0.0002,
494
+ "reward": 0.48750001983717084,
495
+ "reward_std": 0.2154700607061386,
496
+ "rewards/format_compliance_reward": 0.0,
497
+ "rewards/math_reasoning_reward": 0.0,
498
+ "rewards/solution_quality_reward": 0.48750000493600965,
499
+ "step": 70
500
+ },
501
+ {
502
+ "completion_length": 858.71875,
503
+ "epoch": 0.0256,
504
+ "grad_norm": 0.07638495432759437,
505
+ "kl": 0.005488395690917969,
506
+ "learning_rate": 2.797205094512266e-07,
507
+ "loss": 0.0002,
508
+ "reward": 0.45000001741573215,
509
+ "reward_std": 0.21160254627466202,
510
+ "rewards/format_compliance_reward": 0.0,
511
+ "rewards/math_reasoning_reward": 0.0,
512
+ "rewards/solution_quality_reward": 0.45000000623986125,
513
+ "step": 72
514
+ },
515
+ {
516
+ "completion_length": 864.84375,
517
+ "epoch": 0.02631111111111111,
518
+ "grad_norm": 0.10096698237334256,
519
+ "kl": 0.0058040618896484375,
520
+ "learning_rate": 2.6893972261320264e-07,
521
+ "loss": 0.0002,
522
+ "reward": 0.4000000124797225,
523
+ "reward_std": 0.2827350329607725,
524
+ "rewards/format_compliance_reward": 0.0,
525
+ "rewards/math_reasoning_reward": 0.0,
526
+ "rewards/solution_quality_reward": 0.4000000087544322,
527
+ "step": 74
528
+ },
529
+ {
530
+ "completion_length": 907.4296875,
531
+ "epoch": 0.027022222222222222,
532
+ "grad_norm": 0.08728011123913502,
533
+ "kl": 0.005370140075683594,
534
+ "learning_rate": 2.5812337842494516e-07,
535
+ "loss": 0.0002,
536
+ "reward": 0.42500000866129994,
537
+ "reward_std": 0.24433757551014423,
538
+ "rewards/format_compliance_reward": 0.0,
539
+ "rewards/math_reasoning_reward": 0.0,
540
+ "rewards/solution_quality_reward": 0.42500000493600965,
541
+ "step": 76
542
+ },
543
+ {
544
+ "completion_length": 817.8828125,
545
+ "epoch": 0.027733333333333332,
546
+ "grad_norm": 0.09172512669808838,
547
+ "kl": 0.0060749053955078125,
548
+ "learning_rate": 2.4729178344249006e-07,
549
+ "loss": 0.0002,
550
+ "reward": 0.487500018440187,
551
+ "reward_std": 0.22603630367666483,
552
+ "rewards/format_compliance_reward": 0.0,
553
+ "rewards/math_reasoning_reward": 0.0,
554
+ "rewards/solution_quality_reward": 0.4875000109896064,
555
+ "step": 78
556
+ },
557
+ {
558
+ "completion_length": 811.90625,
559
+ "epoch": 0.028444444444444446,
560
+ "grad_norm": 0.087230141698115,
561
+ "kl": 0.0064601898193359375,
562
+ "learning_rate": 2.3646527285364563e-07,
563
+ "loss": 0.0003,
564
+ "reward": 0.5500000175088644,
565
+ "reward_std": 0.22216878924518824,
566
+ "rewards/format_compliance_reward": 0.0,
567
+ "rewards/math_reasoning_reward": 0.0,
568
+ "rewards/solution_quality_reward": 0.5500000100582838,
569
+ "step": 80
570
+ },
571
+ {
572
+ "completion_length": 820.3125,
573
+ "epoch": 0.029155555555555556,
574
+ "grad_norm": 0.0917309701218231,
575
+ "kl": 0.0067539215087890625,
576
+ "learning_rate": 2.256641723008026e-07,
577
+ "loss": 0.0003,
578
+ "reward": 0.4890625160187483,
579
+ "reward_std": 0.23515222850255668,
580
+ "rewards/format_compliance_reward": 0.0,
581
+ "rewards/math_reasoning_reward": 0.0,
582
+ "rewards/solution_quality_reward": 0.489062512293458,
583
+ "step": 82
584
+ },
585
+ {
586
+ "completion_length": 875.1953125,
587
+ "epoch": 0.029866666666666666,
588
+ "grad_norm": 0.05629416236953807,
589
+ "kl": 0.005397796630859375,
590
+ "learning_rate": 2.1490875972166393e-07,
591
+ "loss": 0.0002,
592
+ "reward": 0.5187500147148967,
593
+ "reward_std": 0.1991025460883975,
594
+ "rewards/format_compliance_reward": 0.0,
595
+ "rewards/math_reasoning_reward": 0.0,
596
+ "rewards/solution_quality_reward": 0.5187500072643161,
597
+ "step": 84
598
+ },
599
+ {
600
+ "completion_length": 853.609375,
601
+ "epoch": 0.030577777777777777,
602
+ "grad_norm": 0.08222448453561998,
603
+ "kl": 0.0061321258544921875,
604
+ "learning_rate": 2.0421922727953595e-07,
605
+ "loss": 0.0002,
606
+ "reward": 0.49843751173466444,
607
+ "reward_std": 0.24561973754316568,
608
+ "rewards/format_compliance_reward": 0.0,
609
+ "rewards/math_reasoning_reward": 0.0,
610
+ "rewards/solution_quality_reward": 0.498437506146729,
611
+ "step": 86
612
+ },
613
+ {
614
+ "completion_length": 848.15625,
615
+ "epoch": 0.03128888888888889,
616
+ "grad_norm": 0.0924588566987047,
617
+ "kl": 0.0057964324951171875,
618
+ "learning_rate": 1.9361564345465145e-07,
619
+ "loss": 0.0002,
620
+ "reward": 0.49375001480802894,
621
+ "reward_std": 0.2741025472059846,
622
+ "rewards/format_compliance_reward": 0.0,
623
+ "rewards/math_reasoning_reward": 0.0,
624
+ "rewards/solution_quality_reward": 0.49375001108273864,
625
+ "step": 88
626
+ },
627
+ {
628
+ "completion_length": 850.2109375,
629
+ "epoch": 0.032,
630
+ "grad_norm": 0.09370433522264131,
631
+ "kl": 0.005938529968261719,
632
+ "learning_rate": 1.8311791536769483e-07,
633
+ "loss": 0.0002,
634
+ "reward": 0.4875000179745257,
635
+ "reward_std": 0.25490381848067045,
636
+ "rewards/format_compliance_reward": 0.0,
637
+ "rewards/math_reasoning_reward": 0.0,
638
+ "rewards/solution_quality_reward": 0.4875000105239451,
639
+ "step": 90
640
+ },
641
+ {
642
+ "completion_length": 823.2890625,
643
+ "epoch": 0.032711111111111114,
644
+ "grad_norm": 0.10865632851826816,
645
+ "kl": 0.0072078704833984375,
646
+ "learning_rate": 1.7274575140626315e-07,
647
+ "loss": 0.0003,
648
+ "reward": 0.47968751238659024,
649
+ "reward_std": 0.2377937890123576,
650
+ "rewards/format_compliance_reward": 0.0,
651
+ "rewards/math_reasoning_reward": 0.0,
652
+ "rewards/solution_quality_reward": 0.47968750866129994,
653
+ "step": 92
654
+ },
655
+ {
656
+ "completion_length": 817.4140625,
657
+ "epoch": 0.03342222222222222,
658
+ "grad_norm": 0.08418982535916476,
659
+ "kl": 0.0056285858154296875,
660
+ "learning_rate": 1.6251862422442788e-07,
661
+ "loss": 0.0002,
662
+ "reward": 0.5242187650874257,
663
+ "reward_std": 0.25055886153131723,
664
+ "rewards/format_compliance_reward": 0.0,
665
+ "rewards/math_reasoning_reward": 0.0054687499068677425,
666
+ "rewards/solution_quality_reward": 0.5187500091269612,
667
+ "step": 94
668
+ },
669
+ {
670
+ "completion_length": 832.8984375,
671
+ "epoch": 0.034133333333333335,
672
+ "grad_norm": 0.08609457505525932,
673
+ "kl": 0.0064220428466796875,
674
+ "learning_rate": 1.5245573418486135e-07,
675
+ "loss": 0.0003,
676
+ "reward": 0.498437509406358,
677
+ "reward_std": 0.19642627402208745,
678
+ "rewards/format_compliance_reward": 0.0,
679
+ "rewards/math_reasoning_reward": 0.0,
680
+ "rewards/solution_quality_reward": 0.498437509406358,
681
+ "step": 96
682
+ },
683
+ {
684
+ "completion_length": 834.265625,
685
+ "epoch": 0.03484444444444444,
686
+ "grad_norm": 0.11364212512648743,
687
+ "kl": 0.0072803497314453125,
688
+ "learning_rate": 1.4257597331216208e-07,
689
+ "loss": 0.0003,
690
+ "reward": 0.4625000134110451,
691
+ "reward_std": 0.25401466409675777,
692
+ "rewards/format_compliance_reward": 0.0,
693
+ "rewards/math_reasoning_reward": 0.0,
694
+ "rewards/solution_quality_reward": 0.4625000096857548,
695
+ "step": 98
696
+ },
697
+ {
698
+ "completion_length": 857.9921875,
699
+ "epoch": 0.035555555555555556,
700
+ "grad_norm": 0.07970778178862901,
701
+ "kl": 0.005957603454589844,
702
+ "learning_rate": 1.328978898250525e-07,
703
+ "loss": 0.0002,
704
+ "reward": 0.49140626657754183,
705
+ "reward_std": 0.256591965444386,
706
+ "rewards/format_compliance_reward": 0.00390625,
707
+ "rewards/math_reasoning_reward": 0.0,
708
+ "rewards/solution_quality_reward": 0.48750000540167093,
709
+ "step": 100
710
+ },
711
+ {
712
+ "completion_length": 850.5234375,
713
+ "epoch": 0.03626666666666667,
714
+ "grad_norm": 0.0806380302022146,
715
+ "kl": 0.0061206817626953125,
716
+ "learning_rate": 1.234396533140365e-07,
717
+ "loss": 0.0002,
718
+ "reward": 0.5625000167638063,
719
+ "reward_std": 0.22216878924518824,
720
+ "rewards/format_compliance_reward": 0.0,
721
+ "rewards/math_reasoning_reward": 0.0,
722
+ "rewards/solution_quality_reward": 0.5625000093132257,
723
+ "step": 102
724
+ },
725
+ {
726
+ "completion_length": 873.421875,
727
+ "epoch": 0.036977777777777776,
728
+ "grad_norm": 0.08485517408265005,
729
+ "kl": 0.00583648681640625,
730
+ "learning_rate": 1.1421902062989178e-07,
731
+ "loss": 0.0002,
732
+ "reward": 0.5046875160187483,
733
+ "reward_std": 0.22255349438637495,
734
+ "rewards/format_compliance_reward": 0.0,
735
+ "rewards/math_reasoning_reward": 0.0,
736
+ "rewards/solution_quality_reward": 0.5046875067055225,
737
+ "step": 104
738
+ },
739
+ {
740
+ "completion_length": 868.8515625,
741
+ "epoch": 0.03768888888888889,
742
+ "grad_norm": 0.08212512079305459,
743
+ "kl": 0.005665779113769531,
744
+ "learning_rate": 1.0525330254703788e-07,
745
+ "loss": 0.0002,
746
+ "reward": 0.4375000107102096,
747
+ "reward_std": 0.22603630367666483,
748
+ "rewards/format_compliance_reward": 0.0,
749
+ "rewards/math_reasoning_reward": 0.0,
750
+ "rewards/solution_quality_reward": 0.4375000107102096,
751
+ "step": 106
752
+ },
753
+ {
754
+ "completion_length": 853.9140625,
755
+ "epoch": 0.0384,
756
+ "grad_norm": 0.08994900430130048,
757
+ "kl": 0.00644683837890625,
758
+ "learning_rate": 9.655933126436563e-08,
759
+ "loss": 0.0003,
760
+ "reward": 0.4796875203028321,
761
+ "reward_std": 0.21711003221571445,
762
+ "rewards/format_compliance_reward": 0.0,
763
+ "rewards/math_reasoning_reward": 0.0,
764
+ "rewards/solution_quality_reward": 0.4796875109896064,
765
+ "step": 108
766
+ },
767
+ {
768
+ "completion_length": 825.1328125,
769
+ "epoch": 0.03911111111111111,
770
+ "grad_norm": 0.07289579376020662,
771
+ "kl": 0.006561279296875,
772
+ "learning_rate": 8.81534288045431e-08,
773
+ "loss": 0.0003,
774
+ "reward": 0.5171875199303031,
775
+ "reward_std": 0.20899984799325466,
776
+ "rewards/format_compliance_reward": 0.0,
777
+ "rewards/math_reasoning_reward": 0.0,
778
+ "rewards/solution_quality_reward": 0.5171875124797225,
779
+ "step": 110
780
+ },
781
+ {
782
+ "completion_length": 841.34375,
783
+ "epoch": 0.039822222222222224,
784
+ "grad_norm": 0.09488084966315946,
785
+ "kl": 0.006694793701171875,
786
+ "learning_rate": 8.005137637112302e-08,
787
+ "loss": 0.0003,
788
+ "reward": 0.5093750152736902,
789
+ "reward_std": 0.28519109170883894,
790
+ "rewards/format_compliance_reward": 0.0,
791
+ "rewards/math_reasoning_reward": 0.0,
792
+ "rewards/solution_quality_reward": 0.5093750134110451,
793
+ "step": 112
794
+ },
795
+ {
796
+ "completion_length": 872.6796875,
797
+ "epoch": 0.04053333333333333,
798
+ "grad_norm": 0.09567296035443791,
799
+ "kl": 0.0057201385498046875,
800
+ "learning_rate": 7.226838472098237e-08,
801
+ "loss": 0.0002,
802
+ "reward": 0.47968751238659024,
803
+ "reward_std": 0.2666613038163632,
804
+ "rewards/format_compliance_reward": 0.0,
805
+ "rewards/math_reasoning_reward": 0.0,
806
+ "rewards/solution_quality_reward": 0.47968750493600965,
807
+ "step": 114
808
+ },
809
+ {
810
+ "completion_length": 813.8828125,
811
+ "epoch": 0.041244444444444445,
812
+ "grad_norm": 0.10820746157768055,
813
+ "kl": 0.007022857666015625,
814
+ "learning_rate": 6.481906560771524e-08,
815
+ "loss": 0.0003,
816
+ "reward": 0.5648437645286322,
817
+ "reward_std": 0.24614199809730053,
818
+ "rewards/format_compliance_reward": 0.0,
819
+ "rewards/math_reasoning_reward": 0.01015624962747097,
820
+ "rewards/solution_quality_reward": 0.5546875093132257,
821
+ "step": 116
822
+ },
823
+ {
824
+ "completion_length": 767.515625,
825
+ "epoch": 0.04195555555555556,
826
+ "grad_norm": 0.08997306133980622,
827
+ "kl": 0.0065708160400390625,
828
+ "learning_rate": 5.771740434959277e-08,
829
+ "loss": 0.0003,
830
+ "reward": 0.6203125147148967,
831
+ "reward_std": 0.21331608993932605,
832
+ "rewards/format_compliance_reward": 0.0,
833
+ "rewards/math_reasoning_reward": 0.0,
834
+ "rewards/solution_quality_reward": 0.6203125072643161,
835
+ "step": 118
836
+ },
837
+ {
838
+ "completion_length": 873.734375,
839
+ "epoch": 0.042666666666666665,
840
+ "grad_norm": 0.08100641846157194,
841
+ "kl": 0.006244659423828125,
842
+ "learning_rate": 5.097673357358906e-08,
843
+ "loss": 0.0002,
844
+ "reward": 0.40000000363215804,
845
+ "reward_std": 0.2500735791400075,
846
+ "rewards/format_compliance_reward": 0.0,
847
+ "rewards/math_reasoning_reward": 0.0,
848
+ "rewards/solution_quality_reward": 0.40000000363215804,
849
+ "step": 120
850
+ },
851
+ {
852
+ "completion_length": 792.8515625,
853
+ "epoch": 0.04337777777777778,
854
+ "grad_norm": 0.09791409622965977,
855
+ "kl": 0.0066165924072265625,
856
+ "learning_rate": 4.460970818476717e-08,
857
+ "loss": 0.0003,
858
+ "reward": 0.5750000160187483,
859
+ "reward_std": 0.23660254664719105,
860
+ "rewards/format_compliance_reward": 0.0,
861
+ "rewards/math_reasoning_reward": 0.0,
862
+ "rewards/solution_quality_reward": 0.5750000085681677,
863
+ "step": 122
864
+ },
865
+ {
866
+ "completion_length": 801.453125,
867
+ "epoch": 0.044088888888888886,
868
+ "grad_norm": 0.08071603466853912,
869
+ "kl": 0.005702972412109375,
870
+ "learning_rate": 3.8628281608017065e-08,
871
+ "loss": 0.0002,
872
+ "reward": 0.5296875154599547,
873
+ "reward_std": 0.16949251643382013,
874
+ "rewards/format_compliance_reward": 0.0,
875
+ "rewards/math_reasoning_reward": 0.0,
876
+ "rewards/solution_quality_reward": 0.5296875117346644,
877
+ "step": 124
878
+ },
879
+ {
880
+ "completion_length": 837.828125,
881
+ "epoch": 0.0448,
882
+ "grad_norm": 0.08788137824409525,
883
+ "kl": 0.0059604644775390625,
884
+ "learning_rate": 3.304368334674965e-08,
885
+ "loss": 0.0002,
886
+ "reward": 0.481250012293458,
887
+ "reward_std": 0.24910254683345556,
888
+ "rewards/format_compliance_reward": 0.0,
889
+ "rewards/math_reasoning_reward": 0.0,
890
+ "rewards/solution_quality_reward": 0.4812500048428774,
891
+ "step": 126
892
+ },
893
+ {
894
+ "completion_length": 821.6015625,
895
+ "epoch": 0.04551111111111111,
896
+ "grad_norm": 0.09054496411632566,
897
+ "kl": 0.006633758544921875,
898
+ "learning_rate": 2.7866397900677185e-08,
899
+ "loss": 0.0003,
900
+ "reward": 0.5437500169500709,
901
+ "reward_std": 0.22410254646092653,
902
+ "rewards/format_compliance_reward": 0.0,
903
+ "rewards/math_reasoning_reward": 0.0,
904
+ "rewards/solution_quality_reward": 0.5437500094994903,
905
+ "step": 128
906
+ },
907
+ {
908
+ "completion_length": 853.3046875,
909
+ "epoch": 0.04622222222222222,
910
+ "grad_norm": 0.08754779746234931,
911
+ "kl": 0.006439208984375,
912
+ "learning_rate": 2.3106145082260774e-08,
913
+ "loss": 0.0003,
914
+ "reward": 0.48125001415610313,
915
+ "reward_std": 0.23466878943145275,
916
+ "rewards/format_compliance_reward": 0.0,
917
+ "rewards/math_reasoning_reward": 0.0,
918
+ "rewards/solution_quality_reward": 0.48125000670552254,
919
+ "step": 130
920
+ },
921
+ {
922
+ "completion_length": 828.0703125,
923
+ "epoch": 0.046933333333333334,
924
+ "grad_norm": 0.09068860656569837,
925
+ "kl": 0.005916595458984375,
926
+ "learning_rate": 1.877186176877779e-08,
927
+ "loss": 0.0002,
928
+ "reward": 0.4781250171363354,
929
+ "reward_std": 0.15580127481371164,
930
+ "rewards/format_compliance_reward": 0.0,
931
+ "rewards/math_reasoning_reward": 0.0,
932
+ "rewards/solution_quality_reward": 0.4781250134110451,
933
+ "step": 132
934
+ },
935
+ {
936
+ "completion_length": 807.2109375,
937
+ "epoch": 0.04764444444444445,
938
+ "grad_norm": 0.0940658478261923,
939
+ "kl": 0.006163597106933594,
940
+ "learning_rate": 1.4871685124269007e-08,
941
+ "loss": 0.0002,
942
+ "reward": 0.48593751387670636,
943
+ "reward_std": 0.23586003202944994,
944
+ "rewards/format_compliance_reward": 0.0,
945
+ "rewards/math_reasoning_reward": 0.0,
946
+ "rewards/solution_quality_reward": 0.48593751015141606,
947
+ "step": 134
948
+ },
949
+ {
950
+ "completion_length": 850.7578125,
951
+ "epoch": 0.048355555555555554,
952
+ "grad_norm": 0.09234689786610698,
953
+ "kl": 0.006160736083984375,
954
+ "learning_rate": 1.141293732286297e-08,
955
+ "loss": 0.0002,
956
+ "reward": 0.4687500139698386,
957
+ "reward_std": 0.2885363046079874,
958
+ "rewards/format_compliance_reward": 0.0,
959
+ "rewards/math_reasoning_reward": 0.0,
960
+ "rewards/solution_quality_reward": 0.4687500102445483,
961
+ "step": 136
962
+ },
963
+ {
964
+ "completion_length": 836.015625,
965
+ "epoch": 0.04906666666666667,
966
+ "grad_norm": 0.08756107592003563,
967
+ "kl": 0.0068187713623046875,
968
+ "learning_rate": 8.402111802159412e-09,
969
+ "loss": 0.0003,
970
+ "reward": 0.526562511920929,
971
+ "reward_std": 0.23564936965703964,
972
+ "rewards/format_compliance_reward": 0.0,
973
+ "rewards/math_reasoning_reward": 0.0,
974
+ "rewards/solution_quality_reward": 0.5265625081956387,
975
+ "step": 138
976
+ },
977
+ {
978
+ "completion_length": 815.0625,
979
+ "epoch": 0.049777777777777775,
980
+ "grad_norm": 0.07921211552971878,
981
+ "kl": 0.0070705413818359375,
982
+ "learning_rate": 5.844861072478335e-09,
983
+ "loss": 0.0003,
984
+ "reward": 0.5671875206753612,
985
+ "reward_std": 0.24085476621985435,
986
+ "rewards/format_compliance_reward": 0.0,
987
+ "rewards/math_reasoning_reward": 0.0,
988
+ "rewards/solution_quality_reward": 0.5671875076368451,
989
+ "step": 140
990
+ },
991
+ {
992
+ "completion_length": 817.890625,
993
+ "epoch": 0.05048888888888889,
994
+ "grad_norm": 0.08609005306539234,
995
+ "kl": 0.006622314453125,
996
+ "learning_rate": 3.745986104862903e-09,
997
+ "loss": 0.0003,
998
+ "reward": 0.5875000208616257,
999
+ "reward_std": 0.2721687899902463,
1000
+ "rewards/format_compliance_reward": 0.0,
1001
+ "rewards/math_reasoning_reward": 0.0,
1002
+ "rewards/solution_quality_reward": 0.5875000134110451,
1003
+ "step": 142
1004
+ },
1005
+ {
1006
+ "completion_length": 818.9375,
1007
+ "epoch": 0.0512,
1008
+ "grad_norm": 0.06621426959758697,
1009
+ "kl": 0.0066070556640625,
1010
+ "learning_rate": 2.1094273177576505e-09,
1011
+ "loss": 0.0003,
1012
+ "reward": 0.4500000085681677,
1013
+ "reward_std": 0.1933012744411826,
1014
+ "rewards/format_compliance_reward": 0.0,
1015
+ "rewards/math_reasoning_reward": 0.0,
1016
+ "rewards/solution_quality_reward": 0.4500000085681677,
1017
+ "step": 144
1018
+ },
1019
+ {
1020
+ "completion_length": 841.125,
1021
+ "epoch": 0.05191111111111111,
1022
+ "grad_norm": 0.10016409872935261,
1023
+ "kl": 0.0057544708251953125,
1024
+ "learning_rate": 9.38257179284696e-10,
1025
+ "loss": 0.0002,
1026
+ "reward": 0.5109375165775418,
1027
+ "reward_std": 0.2147275460883975,
1028
+ "rewards/format_compliance_reward": 0.0,
1029
+ "rewards/math_reasoning_reward": 0.0,
1030
+ "rewards/solution_quality_reward": 0.5109375016763806,
1031
+ "step": 146
1032
+ },
1033
+ {
1034
+ "completion_length": 822.9921875,
1035
+ "epoch": 0.05262222222222222,
1036
+ "grad_norm": 0.08984285169471264,
1037
+ "kl": 0.0066013336181640625,
1038
+ "learning_rate": 2.3467443900582197e-10,
1039
+ "loss": 0.0003,
1040
+ "reward": 0.517187518067658,
1041
+ "reward_std": 0.2522275464143604,
1042
+ "rewards/format_compliance_reward": 0.0,
1043
+ "rewards/math_reasoning_reward": 0.0,
1044
+ "rewards/solution_quality_reward": 0.517187506891787,
1045
+ "step": 148
1046
+ },
1047
+ {
1048
+ "completion_length": 772.4453125,
1049
+ "epoch": 0.05333333333333334,
1050
+ "grad_norm": 0.09806696754810183,
1051
+ "kl": 0.007259368896484375,
1052
+ "learning_rate": 0.0,
1053
+ "loss": 0.0003,
1054
+ "reward": 0.5906250197440386,
1055
+ "reward_std": 0.23228630423545837,
1056
+ "rewards/format_compliance_reward": 0.00390625,
1057
+ "rewards/math_reasoning_reward": 0.0054687499068677425,
1058
+ "rewards/solution_quality_reward": 0.5812500100582838,
1059
+ "step": 150
1060
+ },
1061
+ {
1062
+ "epoch": 0.05333333333333334,
1063
+ "step": 150,
1064
+ "total_flos": 0.0,
1065
+ "train_loss": 0.00017194549279035223,
1066
+ "train_runtime": 4065.9132,
1067
+ "train_samples_per_second": 0.59,
1068
+ "train_steps_per_second": 0.037
1069
+ }
1070
+ ],
1071
+ "logging_steps": 2,
1072
+ "max_steps": 150,
1073
+ "num_input_tokens_seen": 0,
1074
+ "num_train_epochs": 1,
1075
+ "save_steps": 25,
1076
+ "stateful_callbacks": {
1077
+ "TrainerControl": {
1078
+ "args": {
1079
+ "should_epoch_stop": false,
1080
+ "should_evaluate": false,
1081
+ "should_log": false,
1082
+ "should_save": true,
1083
+ "should_training_stop": true
1084
+ },
1085
+ "attributes": {}
1086
+ }
1087
+ },
1088
+ "total_flos": 0.0,
1089
+ "train_batch_size": 1,
1090
+ "trial_name": null,
1091
+ "trial_params": null
1092
+ }