Bradley committed on
Commit
c0e9192
·
verified ·
1 Parent(s): 2d7ed1d

Model save

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Math-1.5B
3
+ library_name: transformers
4
+ model_name: Qwen-2.5-7B-Simple-RL
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen-2.5-7B-Simple-RL
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B). Note that, despite the "7B" in its name, it is built on the 1.5B-parameter base model.
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Bradley/Qwen-2.5-7B-Simple-RL", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/bradley/huggingface/runs/ju8pbdvp)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.16.0.dev0
38
+ - Transformers: 4.49.0.dev0
39
+ - Pytorch: 2.5.1
40
+ - Datasets: 3.3.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 1.0266,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 7305.492,
7
+ "train_steps_per_second": 56.496
8
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "max_position_embeddings": 4096,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000,
22
+ "sliding_window": 4096,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.49.0.dev0",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0.dev0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed5a684ef0ef5612c79d240e03a157adece0803f8ad596d73aa6d434a4a70990
3
+ size 3554214752
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|endoftext|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "padding_side": "left",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.0,
4
+ "train_runtime": 1.0266,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 7305.492,
7
+ "train_steps_per_second": 56.496
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,796 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9893390191897654,
5
+ "eval_steps": 100,
6
+ "global_step": 58,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 612.3248062133789,
13
+ "epoch": 0.017057569296375266,
14
+ "grad_norm": 0.8495666980743408,
15
+ "kl": 0.0,
16
+ "learning_rate": 5e-07,
17
+ "loss": 0.0,
18
+ "reward": 0.3694196529686451,
19
+ "reward_std": 0.3680446147918701,
20
+ "rewards/accuracy_reward": 0.368303582072258,
21
+ "rewards/format_reward": 0.0011160714784637094,
22
+ "step": 1
23
+ },
24
+ {
25
+ "completion_length": 599.4174499511719,
26
+ "epoch": 0.03411513859275053,
27
+ "grad_norm": 0.458848774433136,
28
+ "kl": 0.0,
29
+ "learning_rate": 1e-06,
30
+ "loss": 0.0,
31
+ "reward": 0.349330373108387,
32
+ "reward_std": 0.3545062802731991,
33
+ "rewards/accuracy_reward": 0.3470982313156128,
34
+ "rewards/format_reward": 0.0022321429569274187,
35
+ "step": 2
36
+ },
37
+ {
38
+ "completion_length": 623.0669860839844,
39
+ "epoch": 0.0511727078891258,
40
+ "grad_norm": 0.4115377366542816,
41
+ "kl": 5.65648078918457e-05,
42
+ "learning_rate": 1.5e-06,
43
+ "loss": 0.0,
44
+ "reward": 0.3549107313156128,
45
+ "reward_std": 0.38977522775530815,
46
+ "rewards/accuracy_reward": 0.3515625186264515,
47
+ "rewards/format_reward": 0.003348214435391128,
48
+ "step": 3
49
+ },
50
+ {
51
+ "completion_length": 611.1585159301758,
52
+ "epoch": 0.06823027718550106,
53
+ "grad_norm": 0.47733554244041443,
54
+ "kl": 6.213784217834473e-05,
55
+ "learning_rate": 2e-06,
56
+ "loss": 0.0,
57
+ "reward": 0.3303571566939354,
58
+ "reward_std": 0.3627745509147644,
59
+ "rewards/accuracy_reward": 0.3292410857975483,
60
+ "rewards/format_reward": 0.0011160714784637094,
61
+ "step": 4
62
+ },
63
+ {
64
+ "completion_length": 580.760066986084,
65
+ "epoch": 0.08528784648187633,
66
+ "grad_norm": 0.4139915406703949,
67
+ "kl": 7.668137550354004e-05,
68
+ "learning_rate": 2.5e-06,
69
+ "loss": 0.0,
70
+ "reward": 0.36941965855658054,
71
+ "reward_std": 0.37279706075787544,
72
+ "rewards/accuracy_reward": 0.36607144214212894,
73
+ "rewards/format_reward": 0.003348214435391128,
74
+ "step": 5
75
+ },
76
+ {
77
+ "completion_length": 589.4765853881836,
78
+ "epoch": 0.1023454157782516,
79
+ "grad_norm": 0.9889692664146423,
80
+ "kl": 0.00010079145431518555,
81
+ "learning_rate": 3e-06,
82
+ "loss": 0.0,
83
+ "reward": 0.333705373108387,
84
+ "reward_std": 0.35657742619514465,
85
+ "rewards/accuracy_reward": 0.333705373108387,
86
+ "rewards/format_reward": 0.0,
87
+ "step": 6
88
+ },
89
+ {
90
+ "completion_length": 586.4297180175781,
91
+ "epoch": 0.11940298507462686,
92
+ "grad_norm": 1.1514272689819336,
93
+ "kl": 0.00018024444580078125,
94
+ "learning_rate": 2.9972633313349763e-06,
95
+ "loss": 0.0,
96
+ "reward": 0.3381696604192257,
97
+ "reward_std": 0.3602691851556301,
98
+ "rewards/accuracy_reward": 0.3370535857975483,
99
+ "rewards/format_reward": 0.0011160714784637094,
100
+ "step": 7
101
+ },
102
+ {
103
+ "completion_length": 607.0881881713867,
104
+ "epoch": 0.13646055437100213,
105
+ "grad_norm": 0.3025471866130829,
106
+ "kl": 0.0006165504455566406,
107
+ "learning_rate": 2.989063311147081e-06,
108
+ "loss": 0.0,
109
+ "reward": 0.3727678768336773,
110
+ "reward_std": 0.365192923694849,
111
+ "rewards/accuracy_reward": 0.3716518059372902,
112
+ "rewards/format_reward": 0.0011160714784637094,
113
+ "step": 8
114
+ },
115
+ {
116
+ "completion_length": 582.1261444091797,
117
+ "epoch": 0.1535181236673774,
118
+ "grad_norm": 0.3065524995326996,
119
+ "kl": 0.0008487701416015625,
120
+ "learning_rate": 2.9754298604207156e-06,
121
+ "loss": 0.0,
122
+ "reward": 0.4140625186264515,
123
+ "reward_std": 0.4111504293978214,
124
+ "rewards/accuracy_reward": 0.4129464477300644,
125
+ "rewards/format_reward": 0.0011160714784637094,
126
+ "step": 9
127
+ },
128
+ {
129
+ "completion_length": 609.3214645385742,
130
+ "epoch": 0.17057569296375266,
131
+ "grad_norm": 0.22064943611621857,
132
+ "kl": 0.00093841552734375,
133
+ "learning_rate": 2.956412726139078e-06,
134
+ "loss": 0.0,
135
+ "reward": 0.408482164144516,
136
+ "reward_std": 0.36822360940277576,
137
+ "rewards/accuracy_reward": 0.4073660932481289,
138
+ "rewards/format_reward": 0.0011160714784637094,
139
+ "step": 10
140
+ },
141
+ {
142
+ "completion_length": 639.6484680175781,
143
+ "epoch": 0.18763326226012794,
144
+ "grad_norm": 0.9334643483161926,
145
+ "kl": 0.003772258758544922,
146
+ "learning_rate": 2.9320812997628183e-06,
147
+ "loss": 0.0002,
148
+ "reward": 0.3917410857975483,
149
+ "reward_std": 0.3854985646903515,
150
+ "rewards/accuracy_reward": 0.3895089440047741,
151
+ "rewards/format_reward": 0.0022321429569274187,
152
+ "step": 11
153
+ },
154
+ {
155
+ "completion_length": 571.6004638671875,
156
+ "epoch": 0.2046908315565032,
157
+ "grad_norm": 0.308927059173584,
158
+ "kl": 0.0036773681640625,
159
+ "learning_rate": 2.9025243640281224e-06,
160
+ "loss": 0.0001,
161
+ "reward": 0.4854910932481289,
162
+ "reward_std": 0.3801889941096306,
163
+ "rewards/accuracy_reward": 0.4843750186264515,
164
+ "rewards/format_reward": 0.0011160714784637094,
165
+ "step": 12
166
+ },
167
+ {
168
+ "completion_length": 596.0346221923828,
169
+ "epoch": 0.22174840085287847,
170
+ "grad_norm": 0.3541490435600281,
171
+ "kl": 0.0049343109130859375,
172
+ "learning_rate": 2.8678497689881355e-06,
173
+ "loss": 0.0002,
174
+ "reward": 0.4654018096625805,
175
+ "reward_std": 0.3858399875462055,
176
+ "rewards/accuracy_reward": 0.4620535895228386,
177
+ "rewards/format_reward": 0.003348214435391128,
178
+ "step": 13
179
+ },
180
+ {
181
+ "completion_length": 630.7388610839844,
182
+ "epoch": 0.23880597014925373,
183
+ "grad_norm": 0.17941774427890778,
184
+ "kl": 0.004192352294921875,
185
+ "learning_rate": 2.8281840384798147e-06,
186
+ "loss": 0.0002,
187
+ "reward": 0.4642857350409031,
188
+ "reward_std": 0.3576914146542549,
189
+ "rewards/accuracy_reward": 0.4631696604192257,
190
+ "rewards/format_reward": 0.0011160714784637094,
191
+ "step": 14
192
+ },
193
+ {
194
+ "completion_length": 619.506721496582,
195
+ "epoch": 0.255863539445629,
196
+ "grad_norm": 0.2217731475830078,
197
+ "kl": 0.0070285797119140625,
198
+ "learning_rate": 2.7836719084521715e-06,
199
+ "loss": 0.0003,
200
+ "reward": 0.4229910895228386,
201
+ "reward_std": 0.3531936705112457,
202
+ "rewards/accuracy_reward": 0.4229910895228386,
203
+ "rewards/format_reward": 0.0,
204
+ "step": 15
205
+ },
206
+ {
207
+ "completion_length": 591.1908798217773,
208
+ "epoch": 0.27292110874200426,
209
+ "grad_norm": 0.23496699333190918,
210
+ "kl": 0.008819580078125,
211
+ "learning_rate": 2.7344757988404844e-06,
212
+ "loss": 0.0004,
213
+ "reward": 0.5089285857975483,
214
+ "reward_std": 0.3891791105270386,
215
+ "rewards/accuracy_reward": 0.5078125149011612,
216
+ "rewards/format_reward": 0.0011160714784637094,
217
+ "step": 16
218
+ },
219
+ {
220
+ "completion_length": 621.5346145629883,
221
+ "epoch": 0.2899786780383795,
222
+ "grad_norm": 1.1953942775726318,
223
+ "kl": 0.0303192138671875,
224
+ "learning_rate": 2.680775220913575e-06,
225
+ "loss": 0.0012,
226
+ "reward": 0.4888393096625805,
227
+ "reward_std": 0.3780059143900871,
228
+ "rewards/accuracy_reward": 0.4854910932481289,
229
+ "rewards/format_reward": 0.003348214435391128,
230
+ "step": 17
231
+ },
232
+ {
233
+ "completion_length": 649.108283996582,
234
+ "epoch": 0.3070362473347548,
235
+ "grad_norm": 0.8823431730270386,
236
+ "kl": 0.03516387939453125,
237
+ "learning_rate": 2.6227661222566517e-06,
238
+ "loss": 0.0014,
239
+ "reward": 0.4944196678698063,
240
+ "reward_std": 0.3778855614364147,
241
+ "rewards/accuracy_reward": 0.4944196678698063,
242
+ "rewards/format_reward": 0.0,
243
+ "step": 18
244
+ },
245
+ {
246
+ "completion_length": 627.3359603881836,
247
+ "epoch": 0.32409381663113007,
248
+ "grad_norm": 0.2409147024154663,
249
+ "kl": 0.0187530517578125,
250
+ "learning_rate": 2.5606601717798212e-06,
251
+ "loss": 0.0008,
252
+ "reward": 0.5446428842842579,
253
+ "reward_std": 0.3593517243862152,
254
+ "rewards/accuracy_reward": 0.5435268133878708,
255
+ "rewards/format_reward": 0.0011160714784637094,
256
+ "step": 19
257
+ },
258
+ {
259
+ "completion_length": 587.4598388671875,
260
+ "epoch": 0.3411513859275053,
261
+ "grad_norm": 0.36490699648857117,
262
+ "kl": 0.020496368408203125,
263
+ "learning_rate": 2.4946839873611927e-06,
264
+ "loss": 0.0008,
265
+ "reward": 0.5669643171131611,
266
+ "reward_std": 0.33723995834589005,
267
+ "rewards/accuracy_reward": 0.5658482499420643,
268
+ "rewards/format_reward": 0.0011160714784637094,
269
+ "step": 20
270
+ },
271
+ {
272
+ "completion_length": 639.7868499755859,
273
+ "epoch": 0.3582089552238806,
274
+ "grad_norm": 2.728965997695923,
275
+ "kl": 0.05742645263671875,
276
+ "learning_rate": 2.425078308942815e-06,
277
+ "loss": 0.0023,
278
+ "reward": 0.5747768059372902,
279
+ "reward_std": 0.31150117330253124,
280
+ "rewards/accuracy_reward": 0.5747768059372902,
281
+ "rewards/format_reward": 0.0,
282
+ "step": 21
283
+ },
284
+ {
285
+ "completion_length": 619.8281478881836,
286
+ "epoch": 0.3752665245202559,
287
+ "grad_norm": 0.12718994915485382,
288
+ "kl": 0.01346588134765625,
289
+ "learning_rate": 2.3520971200967337e-06,
290
+ "loss": 0.0005,
291
+ "reward": 0.5691964514553547,
292
+ "reward_std": 0.32910910062491894,
293
+ "rewards/accuracy_reward": 0.5691964514553547,
294
+ "rewards/format_reward": 0.0,
295
+ "step": 22
296
+ },
297
+ {
298
+ "completion_length": 639.185302734375,
299
+ "epoch": 0.39232409381663114,
300
+ "grad_norm": 0.15699955821037292,
301
+ "kl": 0.009624481201171875,
302
+ "learning_rate": 2.276006721266485e-06,
303
+ "loss": 0.0004,
304
+ "reward": 0.5256696678698063,
305
+ "reward_std": 0.3530396558344364,
306
+ "rewards/accuracy_reward": 0.5256696678698063,
307
+ "rewards/format_reward": 0.0,
308
+ "step": 23
309
+ },
310
+ {
311
+ "completion_length": 630.2265853881836,
312
+ "epoch": 0.4093816631130064,
313
+ "grad_norm": 0.09854375571012497,
314
+ "kl": 0.009571075439453125,
315
+ "learning_rate": 2.1970847580656528e-06,
316
+ "loss": 0.0004,
317
+ "reward": 0.5747768133878708,
318
+ "reward_std": 0.315556762740016,
319
+ "rewards/accuracy_reward": 0.5747768133878708,
320
+ "rewards/format_reward": 0.0,
321
+ "step": 24
322
+ },
323
+ {
324
+ "completion_length": 636.4408798217773,
325
+ "epoch": 0.42643923240938164,
326
+ "grad_norm": 0.12171110510826111,
327
+ "kl": 0.0113525390625,
328
+ "learning_rate": 2.1156192081791355e-06,
329
+ "loss": 0.0005,
330
+ "reward": 0.533482164144516,
331
+ "reward_std": 0.3439404182136059,
332
+ "rewards/accuracy_reward": 0.533482164144516,
333
+ "rewards/format_reward": 0.0,
334
+ "step": 25
335
+ },
336
+ {
337
+ "completion_length": 613.5491256713867,
338
+ "epoch": 0.44349680170575695,
339
+ "grad_norm": 0.10672565549612045,
340
+ "kl": 0.007846832275390625,
341
+ "learning_rate": 2.0319073305638034e-06,
342
+ "loss": 0.0003,
343
+ "reward": 0.5167411044239998,
344
+ "reward_std": 0.35071658343076706,
345
+ "rewards/accuracy_reward": 0.5156250298023224,
346
+ "rewards/format_reward": 0.0011160714784637094,
347
+ "step": 26
348
+ },
349
+ {
350
+ "completion_length": 624.9497985839844,
351
+ "epoch": 0.4605543710021322,
352
+ "grad_norm": 0.13481971621513367,
353
+ "kl": 0.0122833251953125,
354
+ "learning_rate": 1.9462545807828044e-06,
355
+ "loss": 0.0005,
356
+ "reward": 0.5837053842842579,
357
+ "reward_std": 0.3019485678523779,
358
+ "rewards/accuracy_reward": 0.5837053842842579,
359
+ "rewards/format_reward": 0.0,
360
+ "step": 27
361
+ },
362
+ {
363
+ "completion_length": 616.9285888671875,
364
+ "epoch": 0.47761194029850745,
365
+ "grad_norm": 0.1329677402973175,
366
+ "kl": 0.009983062744140625,
367
+ "learning_rate": 1.8589734964313368e-06,
368
+ "loss": 0.0004,
369
+ "reward": 0.5669643059372902,
370
+ "reward_std": 0.34032437205314636,
371
+ "rewards/accuracy_reward": 0.5669643059372902,
372
+ "rewards/format_reward": 0.0,
373
+ "step": 28
374
+ },
375
+ {
376
+ "completion_length": 634.3593902587891,
377
+ "epoch": 0.4946695095948827,
378
+ "grad_norm": 0.08903162181377411,
379
+ "kl": 0.00791168212890625,
380
+ "learning_rate": 1.7703825567208588e-06,
381
+ "loss": 0.0003,
382
+ "reward": 0.5680803768336773,
383
+ "reward_std": 0.2970337048172951,
384
+ "rewards/accuracy_reward": 0.5669643096625805,
385
+ "rewards/format_reward": 0.0011160714784637094,
386
+ "step": 29
387
+ },
388
+ {
389
+ "completion_length": 650.2902069091797,
390
+ "epoch": 0.511727078891258,
391
+ "grad_norm": 0.09270613640546799,
392
+ "kl": 0.010021209716796875,
393
+ "learning_rate": 1.6808050203829845e-06,
394
+ "loss": 0.0004,
395
+ "reward": 0.5368303805589676,
396
+ "reward_std": 0.316427243873477,
397
+ "rewards/accuracy_reward": 0.5368303805589676,
398
+ "rewards/format_reward": 0.0,
399
+ "step": 30
400
+ },
401
+ {
402
+ "completion_length": 635.8582916259766,
403
+ "epoch": 0.5287846481876333,
404
+ "grad_norm": 0.0931444764137268,
405
+ "kl": 0.007686614990234375,
406
+ "learning_rate": 1.5905677461334292e-06,
407
+ "loss": 0.0003,
408
+ "reward": 0.5435268245637417,
409
+ "reward_std": 0.31041241250932217,
410
+ "rewards/accuracy_reward": 0.5435268245637417,
411
+ "rewards/format_reward": 0.0,
412
+ "step": 31
413
+ },
414
+ {
415
+ "completion_length": 638.8214569091797,
416
+ "epoch": 0.5458422174840085,
417
+ "grad_norm": 0.1111321821808815,
418
+ "kl": 0.007816314697265625,
419
+ "learning_rate": 1.5e-06,
420
+ "loss": 0.0003,
421
+ "reward": 0.5691964626312256,
422
+ "reward_std": 0.3233349844813347,
423
+ "rewards/accuracy_reward": 0.5691964626312256,
424
+ "rewards/format_reward": 0.0,
425
+ "step": 32
426
+ },
427
+ {
428
+ "completion_length": 627.1317291259766,
429
+ "epoch": 0.5628997867803838,
430
+ "grad_norm": 0.1286892145872116,
431
+ "kl": 0.010196685791015625,
432
+ "learning_rate": 1.4094322538665708e-06,
433
+ "loss": 0.0004,
434
+ "reward": 0.5513393059372902,
435
+ "reward_std": 0.30831882171332836,
436
+ "rewards/accuracy_reward": 0.5513393059372902,
437
+ "rewards/format_reward": 0.0,
438
+ "step": 33
439
+ },
440
+ {
441
+ "completion_length": 623.7600708007812,
442
+ "epoch": 0.579957356076759,
443
+ "grad_norm": 0.10542038083076477,
444
+ "kl": 0.0088958740234375,
445
+ "learning_rate": 1.3191949796170155e-06,
446
+ "loss": 0.0004,
447
+ "reward": 0.510044664144516,
448
+ "reward_std": 0.3244265168905258,
449
+ "rewards/accuracy_reward": 0.5089285895228386,
450
+ "rewards/format_reward": 0.0011160714784637094,
451
+ "step": 34
452
+ },
453
+ {
454
+ "completion_length": 662.3705673217773,
455
+ "epoch": 0.5970149253731343,
456
+ "grad_norm": 0.16724978387355804,
457
+ "kl": 0.00736236572265625,
458
+ "learning_rate": 1.2296174432791415e-06,
459
+ "loss": 0.0003,
460
+ "reward": 0.558035746216774,
461
+ "reward_std": 0.3229935597628355,
462
+ "rewards/accuracy_reward": 0.558035746216774,
463
+ "rewards/format_reward": 0.0,
464
+ "step": 35
465
+ },
466
+ {
467
+ "completion_length": 609.7712326049805,
468
+ "epoch": 0.6140724946695096,
469
+ "grad_norm": 0.09450776875019073,
470
+ "kl": 0.009227752685546875,
471
+ "learning_rate": 1.141026503568664e-06,
472
+ "loss": 0.0004,
473
+ "reward": 0.5345982387661934,
474
+ "reward_std": 0.3159206211566925,
475
+ "rewards/accuracy_reward": 0.533482164144516,
476
+ "rewards/format_reward": 0.0011160714784637094,
477
+ "step": 36
478
+ },
479
+ {
480
+ "completion_length": 624.8783798217773,
481
+ "epoch": 0.6311300639658849,
482
+ "grad_norm": 0.1119498461484909,
483
+ "kl": 0.0086822509765625,
484
+ "learning_rate": 1.0537454192171958e-06,
485
+ "loss": 0.0003,
486
+ "reward": 0.6026785932481289,
487
+ "reward_std": 0.2997346203774214,
488
+ "rewards/accuracy_reward": 0.6026785932481289,
489
+ "rewards/format_reward": 0.0,
490
+ "step": 37
491
+ },
492
+ {
493
+ "completion_length": 613.0368576049805,
494
+ "epoch": 0.6481876332622601,
495
+ "grad_norm": 0.09415058046579361,
496
+ "kl": 0.00948333740234375,
497
+ "learning_rate": 9.680926694361964e-07,
498
+ "loss": 0.0004,
499
+ "reward": 0.6015625298023224,
500
+ "reward_std": 0.2972884103655815,
501
+ "rewards/accuracy_reward": 0.6015625298023224,
502
+ "rewards/format_reward": 0.0,
503
+ "step": 38
504
+ },
505
+ {
506
+ "completion_length": 605.041316986084,
507
+ "epoch": 0.6652452025586354,
508
+ "grad_norm": 0.11349077522754669,
509
+ "kl": 0.01213836669921875,
510
+ "learning_rate": 8.843807918208651e-07,
511
+ "loss": 0.0005,
512
+ "reward": 0.5982143208384514,
513
+ "reward_std": 0.30514490231871605,
514
+ "rewards/accuracy_reward": 0.597098246216774,
515
+ "rewards/format_reward": 0.0011160714784637094,
516
+ "step": 39
517
+ },
518
+ {
519
+ "completion_length": 664.9520416259766,
520
+ "epoch": 0.6823027718550106,
521
+ "grad_norm": 0.07810351997613907,
522
+ "kl": 0.008647918701171875,
523
+ "learning_rate": 8.029152419343472e-07,
524
+ "loss": 0.0003,
525
+ "reward": 0.5022321715950966,
526
+ "reward_std": 0.2990181464701891,
527
+ "rewards/accuracy_reward": 0.5022321715950966,
528
+ "rewards/format_reward": 0.0,
529
+ "step": 40
530
+ },
531
+ {
532
+ "completion_length": 637.6194458007812,
533
+ "epoch": 0.6993603411513859,
534
+ "grad_norm": 0.11813782900571823,
535
+ "kl": 0.0095977783203125,
536
+ "learning_rate": 7.239932787335147e-07,
537
+ "loss": 0.0004,
538
+ "reward": 0.5513393096625805,
539
+ "reward_std": 0.30872474052011967,
540
+ "rewards/accuracy_reward": 0.5513393096625805,
541
+ "rewards/format_reward": 0.0,
542
+ "step": 41
543
+ },
544
+ {
545
+ "completion_length": 629.3381881713867,
546
+ "epoch": 0.7164179104477612,
547
+ "grad_norm": 0.08325987309217453,
548
+ "kl": 0.010807037353515625,
549
+ "learning_rate": 6.479028799032664e-07,
550
+ "loss": 0.0004,
551
+ "reward": 0.597098246216774,
552
+ "reward_std": 0.31203021854162216,
553
+ "rewards/accuracy_reward": 0.597098246216774,
554
+ "rewards/format_reward": 0.0,
555
+ "step": 42
556
+ },
557
+ {
558
+ "completion_length": 683.7757034301758,
559
+ "epoch": 0.7334754797441365,
560
+ "grad_norm": 0.07234999537467957,
561
+ "kl": 0.006565093994140625,
562
+ "learning_rate": 5.749216910571854e-07,
563
+ "loss": 0.0003,
564
+ "reward": 0.5457589589059353,
565
+ "reward_std": 0.29359665140509605,
566
+ "rewards/accuracy_reward": 0.5457589589059353,
567
+ "rewards/format_reward": 0.0,
568
+ "step": 43
569
+ },
570
+ {
571
+ "completion_length": 626.8995895385742,
572
+ "epoch": 0.7505330490405118,
573
+ "grad_norm": 0.14146152138710022,
574
+ "kl": 0.01007080078125,
575
+ "learning_rate": 5.053160126388076e-07,
576
+ "loss": 0.0004,
577
+ "reward": 0.5781250335276127,
578
+ "reward_std": 0.3283701930195093,
579
+ "rewards/accuracy_reward": 0.5781250335276127,
580
+ "rewards/format_reward": 0.0,
581
+ "step": 44
582
+ },
583
+ {
584
+ "completion_length": 607.5156555175781,
585
+ "epoch": 0.767590618336887,
586
+ "grad_norm": 0.10176991671323776,
587
+ "kl": 0.01148223876953125,
588
+ "learning_rate": 4.3933982822017883e-07,
589
+ "loss": 0.0005,
590
+ "reward": 0.584821455180645,
591
+ "reward_std": 0.2961436016485095,
592
+ "rewards/accuracy_reward": 0.584821455180645,
593
+ "rewards/format_reward": 0.0,
594
+ "step": 45
595
+ },
596
+ {
597
+ "completion_length": 642.0223541259766,
598
+ "epoch": 0.7846481876332623,
599
+ "grad_norm": 0.09057696908712387,
600
+ "kl": 0.008152008056640625,
601
+ "learning_rate": 3.772338777433482e-07,
602
+ "loss": 0.0003,
603
+ "reward": 0.544642873108387,
604
+ "reward_std": 0.30674307234585285,
605
+ "rewards/accuracy_reward": 0.544642873108387,
606
+ "rewards/format_reward": 0.0,
607
+ "step": 46
608
+ },
609
+ {
610
+ "completion_length": 631.8716888427734,
611
+ "epoch": 0.8017057569296375,
612
+ "grad_norm": 0.09820377081632614,
613
+ "kl": 0.010608673095703125,
614
+ "learning_rate": 3.192247790864249e-07,
615
+ "loss": 0.0004,
616
+ "reward": 0.561383955180645,
617
+ "reward_std": 0.32947295345366,
618
+ "rewards/accuracy_reward": 0.5602678805589676,
619
+ "rewards/format_reward": 0.0011160714784637094,
620
+ "step": 47
621
+ },
622
+ {
623
+ "completion_length": 628.2678909301758,
624
+ "epoch": 0.8187633262260128,
625
+ "grad_norm": 0.10683470219373703,
626
+ "kl": 0.0122833251953125,
627
+ "learning_rate": 2.6552420115951547e-07,
628
+ "loss": 0.0005,
629
+ "reward": 0.5602678805589676,
630
+ "reward_std": 0.3167324475944042,
631
+ "rewards/accuracy_reward": 0.5602678805589676,
632
+ "rewards/format_reward": 0.0,
633
+ "step": 48
634
+ },
635
+ {
636
+ "completion_length": 649.482177734375,
637
+ "epoch": 0.835820895522388,
638
+ "grad_norm": 0.07014506310224533,
639
+ "kl": 0.00775146484375,
640
+ "learning_rate": 2.163280915478289e-07,
641
+ "loss": 0.0003,
642
+ "reward": 0.5546875298023224,
643
+ "reward_std": 0.2984890937805176,
644
+ "rewards/accuracy_reward": 0.5546875298023224,
645
+ "rewards/format_reward": 0.0,
646
+ "step": 49
647
+ },
648
+ {
649
+ "completion_length": 646.8906478881836,
650
+ "epoch": 0.8528784648187633,
651
+ "grad_norm": 0.09115675836801529,
652
+ "kl": 0.008701324462890625,
653
+ "learning_rate": 1.718159615201853e-07,
654
+ "loss": 0.0003,
655
+ "reward": 0.5781250186264515,
656
+ "reward_std": 0.3092201482504606,
657
+ "rewards/accuracy_reward": 0.5781250186264515,
658
+ "rewards/format_reward": 0.0,
659
+ "step": 50
660
+ },
661
+ {
662
+ "completion_length": 605.0301666259766,
663
+ "epoch": 0.8699360341151386,
664
+ "grad_norm": 0.10600890219211578,
665
+ "kl": 0.0100250244140625,
666
+ "learning_rate": 1.321502310118649e-07,
667
+ "loss": 0.0004,
668
+ "reward": 0.6149553805589676,
669
+ "reward_std": 0.3139810301363468,
670
+ "rewards/accuracy_reward": 0.6149553805589676,
671
+ "rewards/format_reward": 0.0,
672
+ "step": 51
673
+ },
674
+ {
675
+ "completion_length": 659.0055999755859,
676
+ "epoch": 0.8869936034115139,
677
+ "grad_norm": 0.07781906425952911,
678
+ "kl": 0.00876617431640625,
679
+ "learning_rate": 9.747563597187792e-08,
680
+ "loss": 0.0004,
681
+ "reward": 0.577008955180645,
682
+ "reward_std": 0.3069279305636883,
683
+ "rewards/accuracy_reward": 0.577008955180645,
684
+ "rewards/format_reward": 0.0,
685
+ "step": 52
686
+ },
687
+ {
688
+ "completion_length": 645.2064971923828,
689
+ "epoch": 0.9040511727078892,
690
+ "grad_norm": 0.10724423825740814,
691
+ "kl": 0.009578704833984375,
692
+ "learning_rate": 6.791870023718161e-08,
693
+ "loss": 0.0004,
694
+ "reward": 0.5636160895228386,
695
+ "reward_std": 0.2895326167345047,
696
+ "rewards/accuracy_reward": 0.5636160895228386,
697
+ "rewards/format_reward": 0.0,
698
+ "step": 53
699
+ },
700
+ {
701
+ "completion_length": 611.3013534545898,
702
+ "epoch": 0.9211087420042644,
703
+ "grad_norm": 0.09349353611469269,
704
+ "kl": 0.011241912841796875,
705
+ "learning_rate": 4.358727386092198e-08,
706
+ "loss": 0.0004,
707
+ "reward": 0.6250000335276127,
708
+ "reward_std": 0.31463043205440044,
709
+ "rewards/accuracy_reward": 0.6250000335276127,
710
+ "rewards/format_reward": 0.0,
711
+ "step": 54
712
+ },
713
+ {
714
+ "completion_length": 656.0178833007812,
715
+ "epoch": 0.9381663113006397,
716
+ "grad_norm": 0.09912308305501938,
717
+ "kl": 0.009449005126953125,
718
+ "learning_rate": 2.4570139579284723e-08,
719
+ "loss": 0.0004,
720
+ "reward": 0.5904018133878708,
721
+ "reward_std": 0.34508523903787136,
722
+ "rewards/accuracy_reward": 0.5904018133878708,
723
+ "rewards/format_reward": 0.0,
724
+ "step": 55
725
+ },
726
+ {
727
+ "completion_length": 626.8180999755859,
728
+ "epoch": 0.9552238805970149,
729
+ "grad_norm": 0.3471205234527588,
730
+ "kl": 0.017642974853515625,
731
+ "learning_rate": 1.093668885291904e-08,
732
+ "loss": 0.0007,
733
+ "reward": 0.5825893059372902,
734
+ "reward_std": 0.31324212066829205,
735
+ "rewards/accuracy_reward": 0.5814732387661934,
736
+ "rewards/format_reward": 0.0011160714784637094,
737
+ "step": 56
738
+ },
739
+ {
740
+ "completion_length": 627.2902069091797,
741
+ "epoch": 0.9722814498933902,
742
+ "grad_norm": 0.08442260324954987,
743
+ "kl": 0.008678436279296875,
744
+ "learning_rate": 2.736668665023756e-09,
745
+ "loss": 0.0003,
746
+ "reward": 0.597098246216774,
747
+ "reward_std": 0.2932440135627985,
748
+ "rewards/accuracy_reward": 0.597098246216774,
749
+ "rewards/format_reward": 0.0,
750
+ "step": 57
751
+ },
752
+ {
753
+ "completion_length": 619.4442138671875,
754
+ "epoch": 0.9893390191897654,
755
+ "grad_norm": 0.078741155564785,
756
+ "kl": 0.008129119873046875,
757
+ "learning_rate": 0.0,
758
+ "loss": 0.0003,
759
+ "reward": 0.5970982424914837,
760
+ "reward_std": 0.28597242943942547,
761
+ "rewards/accuracy_reward": 0.5970982424914837,
762
+ "rewards/format_reward": 0.0,
763
+ "step": 58
764
+ },
765
+ {
766
+ "epoch": 0.9893390191897654,
767
+ "step": 58,
768
+ "total_flos": 0.0,
769
+ "train_loss": 0.0,
770
+ "train_runtime": 1.0266,
771
+ "train_samples_per_second": 7305.492,
772
+ "train_steps_per_second": 56.496
773
+ }
774
+ ],
775
+ "logging_steps": 1,
776
+ "max_steps": 58,
777
+ "num_input_tokens_seen": 0,
778
+ "num_train_epochs": 1,
779
+ "save_steps": 20,
780
+ "stateful_callbacks": {
781
+ "TrainerControl": {
782
+ "args": {
783
+ "should_epoch_stop": false,
784
+ "should_evaluate": false,
785
+ "should_log": false,
786
+ "should_save": true,
787
+ "should_training_stop": true
788
+ },
789
+ "attributes": {}
790
+ }
791
+ },
792
+ "total_flos": 0.0,
793
+ "train_batch_size": 16,
794
+ "trial_name": null,
795
+ "trial_params": null
796
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55781e19058d9186bf905b094cd5cb5eec883bbdf40d80849f0ade9ff24c939
3
+ size 7608
vocab.json ADDED
The diff for this file is too large to render. See raw diff