chitanda commited on
Commit
e38171c
·
verified ·
1 Parent(s): dccf6b5

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +25 -0
  2. checkpoint-100/config.json +28 -0
  3. checkpoint-100/generation_config.json +7 -0
  4. checkpoint-100/gsm8k.test.v1.1.0shot.json +0 -0
  5. checkpoint-100/gsm8k.test.v1.1.0shot.jsonl +0 -0
  6. checkpoint-100/gsm8k.test.v1.1.0shot.metrics.json +5 -0
  7. checkpoint-100/math.test.v1.1.0shot.json +0 -0
  8. checkpoint-100/math.test.v1.1.0shot.jsonl +3 -0
  9. checkpoint-100/math.test.v1.1.0shot.metrics.json +5 -0
  10. checkpoint-100/pytorch_model.bin +3 -0
  11. checkpoint-100/special_tokens_map.json +34 -0
  12. checkpoint-100/tokenizer.json +3 -0
  13. checkpoint-100/tokenizer.model +3 -0
  14. checkpoint-100/tokenizer_config.json +70 -0
  15. checkpoint-100/training_config.yaml +211 -0
  16. checkpoint-200/config.json +28 -0
  17. checkpoint-200/generation_config.json +7 -0
  18. checkpoint-200/gsm8k.test.v1.1.0shot.json +0 -0
  19. checkpoint-200/gsm8k.test.v1.1.0shot.jsonl +0 -0
  20. checkpoint-200/gsm8k.test.v1.1.0shot.metrics.json +5 -0
  21. checkpoint-200/math.test.v1.1.0shot.json +0 -0
  22. checkpoint-200/math.test.v1.1.0shot.jsonl +3 -0
  23. checkpoint-200/math.test.v1.1.0shot.metrics.json +5 -0
  24. checkpoint-200/pytorch_model.bin +3 -0
  25. checkpoint-200/special_tokens_map.json +34 -0
  26. checkpoint-200/tokenizer.json +3 -0
  27. checkpoint-200/tokenizer.model +3 -0
  28. checkpoint-200/tokenizer_config.json +70 -0
  29. checkpoint-200/training_config.yaml +211 -0
  30. checkpoint-300/config.json +28 -0
  31. checkpoint-300/generation_config.json +7 -0
  32. checkpoint-300/gsm8k.test.v1.1.0shot.json +0 -0
  33. checkpoint-300/gsm8k.test.v1.1.0shot.jsonl +0 -0
  34. checkpoint-300/gsm8k.test.v1.1.0shot.metrics.json +5 -0
  35. checkpoint-300/math.test.v1.1.0shot.json +3 -0
  36. checkpoint-300/math.test.v1.1.0shot.jsonl +3 -0
  37. checkpoint-300/math.test.v1.1.0shot.metrics.json +5 -0
  38. checkpoint-300/pytorch_model.bin +3 -0
  39. checkpoint-300/special_tokens_map.json +34 -0
  40. checkpoint-300/tokenizer.json +3 -0
  41. checkpoint-300/tokenizer.model +3 -0
  42. checkpoint-300/tokenizer_config.json +70 -0
  43. checkpoint-300/training_config.yaml +211 -0
  44. checkpoint-400/config.json +28 -0
  45. checkpoint-400/generation_config.json +7 -0
  46. checkpoint-400/gsm8k.test.v1.1.0shot.json +0 -0
  47. checkpoint-400/gsm8k.test.v1.1.0shot.jsonl +0 -0
  48. checkpoint-400/gsm8k.test.v1.1.0shot.metrics.json +5 -0
  49. checkpoint-400/math.test.v1.1.0shot.json +3 -0
  50. checkpoint-400/math.test.v1.1.0shot.jsonl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-200/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-300/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-300/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-400/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-400/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-500/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-500/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-600/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-600/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ checkpoint-700/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
53
+ checkpoint-700/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
54
+ checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ checkpoint-800/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
56
+ checkpoint-800/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
57
+ checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ checkpoint-900/math.test.v1.1.0shot.json filter=lfs diff=lfs merge=lfs -text
59
+ checkpoint-900/math.test.v1.1.0shot.jsonl filter=lfs diff=lfs merge=lfs -text
60
+ checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-100/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 16384,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "gemma",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 18,
19
+ "num_key_value_heads": 1,
20
+ "pad_token_id": 0,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": true,
27
+ "vocab_size": 256000
28
+ }
checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
checkpoint-100/gsm8k.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/gsm8k.test.v1.1.0shot.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/gsm8k.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.5261561789234268,
3
+ "correct": 694,
4
+ "total": 1319
5
+ }
checkpoint-100/math.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/math.test.v1.1.0shot.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71cc5da8225d8d44148c13f94b769279d43e7ed1b0c8088db8eeff57bfb2ddb
3
+ size 18863250
checkpoint-100/math.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.15,
3
+ "correct": 750,
4
+ "total": 5000
5
+ }
checkpoint-100/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6bb6fcce4e50c2a97bb2ba33ece1810e089aacf448e4ac57cd2357511787f96
3
+ size 5012367854
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<start_of_turn>",
4
+ "<end_of_turn>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
3
+ size 17477929
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
3
+ size 4241003
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "106": {
38
+ "content": "<start_of_turn>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "107": {
46
+ "content": "<end_of_turn>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<start_of_turn>",
56
+ "<end_of_turn>"
57
+ ],
58
+ "bos_token": "<bos>",
59
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<eos>",
62
+ "legacy": null,
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "<pad>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "GemmaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
checkpoint-100/training_config.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ds_cfg:
2
+ train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
3
+ gradient_accumulation_steps: ${gradient_accumulation_steps}
4
+ scheduler:
5
+ type: WarmupDecayLR
6
+ params:
7
+ total_num_steps: 989
8
+ warmup_max_lr: ${learning_rate}
9
+ warmup_num_steps: 59
10
+ warmup_type: linear
11
+ optimizer:
12
+ type: AdamW
13
+ params:
14
+ lr: ${learning_rate}
15
+ betas:
16
+ - 0.9
17
+ - 0.95
18
+ eps: 1.0e-06
19
+ weight_decay: ${weight_decay}
20
+ bf16:
21
+ enabled: true
22
+ zero_optimization:
23
+ stage: 1
24
+ stage3_param_persistence_threshold: 100000.0
25
+ stage3_max_live_parameters: 100000000.0
26
+ stage3_prefetch_bucket_size: 100000000.0
27
+ memory_efficient_linear: false
28
+ steps_per_print: 25
29
+ gradient_clipping: 1.0
30
+ prescale_gradients: false
31
+ sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
32
+ train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.prm_cp3200_best_of_10.v1.0.(2,3).pos0.5.neg10.product.full_only_pos2pos.json
33
+ dev_file: null
34
+ test_file: null
35
+ torch_dtype:
36
+ _target_: general_util.training_utils.return_torch_dtype
37
+ dtype: bfloat16
38
+ tokenizer_init:
39
+ _target_: general_util.tokenization_utils.init_tokenizer
40
+ tokenizer_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
41
+ padding_side: left
42
+ device_map:
43
+ _target_: models.utils.return_single_device_map
44
+ model:
45
+ _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
46
+ beta: 0.5
47
+ gradient_checkpointing: false
48
+ attn_implementation: flash_attention_2
49
+ torch_dtype: ${torch_dtype}
50
+ device_map: ${device_map}
51
+ ref_model:
52
+ _target_: models.gemma.GemmaForCausalLM.from_pretrained
53
+ pretrained_model_name_or_path: ${model_name_or_path}
54
+ torch_dtype: ${torch_dtype}
55
+ attn_implementation: flash_attention_2
56
+ device_map: ${device_map}
57
+ read_tensor_dpo:
58
+ _target_: data.logic_combine.MultiMappingDataset
59
+ file_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
60
+ aligner:
61
+ _target_: data.input_aligner.concat_aligner
62
+ aligners:
63
+ - _target_: data.input_aligner.dpo_pair_aligner_cleaned
64
+ response_field: response
65
+ id_field: id
66
+ do_sample: false
67
+ template:
68
+ chosen: '{instruction}
69
+
70
+
71
+ ### Question: {query}
72
+
73
+
74
+ SubQuestion 1: {pos}<eos>'
75
+ reject: '{instruction}
76
+
77
+
78
+ ### Question: {query}
79
+
80
+
81
+ SubQuestion 1: {neg}<eos>'
82
+ prompt: '{instruction}
83
+
84
+
85
+ ### Question: {query}
86
+
87
+
88
+ SubQuestion 1:'
89
+ instruction: 'Given a question, please decompose it into sub-questions. For each
90
+ sub-question, please answer it in a complete sentence, ending with "The answer
91
+ is". When the original question is answerable, please start the sub-question with
92
+ "Now we can answer the question: ".'
93
+ kv_mapping:
94
+ chosen: chosen
95
+ reject: reject
96
+ id: index
97
+ prompt: prompt
98
+ read_tensor_step_dpo:
99
+ _target_: data.logic_combine.MultiMappingDataset
100
+ aligner:
101
+ _target_: data.logic_combine.field_extract_aligner
102
+ input_index_field: id
103
+ extract_index_field: id
104
+ extract_fields:
105
+ - query
106
+ extra_file: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
107
+ template:
108
+ chosen: '{instruction}
109
+
110
+
111
+ ### Question: {query}
112
+
113
+
114
+ SubQuestion 1: {chosen}<eos>'
115
+ reject: '{instruction}
116
+
117
+
118
+ ### Question: {query}
119
+
120
+
121
+ SubQuestion 1: {reject}<eos>'
122
+ prompt: '{instruction}
123
+
124
+
125
+ ### Question: {query}
126
+
127
+
128
+ SubQuestion 1:'
129
+ instruction: 'Given a question, please decompose it into sub-questions. For each
130
+ sub-question, please answer it in a complete sentence, ending with "The answer
131
+ is". When the original question is answerable, please start the sub-question with
132
+ "Now we can answer the question: ".'
133
+ kv_mapping:
134
+ chosen: chosen
135
+ reject: reject
136
+ id: index
137
+ prompt: prompt
138
+ read_tensor:
139
+ _target_: data.general.ReplayDataset
140
+ _recursive_: false
141
+ new_dataset_cfg: ${read_tensor_step_dpo}
142
+ old_dataset_cfg: ${read_tensor_dpo}
143
+ replay_ratio: 0.2
144
+ dist_load_data_barrier: false
145
+ extended_vocab: null
146
+ collator:
147
+ _target_: data.dpo.DPOCollator
148
+ tokenizer: ${tokenizer_init}
149
+ max_seq_length: 1024
150
+ num_workers: 8
151
+ prefetch_factor: 2
152
+ model_name_or_path: ${sft_model_dir}
153
+ pretrain: null
154
+ dp_size: 4
155
+ tp_size: 1
156
+ pp_size: 1
157
+ exp_name: gemma.2b.it.meta_math_rap.step.dpo.H100.w4.v1.0.s${seed}
158
+ exp_notes: null
159
+ output_dir: experiments/${exp_name}
160
+ do_train: true
161
+ evaluate_during_training: false
162
+ do_eval: false
163
+ eval_sub_path: checkpoint-100
164
+ per_gpu_train_batch_size: 2
165
+ per_gpu_eval_batch_size: 4
166
+ learning_rate: 1.0e-06
167
+ gradient_accumulation_steps: 8
168
+ weight_decay: 0.1
169
+ adam_epsilon: 1.0e-06
170
+ adam_betas: (0.9, 0.98)
171
+ total_dataset_len: 63348
172
+ max_grad_norm: 1.0
173
+ num_train_epochs: 1
174
+ max_steps: 0
175
+ warmup_proportion: 0.06
176
+ warmup_steps: 0
177
+ optimizer: null
178
+ use_nvlamb: null
179
+ bit_training: null
180
+ logging_steps: 5
181
+ save_ds_state: false
182
+ save_steps: 100
183
+ save_best: false
184
+ eval_steps: 400
185
+ ddp_eval: true
186
+ no_cuda: false
187
+ seed: 43
188
+ local_rank: 0
189
+ fp16: true
190
+ fp16_opt_level: O1
191
+ fp16_bfloat16: true
192
+ prediction_cfg:
193
+ metric: loss
194
+ measure: -1
195
+ best_checkpoint: null
196
+ best_result: null
197
+ eval_forward_fn:
198
+ _target_: general_util.evaluator.DefaultForwardFn
199
+ post_process:
200
+ _target_: post_processors.dpo.DPOEvalPostProcessor
201
+ summary_helper:
202
+ _target_: general_util.tensorboard_helper.WandbWriter
203
+ batch_index_or_keys: null
204
+ outputs_index_or_keys:
205
+ train/chosen_reward: chosen_reward
206
+ train/rejected_reward: rejected_reward
207
+ n_gpu: 1
208
+ device: cuda:0
209
+ train_batch_size: 2
210
+ eval_batch_size: null
211
+ world_size: 4
checkpoint-200/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 16384,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "gemma",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 18,
19
+ "num_key_value_heads": 1,
20
+ "pad_token_id": 0,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": true,
27
+ "vocab_size": 256000
28
+ }
checkpoint-200/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
checkpoint-200/gsm8k.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/gsm8k.test.v1.1.0shot.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/gsm8k.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.5170583775587566,
3
+ "correct": 682,
4
+ "total": 1319
5
+ }
checkpoint-200/math.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-200/math.test.v1.1.0shot.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a1d53ec71532a1b15e81828d9e4a466438ec23564bc6fb7e5cdffb82641eff0
3
+ size 19888106
checkpoint-200/math.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.1536,
3
+ "correct": 768,
4
+ "total": 5000
5
+ }
checkpoint-200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eee440231421df61e23a106ea15d23ab5546186f73f635558ef0d7f179c857e
3
+ size 5012367854
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<start_of_turn>",
4
+ "<end_of_turn>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
3
+ size 17477929
checkpoint-200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
3
+ size 4241003
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "106": {
38
+ "content": "<start_of_turn>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "107": {
46
+ "content": "<end_of_turn>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<start_of_turn>",
56
+ "<end_of_turn>"
57
+ ],
58
+ "bos_token": "<bos>",
59
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<eos>",
62
+ "legacy": null,
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "<pad>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "GemmaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
checkpoint-200/training_config.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ds_cfg:
2
+ train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
3
+ gradient_accumulation_steps: ${gradient_accumulation_steps}
4
+ scheduler:
5
+ type: WarmupDecayLR
6
+ params:
7
+ total_num_steps: 989
8
+ warmup_max_lr: ${learning_rate}
9
+ warmup_num_steps: 59
10
+ warmup_type: linear
11
+ optimizer:
12
+ type: AdamW
13
+ params:
14
+ lr: ${learning_rate}
15
+ betas:
16
+ - 0.9
17
+ - 0.95
18
+ eps: 1.0e-06
19
+ weight_decay: ${weight_decay}
20
+ bf16:
21
+ enabled: true
22
+ zero_optimization:
23
+ stage: 1
24
+ stage3_param_persistence_threshold: 100000.0
25
+ stage3_max_live_parameters: 100000000.0
26
+ stage3_prefetch_bucket_size: 100000000.0
27
+ memory_efficient_linear: false
28
+ steps_per_print: 25
29
+ gradient_clipping: 1.0
30
+ prescale_gradients: false
31
+ sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
32
+ train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.prm_cp3200_best_of_10.v1.0.(2,3).pos0.5.neg10.product.full_only_pos2pos.json
33
+ dev_file: null
34
+ test_file: null
35
+ torch_dtype:
36
+ _target_: general_util.training_utils.return_torch_dtype
37
+ dtype: bfloat16
38
+ tokenizer_init:
39
+ _target_: general_util.tokenization_utils.init_tokenizer
40
+ tokenizer_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
41
+ padding_side: left
42
+ device_map:
43
+ _target_: models.utils.return_single_device_map
44
+ model:
45
+ _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
46
+ beta: 0.5
47
+ gradient_checkpointing: false
48
+ attn_implementation: flash_attention_2
49
+ torch_dtype: ${torch_dtype}
50
+ device_map: ${device_map}
51
+ ref_model:
52
+ _target_: models.gemma.GemmaForCausalLM.from_pretrained
53
+ pretrained_model_name_or_path: ${model_name_or_path}
54
+ torch_dtype: ${torch_dtype}
55
+ attn_implementation: flash_attention_2
56
+ device_map: ${device_map}
57
+ read_tensor_dpo:
58
+ _target_: data.logic_combine.MultiMappingDataset
59
+ file_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
60
+ aligner:
61
+ _target_: data.input_aligner.concat_aligner
62
+ aligners:
63
+ - _target_: data.input_aligner.dpo_pair_aligner_cleaned
64
+ response_field: response
65
+ id_field: id
66
+ do_sample: false
67
+ template:
68
+ chosen: '{instruction}
69
+
70
+
71
+ ### Question: {query}
72
+
73
+
74
+ SubQuestion 1: {pos}<eos>'
75
+ reject: '{instruction}
76
+
77
+
78
+ ### Question: {query}
79
+
80
+
81
+ SubQuestion 1: {neg}<eos>'
82
+ prompt: '{instruction}
83
+
84
+
85
+ ### Question: {query}
86
+
87
+
88
+ SubQuestion 1:'
89
+ instruction: 'Given a question, please decompose it into sub-questions. For each
90
+ sub-question, please answer it in a complete sentence, ending with "The answer
91
+ is". When the original question is answerable, please start the sub-question with
92
+ "Now we can answer the question: ".'
93
+ kv_mapping:
94
+ chosen: chosen
95
+ reject: reject
96
+ id: index
97
+ prompt: prompt
98
+ read_tensor_step_dpo:
99
+ _target_: data.logic_combine.MultiMappingDataset
100
+ aligner:
101
+ _target_: data.logic_combine.field_extract_aligner
102
+ input_index_field: id
103
+ extract_index_field: id
104
+ extract_fields:
105
+ - query
106
+ extra_file: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
107
+ template:
108
+ chosen: '{instruction}
109
+
110
+
111
+ ### Question: {query}
112
+
113
+
114
+ SubQuestion 1: {chosen}<eos>'
115
+ reject: '{instruction}
116
+
117
+
118
+ ### Question: {query}
119
+
120
+
121
+ SubQuestion 1: {reject}<eos>'
122
+ prompt: '{instruction}
123
+
124
+
125
+ ### Question: {query}
126
+
127
+
128
+ SubQuestion 1:'
129
+ instruction: 'Given a question, please decompose it into sub-questions. For each
130
+ sub-question, please answer it in a complete sentence, ending with "The answer
131
+ is". When the original question is answerable, please start the sub-question with
132
+ "Now we can answer the question: ".'
133
+ kv_mapping:
134
+ chosen: chosen
135
+ reject: reject
136
+ id: index
137
+ prompt: prompt
138
+ read_tensor:
139
+ _target_: data.general.ReplayDataset
140
+ _recursive_: false
141
+ new_dataset_cfg: ${read_tensor_step_dpo}
142
+ old_dataset_cfg: ${read_tensor_dpo}
143
+ replay_ratio: 0.2
144
+ dist_load_data_barrier: false
145
+ extended_vocab: null
146
+ collator:
147
+ _target_: data.dpo.DPOCollator
148
+ tokenizer: ${tokenizer_init}
149
+ max_seq_length: 1024
150
+ num_workers: 8
151
+ prefetch_factor: 2
152
+ model_name_or_path: ${sft_model_dir}
153
+ pretrain: null
154
+ dp_size: 4
155
+ tp_size: 1
156
+ pp_size: 1
157
+ exp_name: gemma.2b.it.meta_math_rap.step.dpo.H100.w4.v1.0.s${seed}
158
+ exp_notes: null
159
+ output_dir: experiments/${exp_name}
160
+ do_train: true
161
+ evaluate_during_training: false
162
+ do_eval: false
163
+ eval_sub_path: checkpoint-100
164
+ per_gpu_train_batch_size: 2
165
+ per_gpu_eval_batch_size: 4
166
+ learning_rate: 1.0e-06
167
+ gradient_accumulation_steps: 8
168
+ weight_decay: 0.1
169
+ adam_epsilon: 1.0e-06
170
+ adam_betas: (0.9, 0.98)
171
+ total_dataset_len: 63348
172
+ max_grad_norm: 1.0
173
+ num_train_epochs: 1
174
+ max_steps: 0
175
+ warmup_proportion: 0.06
176
+ warmup_steps: 0
177
+ optimizer: null
178
+ use_nvlamb: null
179
+ bit_training: null
180
+ logging_steps: 5
181
+ save_ds_state: false
182
+ save_steps: 100
183
+ save_best: false
184
+ eval_steps: 400
185
+ ddp_eval: true
186
+ no_cuda: false
187
+ seed: 43
188
+ local_rank: 0
189
+ fp16: true
190
+ fp16_opt_level: O1
191
+ fp16_bfloat16: true
192
+ prediction_cfg:
193
+ metric: loss
194
+ measure: -1
195
+ best_checkpoint: null
196
+ best_result: null
197
+ eval_forward_fn:
198
+ _target_: general_util.evaluator.DefaultForwardFn
199
+ post_process:
200
+ _target_: post_processors.dpo.DPOEvalPostProcessor
201
+ summary_helper:
202
+ _target_: general_util.tensorboard_helper.WandbWriter
203
+ batch_index_or_keys: null
204
+ outputs_index_or_keys:
205
+ train/chosen_reward: chosen_reward
206
+ train/rejected_reward: rejected_reward
207
+ n_gpu: 1
208
+ device: cuda:0
209
+ train_batch_size: 2
210
+ eval_batch_size: null
211
+ world_size: 4
checkpoint-300/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 16384,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "gemma",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 18,
19
+ "num_key_value_heads": 1,
20
+ "pad_token_id": 0,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": true,
27
+ "vocab_size": 256000
28
+ }
checkpoint-300/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
checkpoint-300/gsm8k.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-300/gsm8k.test.v1.1.0shot.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-300/gsm8k.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.5261561789234268,
3
+ "correct": 694,
4
+ "total": 1319
5
+ }
checkpoint-300/math.test.v1.1.0shot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77b0a19307fb3fb8482ba835251ee99f900ee543012b9321a1b7fffba12b472f
3
+ size 10854292
checkpoint-300/math.test.v1.1.0shot.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8872901998a94c0184e4a12807565c81e05248fc24dfce24270eaba8c3f44cb4
3
+ size 21438580
checkpoint-300/math.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.1594,
3
+ "correct": 797,
4
+ "total": 5000
5
+ }
checkpoint-300/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae6ad37e169b44e1ed90724ef05f80000af38987f3aabd09255504b4bd889ef8
3
+ size 5012367854
checkpoint-300/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<start_of_turn>",
4
+ "<end_of_turn>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<bos>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<pad>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "unk_token": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }
checkpoint-300/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
3
+ size 17477929
checkpoint-300/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
3
+ size 4241003
checkpoint-300/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<eos>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "<bos>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "106": {
38
+ "content": "<start_of_turn>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "107": {
46
+ "content": "<end_of_turn>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ }
53
+ },
54
+ "additional_special_tokens": [
55
+ "<start_of_turn>",
56
+ "<end_of_turn>"
57
+ ],
58
+ "bos_token": "<bos>",
59
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
60
+ "clean_up_tokenization_spaces": false,
61
+ "eos_token": "<eos>",
62
+ "legacy": null,
63
+ "model_max_length": 1000000000000000019884624838656,
64
+ "pad_token": "<pad>",
65
+ "sp_model_kwargs": {},
66
+ "spaces_between_special_tokens": false,
67
+ "tokenizer_class": "GemmaTokenizer",
68
+ "unk_token": "<unk>",
69
+ "use_default_system_prompt": false
70
+ }
checkpoint-300/training_config.yaml ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ds_cfg:
2
+ train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
3
+ gradient_accumulation_steps: ${gradient_accumulation_steps}
4
+ scheduler:
5
+ type: WarmupDecayLR
6
+ params:
7
+ total_num_steps: 989
8
+ warmup_max_lr: ${learning_rate}
9
+ warmup_num_steps: 59
10
+ warmup_type: linear
11
+ optimizer:
12
+ type: AdamW
13
+ params:
14
+ lr: ${learning_rate}
15
+ betas:
16
+ - 0.9
17
+ - 0.95
18
+ eps: 1.0e-06
19
+ weight_decay: ${weight_decay}
20
+ bf16:
21
+ enabled: true
22
+ zero_optimization:
23
+ stage: 1
24
+ stage3_param_persistence_threshold: 100000.0
25
+ stage3_max_live_parameters: 100000000.0
26
+ stage3_prefetch_bucket_size: 100000000.0
27
+ memory_efficient_linear: false
28
+ steps_per_print: 25
29
+ gradient_clipping: 1.0
30
+ prescale_gradients: false
31
+ sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
32
+ train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.prm_cp3200_best_of_10.v1.0.(2,3).pos0.5.neg10.product.full_only_pos2pos.json
33
+ dev_file: null
34
+ test_file: null
35
+ torch_dtype:
36
+ _target_: general_util.training_utils.return_torch_dtype
37
+ dtype: bfloat16
38
+ tokenizer_init:
39
+ _target_: general_util.tokenization_utils.init_tokenizer
40
+ tokenizer_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
41
+ padding_side: left
42
+ device_map:
43
+ _target_: models.utils.return_single_device_map
44
+ model:
45
+ _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
46
+ beta: 0.5
47
+ gradient_checkpointing: false
48
+ attn_implementation: flash_attention_2
49
+ torch_dtype: ${torch_dtype}
50
+ device_map: ${device_map}
51
+ ref_model:
52
+ _target_: models.gemma.GemmaForCausalLM.from_pretrained
53
+ pretrained_model_name_or_path: ${model_name_or_path}
54
+ torch_dtype: ${torch_dtype}
55
+ attn_implementation: flash_attention_2
56
+ device_map: ${device_map}
57
+ read_tensor_dpo:
58
+ _target_: data.logic_combine.MultiMappingDataset
59
+ file_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
60
+ aligner:
61
+ _target_: data.input_aligner.concat_aligner
62
+ aligners:
63
+ - _target_: data.input_aligner.dpo_pair_aligner_cleaned
64
+ response_field: response
65
+ id_field: id
66
+ do_sample: false
67
+ template:
68
+ chosen: '{instruction}
69
+
70
+
71
+ ### Question: {query}
72
+
73
+
74
+ SubQuestion 1: {pos}<eos>'
75
+ reject: '{instruction}
76
+
77
+
78
+ ### Question: {query}
79
+
80
+
81
+ SubQuestion 1: {neg}<eos>'
82
+ prompt: '{instruction}
83
+
84
+
85
+ ### Question: {query}
86
+
87
+
88
+ SubQuestion 1:'
89
+ instruction: 'Given a question, please decompose it into sub-questions. For each
90
+ sub-question, please answer it in a complete sentence, ending with "The answer
91
+ is". When the original question is answerable, please start the sub-question with
92
+ "Now we can answer the question: ".'
93
+ kv_mapping:
94
+ chosen: chosen
95
+ reject: reject
96
+ id: index
97
+ prompt: prompt
98
+ read_tensor_step_dpo:
99
+ _target_: data.logic_combine.MultiMappingDataset
100
+ aligner:
101
+ _target_: data.logic_combine.field_extract_aligner
102
+ input_index_field: id
103
+ extract_index_field: id
104
+ extract_fields:
105
+ - query
106
+ extra_file: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000//meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
107
+ template:
108
+ chosen: '{instruction}
109
+
110
+
111
+ ### Question: {query}
112
+
113
+
114
+ SubQuestion 1: {chosen}<eos>'
115
+ reject: '{instruction}
116
+
117
+
118
+ ### Question: {query}
119
+
120
+
121
+ SubQuestion 1: {reject}<eos>'
122
+ prompt: '{instruction}
123
+
124
+
125
+ ### Question: {query}
126
+
127
+
128
+ SubQuestion 1:'
129
+ instruction: 'Given a question, please decompose it into sub-questions. For each
130
+ sub-question, please answer it in a complete sentence, ending with "The answer
131
+ is". When the original question is answerable, please start the sub-question with
132
+ "Now we can answer the question: ".'
133
+ kv_mapping:
134
+ chosen: chosen
135
+ reject: reject
136
+ id: index
137
+ prompt: prompt
138
+ read_tensor:
139
+ _target_: data.general.ReplayDataset
140
+ _recursive_: false
141
+ new_dataset_cfg: ${read_tensor_step_dpo}
142
+ old_dataset_cfg: ${read_tensor_dpo}
143
+ replay_ratio: 0.2
144
+ dist_load_data_barrier: false
145
+ extended_vocab: null
146
+ collator:
147
+ _target_: data.dpo.DPOCollator
148
+ tokenizer: ${tokenizer_init}
149
+ max_seq_length: 1024
150
+ num_workers: 8
151
+ prefetch_factor: 2
152
+ model_name_or_path: ${sft_model_dir}
153
+ pretrain: null
154
+ dp_size: 4
155
+ tp_size: 1
156
+ pp_size: 1
157
+ exp_name: gemma.2b.it.meta_math_rap.step.dpo.H100.w4.v1.0.s${seed}
158
+ exp_notes: null
159
+ output_dir: experiments/${exp_name}
160
+ do_train: true
161
+ evaluate_during_training: false
162
+ do_eval: false
163
+ eval_sub_path: checkpoint-100
164
+ per_gpu_train_batch_size: 2
165
+ per_gpu_eval_batch_size: 4
166
+ learning_rate: 1.0e-06
167
+ gradient_accumulation_steps: 8
168
+ weight_decay: 0.1
169
+ adam_epsilon: 1.0e-06
170
+ adam_betas: (0.9, 0.98)
171
+ total_dataset_len: 63348
172
+ max_grad_norm: 1.0
173
+ num_train_epochs: 1
174
+ max_steps: 0
175
+ warmup_proportion: 0.06
176
+ warmup_steps: 0
177
+ optimizer: null
178
+ use_nvlamb: null
179
+ bit_training: null
180
+ logging_steps: 5
181
+ save_ds_state: false
182
+ save_steps: 100
183
+ save_best: false
184
+ eval_steps: 400
185
+ ddp_eval: true
186
+ no_cuda: false
187
+ seed: 43
188
+ local_rank: 0
189
+ fp16: true
190
+ fp16_opt_level: O1
191
+ fp16_bfloat16: true
192
+ prediction_cfg:
193
+ metric: loss
194
+ measure: -1
195
+ best_checkpoint: null
196
+ best_result: null
197
+ eval_forward_fn:
198
+ _target_: general_util.evaluator.DefaultForwardFn
199
+ post_process:
200
+ _target_: post_processors.dpo.DPOEvalPostProcessor
201
+ summary_helper:
202
+ _target_: general_util.tensorboard_helper.WandbWriter
203
+ batch_index_or_keys: null
204
+ outputs_index_or_keys:
205
+ train/chosen_reward: chosen_reward
206
+ train/rejected_reward: rejected_reward
207
+ n_gpu: 1
208
+ device: cuda:0
209
+ train_batch_size: 2
210
+ eval_batch_size: null
211
+ world_size: 4
checkpoint-400/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
3
+ "architectures": [
4
+ "GemmaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 2,
9
+ "eos_token_id": 1,
10
+ "head_dim": 256,
11
+ "hidden_act": "gelu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 16384,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "gemma",
17
+ "num_attention_heads": 8,
18
+ "num_hidden_layers": 18,
19
+ "num_key_value_heads": 1,
20
+ "pad_token_id": 0,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.38.2",
26
+ "use_cache": true,
27
+ "vocab_size": 256000
28
+ }
checkpoint-400/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.38.2"
7
+ }
checkpoint-400/gsm8k.test.v1.1.0shot.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-400/gsm8k.test.v1.1.0shot.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-400/gsm8k.test.v1.1.0shot.metrics.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "acc": 0.5200909780136467,
3
+ "correct": 686,
4
+ "total": 1319
5
+ }
checkpoint-400/math.test.v1.1.0shot.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f15ce99022a48d9acb6f9ce9588fec9468144e9b72ac88522fdb388a42a3918
3
+ size 11489913
checkpoint-400/math.test.v1.1.0shot.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cc017f82fd3d5386cdd82e01350e8a6a1ee345a54440327515aa616d2c53200
3
+ size 22709822