maviddoerdijk committed on
Commit
6a2d9e2
·
verified ·
1 Parent(s): 6a4b561

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
3
- datasets: datasets/id_NL_training_data_subset
4
  library_name: transformers
5
  model_name: Qwen2-VL-2B-Instruct-SFT
6
  tags:
7
  - generated_from_trainer
8
- - R1-V
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2-VL-2B-Instruct-SFT
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) on the [datasets/id_NL_training_data_subset](https://huggingface.co/datasets/datasets/id_NL_training_data_subset) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/davidmoerdijk-smart-backoffice/huggingface/runs/jqfik2jd)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2-VL-2B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2-VL-2B-Instruct-SFT
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2-VL-2B-Instruct-SFT
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/davidmoerdijk-smart-backoffice/huggingface/runs/upgtle1o)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 1233007047853056.0,
3
- "train_loss": 0.5888182865945917,
4
- "train_runtime": 310.5387,
5
- "train_samples": 19,
6
  "train_samples_per_second": 0.061,
7
  "train_steps_per_second": 0.061
8
  }
 
1
  {
2
+ "total_flos": 2.6349158744595456e+16,
3
+ "train_loss": 0.35062233002289483,
4
+ "train_runtime": 5984.3116,
5
+ "train_samples": 368,
6
  "train_samples_per_second": 0.061,
7
  "train_steps_per_second": 0.061
8
  }
config.json CHANGED
@@ -32,7 +32,7 @@
32
  "tie_word_embeddings": true,
33
  "torch_dtype": "bfloat16",
34
  "transformers_version": "4.49.0",
35
- "use_cache": true,
36
  "use_sliding_window": false,
37
  "video_token_id": 151656,
38
  "vision_config": {
 
32
  "tie_word_embeddings": true,
33
  "torch_dtype": "bfloat16",
34
  "transformers_version": "4.49.0",
35
+ "use_cache": false,
36
  "use_sliding_window": false,
37
  "video_token_id": 151656,
38
  "vision_config": {
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0285bea178194a3fdec435d742637af8d723203e0655feae60d78580716ef9e
3
  size 4418050848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f156ccc52a7782c764c1e608a1cedaa9dd68fb02692e8f00ea367e59881ac92
3
  size 4418050848
tokenizer_config.json CHANGED
@@ -138,7 +138,6 @@
138
  "model_max_length": 32768,
139
  "pad_token": "<|endoftext|>",
140
  "padding_side": "left",
141
- "processor_class": "Qwen2VLProcessor",
142
  "split_special_tokens": false,
143
  "tokenizer_class": "Qwen2Tokenizer",
144
  "unk_token": null
 
138
  "model_max_length": 32768,
139
  "pad_token": "<|endoftext|>",
140
  "padding_side": "left",
 
141
  "split_special_tokens": false,
142
  "tokenizer_class": "Qwen2Tokenizer",
143
  "unk_token": null
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 1233007047853056.0,
3
- "train_loss": 0.5888182865945917,
4
- "train_runtime": 310.5387,
5
- "train_samples": 19,
6
  "train_samples_per_second": 0.061,
7
  "train_steps_per_second": 0.061
8
  }
 
1
  {
2
+ "total_flos": 2.6349158744595456e+16,
3
+ "train_loss": 0.35062233002289483,
4
+ "train_runtime": 5984.3116,
5
+ "train_samples": 368,
6
  "train_samples_per_second": 0.061,
7
  "train_steps_per_second": 0.061
8
  }
trainer_state.json CHANGED
@@ -3,48 +3,608 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 19,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.2631578947368421,
13
- "grad_norm": 7.8125,
14
- "learning_rate": 1.8502171357296144e-05,
15
- "loss": 1.573,
16
- "mean_token_accuracy": 0.6957699418067932,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.5263157894736842,
21
- "grad_norm": 12.1875,
22
- "learning_rate": 1.092268359463302e-05,
23
- "loss": 0.3895,
24
- "mean_token_accuracy": 0.9196829557418823,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.7894736842105263,
29
- "grad_norm": 4.6875,
30
- "learning_rate": 2.6099108277934105e-06,
31
- "loss": 0.2011,
32
- "mean_token_accuracy": 0.9640364289283753,
33
  "step": 15
34
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  {
36
  "epoch": 1.0,
37
- "mean_token_accuracy": 0.9887278527021408,
38
- "step": 19,
39
- "total_flos": 1233007047853056.0,
40
- "train_loss": 0.5888182865945917,
41
- "train_runtime": 310.5387,
42
  "train_samples_per_second": 0.061,
43
  "train_steps_per_second": 0.061
44
  }
45
  ],
46
  "logging_steps": 5,
47
- "max_steps": 19,
48
  "num_input_tokens_seen": 0,
49
  "num_train_epochs": 1,
50
  "save_steps": 500,
@@ -60,7 +620,7 @@
60
  "attributes": {}
61
  }
62
  },
63
- "total_flos": 1233007047853056.0,
64
  "train_batch_size": 1,
65
  "trial_name": null,
66
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 368,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01358695652173913,
13
+ "grad_norm": 20.125,
14
+ "learning_rate": 2.702702702702703e-06,
15
+ "loss": 1.708,
16
+ "mean_token_accuracy": 0.6925400972366333,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.02717391304347826,
21
+ "grad_norm": 12.75,
22
+ "learning_rate": 5.405405405405406e-06,
23
+ "loss": 1.691,
24
+ "mean_token_accuracy": 0.6875800609588623,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 0.04076086956521739,
29
+ "grad_norm": 13.125,
30
+ "learning_rate": 8.108108108108109e-06,
31
+ "loss": 1.549,
32
+ "mean_token_accuracy": 0.7068500399589539,
33
  "step": 15
34
  },
35
+ {
36
+ "epoch": 0.05434782608695652,
37
+ "grad_norm": 11.375,
38
+ "learning_rate": 1.0810810810810812e-05,
39
+ "loss": 1.3324,
40
+ "mean_token_accuracy": 0.7429514169692993,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.06793478260869565,
45
+ "grad_norm": 14.875,
46
+ "learning_rate": 1.3513513513513515e-05,
47
+ "loss": 0.9671,
48
+ "mean_token_accuracy": 0.8036805391311646,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.08152173913043478,
53
+ "grad_norm": 13.5,
54
+ "learning_rate": 1.6216216216216218e-05,
55
+ "loss": 0.9606,
56
+ "mean_token_accuracy": 0.8269321084022522,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.09510869565217392,
61
+ "grad_norm": 13.625,
62
+ "learning_rate": 1.891891891891892e-05,
63
+ "loss": 0.6205,
64
+ "mean_token_accuracy": 0.8943289399147034,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.10869565217391304,
69
+ "grad_norm": 11.4375,
70
+ "learning_rate": 1.9995946530314384e-05,
71
+ "loss": 0.3571,
72
+ "mean_token_accuracy": 0.934650433063507,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.12228260869565218,
77
+ "grad_norm": 8.75,
78
+ "learning_rate": 1.9971187226043746e-05,
79
+ "loss": 0.4531,
80
+ "mean_token_accuracy": 0.9206116795539856,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.1358695652173913,
85
+ "grad_norm": 8.875,
86
+ "learning_rate": 1.9923976226947417e-05,
87
+ "loss": 0.4687,
88
+ "mean_token_accuracy": 0.916878092288971,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.14945652173913043,
93
+ "grad_norm": 5.3125,
94
+ "learning_rate": 1.985441983600819e-05,
95
+ "loss": 0.3238,
96
+ "mean_token_accuracy": 0.9440895915031433,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.16304347826086957,
101
+ "grad_norm": 16.375,
102
+ "learning_rate": 1.9762674670369757e-05,
103
+ "loss": 0.2045,
104
+ "mean_token_accuracy": 0.9566964030265808,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.1766304347826087,
109
+ "grad_norm": 9.25,
110
+ "learning_rate": 1.9648947308688594e-05,
111
+ "loss": 0.2272,
112
+ "mean_token_accuracy": 0.9550906538963317,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.19021739130434784,
117
+ "grad_norm": 13.75,
118
+ "learning_rate": 1.9513493825989664e-05,
119
+ "loss": 0.4506,
120
+ "mean_token_accuracy": 0.9193356156349182,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.20380434782608695,
125
+ "grad_norm": 12.1875,
126
+ "learning_rate": 1.9356619217073252e-05,
127
+ "loss": 0.4804,
128
+ "mean_token_accuracy": 0.9193973302841186,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.21739130434782608,
133
+ "grad_norm": 6.96875,
134
+ "learning_rate": 1.917867670977126e-05,
135
+ "loss": 0.4535,
136
+ "mean_token_accuracy": 0.9251804232597352,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.23097826086956522,
141
+ "grad_norm": 12.3125,
142
+ "learning_rate": 1.8980066969599216e-05,
143
+ "loss": 0.5313,
144
+ "mean_token_accuracy": 0.9139753341674804,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.24456521739130435,
149
+ "grad_norm": 4.75,
150
+ "learning_rate": 1.8761237197594945e-05,
151
+ "loss": 0.2025,
152
+ "mean_token_accuracy": 0.9598217248916626,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.25815217391304346,
157
+ "grad_norm": 2.859375,
158
+ "learning_rate": 1.852268012337514e-05,
159
+ "loss": 0.4308,
160
+ "mean_token_accuracy": 0.9284366369247437,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.2717391304347826,
165
+ "grad_norm": 9.25,
166
+ "learning_rate": 1.8264932895677195e-05,
167
+ "loss": 0.4234,
168
+ "mean_token_accuracy": 0.930481493473053,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.28532608695652173,
173
+ "grad_norm": 7.5,
174
+ "learning_rate": 1.798857587288445e-05,
175
+ "loss": 0.3343,
176
+ "mean_token_accuracy": 0.9406642079353332,
177
+ "step": 105
178
+ },
179
+ {
180
+ "epoch": 0.29891304347826086,
181
+ "grad_norm": 12.5,
182
+ "learning_rate": 1.769423131625808e-05,
183
+ "loss": 0.198,
184
+ "mean_token_accuracy": 0.9616581201553345,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.3125,
189
+ "grad_norm": 6.875,
190
+ "learning_rate": 1.738256198881809e-05,
191
+ "loss": 0.2704,
192
+ "mean_token_accuracy": 0.9502018094062805,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 0.32608695652173914,
197
+ "grad_norm": 2.859375,
198
+ "learning_rate": 1.7054269663028232e-05,
199
+ "loss": 0.208,
200
+ "mean_token_accuracy": 0.9611244797706604,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.33967391304347827,
205
+ "grad_norm": 3.859375,
206
+ "learning_rate": 1.6710093540645056e-05,
207
+ "loss": 0.364,
208
+ "mean_token_accuracy": 0.9366228461265564,
209
+ "step": 125
210
+ },
211
+ {
212
+ "epoch": 0.3532608695652174,
213
+ "grad_norm": 5.6875,
214
+ "learning_rate": 1.6350808588288964e-05,
215
+ "loss": 0.1251,
216
+ "mean_token_accuracy": 0.9757622718811035,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.36684782608695654,
221
+ "grad_norm": 7.1875,
222
+ "learning_rate": 1.597722379248512e-05,
223
+ "loss": 0.2729,
224
+ "mean_token_accuracy": 0.9491044163703919,
225
+ "step": 135
226
+ },
227
+ {
228
+ "epoch": 0.3804347826086957,
229
+ "grad_norm": 7.84375,
230
+ "learning_rate": 1.559018033810316e-05,
231
+ "loss": 0.3411,
232
+ "mean_token_accuracy": 0.9376911044120788,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.39402173913043476,
237
+ "grad_norm": 2.65625,
238
+ "learning_rate": 1.5190549714297303e-05,
239
+ "loss": 0.2513,
240
+ "mean_token_accuracy": 0.9505762934684754,
241
+ "step": 145
242
+ },
243
+ {
244
+ "epoch": 0.4076086956521739,
245
+ "grad_norm": 5.875,
246
+ "learning_rate": 1.4779231752211546e-05,
247
+ "loss": 0.1996,
248
+ "mean_token_accuracy": 0.9632942318916321,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.421195652173913,
253
+ "grad_norm": 3.8125,
254
+ "learning_rate": 1.4357152598868478e-05,
255
+ "loss": 0.2669,
256
+ "mean_token_accuracy": 0.9502842426300049,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 0.43478260869565216,
261
+ "grad_norm": 5.34375,
262
+ "learning_rate": 1.3925262631803722e-05,
263
+ "loss": 0.1767,
264
+ "mean_token_accuracy": 0.962754237651825,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.4483695652173913,
269
+ "grad_norm": 2.328125,
270
+ "learning_rate": 1.3484534319141592e-05,
271
+ "loss": 0.1936,
272
+ "mean_token_accuracy": 0.9623476982116699,
273
+ "step": 165
274
+ },
275
+ {
276
+ "epoch": 0.46195652173913043,
277
+ "grad_norm": 4.09375,
278
+ "learning_rate": 1.303596002993028e-05,
279
+ "loss": 0.2875,
280
+ "mean_token_accuracy": 0.948505699634552,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.47554347826086957,
285
+ "grad_norm": 5.0625,
286
+ "learning_rate": 1.2580549799667034e-05,
287
+ "loss": 0.1843,
288
+ "mean_token_accuracy": 0.9624204635620117,
289
+ "step": 175
290
+ },
291
+ {
292
+ "epoch": 0.4891304347826087,
293
+ "grad_norm": 5.21875,
294
+ "learning_rate": 1.2119329056044533e-05,
295
+ "loss": 0.1725,
296
+ "mean_token_accuracy": 0.9649592399597168,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 0.5027173913043478,
301
+ "grad_norm": 8.8125,
302
+ "learning_rate": 1.165333631003928e-05,
303
+ "loss": 0.3102,
304
+ "mean_token_accuracy": 0.9432898998260498,
305
+ "step": 185
306
+ },
307
+ {
308
+ "epoch": 0.5163043478260869,
309
+ "grad_norm": 11.6875,
310
+ "learning_rate": 1.1183620817540985e-05,
311
+ "loss": 0.2023,
312
+ "mean_token_accuracy": 0.9577706575393676,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 0.529891304347826,
317
+ "grad_norm": 3.734375,
318
+ "learning_rate": 1.0711240216788036e-05,
319
+ "loss": 0.3165,
320
+ "mean_token_accuracy": 0.9471179962158203,
321
+ "step": 195
322
+ },
323
+ {
324
+ "epoch": 0.5434782608695652,
325
+ "grad_norm": 6.90625,
326
+ "learning_rate": 1.0237258146928849e-05,
327
+ "loss": 0.1964,
328
+ "mean_token_accuracy": 0.9606888651847839,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 0.5570652173913043,
333
+ "grad_norm": 5.875,
334
+ "learning_rate": 9.762741853071153e-06,
335
+ "loss": 0.2211,
336
+ "mean_token_accuracy": 0.9585472226142884,
337
+ "step": 205
338
+ },
339
+ {
340
+ "epoch": 0.5706521739130435,
341
+ "grad_norm": 5.53125,
342
+ "learning_rate": 9.288759783211967e-06,
343
+ "loss": 0.2484,
344
+ "mean_token_accuracy": 0.9550203323364258,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 0.5842391304347826,
349
+ "grad_norm": 2.8125,
350
+ "learning_rate": 8.81637918245902e-06,
351
+ "loss": 0.3283,
352
+ "mean_token_accuracy": 0.9439280152320861,
353
+ "step": 215
354
+ },
355
+ {
356
+ "epoch": 0.5978260869565217,
357
+ "grad_norm": 3.8125,
358
+ "learning_rate": 8.346663689960724e-06,
359
+ "loss": 0.2486,
360
+ "mean_token_accuracy": 0.953123664855957,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 0.6114130434782609,
365
+ "grad_norm": 5.4375,
366
+ "learning_rate": 7.880670943955467e-06,
367
+ "loss": 0.2411,
368
+ "mean_token_accuracy": 0.9570030689239502,
369
+ "step": 225
370
+ },
371
+ {
372
+ "epoch": 0.625,
373
+ "grad_norm": 2.078125,
374
+ "learning_rate": 7.419450200332965e-06,
375
+ "loss": 0.2028,
376
+ "mean_token_accuracy": 0.9596728563308716,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.6385869565217391,
381
+ "grad_norm": 2.9375,
382
+ "learning_rate": 6.964039970069722e-06,
383
+ "loss": 0.2145,
384
+ "mean_token_accuracy": 0.9589278101921082,
385
+ "step": 235
386
+ },
387
+ {
388
+ "epoch": 0.6521739130434783,
389
+ "grad_norm": 6.4375,
390
+ "learning_rate": 6.515465680858412e-06,
391
+ "loss": 0.2646,
392
+ "mean_token_accuracy": 0.9482977151870727,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 0.6657608695652174,
397
+ "grad_norm": 5.46875,
398
+ "learning_rate": 6.074737368196279e-06,
399
+ "loss": 0.2332,
400
+ "mean_token_accuracy": 0.9552296161651611,
401
+ "step": 245
402
+ },
403
+ {
404
+ "epoch": 0.6793478260869565,
405
+ "grad_norm": 1.8984375,
406
+ "learning_rate": 5.642847401131526e-06,
407
+ "loss": 0.2629,
408
+ "mean_token_accuracy": 0.953015148639679,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 0.6929347826086957,
413
+ "grad_norm": 6.8125,
414
+ "learning_rate": 5.220768247788458e-06,
415
+ "loss": 0.2226,
416
+ "mean_token_accuracy": 0.9579466819763184,
417
+ "step": 255
418
+ },
419
+ {
420
+ "epoch": 0.7065217391304348,
421
+ "grad_norm": 2.03125,
422
+ "learning_rate": 4.809450285702697e-06,
423
+ "loss": 0.1219,
424
+ "mean_token_accuracy": 0.9744248032569885,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 0.720108695652174,
429
+ "grad_norm": 4.5,
430
+ "learning_rate": 4.409819661896839e-06,
431
+ "loss": 0.1909,
432
+ "mean_token_accuracy": 0.9654230594635009,
433
+ "step": 265
434
+ },
435
+ {
436
+ "epoch": 0.7336956521739131,
437
+ "grad_norm": 8.375,
438
+ "learning_rate": 4.022776207514885e-06,
439
+ "loss": 0.183,
440
+ "mean_token_accuracy": 0.9614866137504577,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 0.7472826086956522,
445
+ "grad_norm": 2.53125,
446
+ "learning_rate": 3.6491914117110405e-06,
447
+ "loss": 0.2841,
448
+ "mean_token_accuracy": 0.9495038151741028,
449
+ "step": 275
450
+ },
451
+ {
452
+ "epoch": 0.7608695652173914,
453
+ "grad_norm": 3.5,
454
+ "learning_rate": 3.2899064593549477e-06,
455
+ "loss": 0.147,
456
+ "mean_token_accuracy": 0.9706088781356812,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 0.7744565217391305,
461
+ "grad_norm": 9.375,
462
+ "learning_rate": 2.945730336971767e-06,
463
+ "loss": 0.2865,
464
+ "mean_token_accuracy": 0.9473696708679199,
465
+ "step": 285
466
+ },
467
+ {
468
+ "epoch": 0.7880434782608695,
469
+ "grad_norm": 3.140625,
470
+ "learning_rate": 2.6174380111819144e-06,
471
+ "loss": 0.1647,
472
+ "mean_token_accuracy": 0.9646413207054139,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 0.8016304347826086,
477
+ "grad_norm": 3.921875,
478
+ "learning_rate": 2.3057686837419246e-06,
479
+ "loss": 0.127,
480
+ "mean_token_accuracy": 0.9736526370048523,
481
+ "step": 295
482
+ },
483
+ {
484
+ "epoch": 0.8152173913043478,
485
+ "grad_norm": 2.78125,
486
+ "learning_rate": 2.011424127115552e-06,
487
+ "loss": 0.1714,
488
+ "mean_token_accuracy": 0.963629424571991,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 0.8288043478260869,
493
+ "grad_norm": 3.515625,
494
+ "learning_rate": 1.7350671043228072e-06,
495
+ "loss": 0.2071,
496
+ "mean_token_accuracy": 0.9631360173225403,
497
+ "step": 305
498
+ },
499
+ {
500
+ "epoch": 0.842391304347826,
501
+ "grad_norm": 2.375,
502
+ "learning_rate": 1.4773198766248642e-06,
503
+ "loss": 0.1566,
504
+ "mean_token_accuracy": 0.971684205532074,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.8559782608695652,
509
+ "grad_norm": 2.265625,
510
+ "learning_rate": 1.2387628024050557e-06,
511
+ "loss": 0.2517,
512
+ "mean_token_accuracy": 0.9517509698867798,
513
+ "step": 315
514
+ },
515
+ {
516
+ "epoch": 0.8695652173913043,
517
+ "grad_norm": 6.5625,
518
+ "learning_rate": 1.0199330304007858e-06,
519
+ "loss": 0.247,
520
+ "mean_token_accuracy": 0.9540893912315369,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 0.8831521739130435,
525
+ "grad_norm": 6.65625,
526
+ "learning_rate": 8.213232902287438e-07,
527
+ "loss": 0.1369,
528
+ "mean_token_accuracy": 0.9701100349426269,
529
+ "step": 325
530
+ },
531
+ {
532
+ "epoch": 0.8967391304347826,
533
+ "grad_norm": 1.90625,
534
+ "learning_rate": 6.433807829267491e-07,
535
+ "loss": 0.1382,
536
+ "mean_token_accuracy": 0.9745123624801636,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 0.9103260869565217,
541
+ "grad_norm": 5.125,
542
+ "learning_rate": 4.865061740103361e-07,
543
+ "loss": 0.1746,
544
+ "mean_token_accuracy": 0.9617828845977783,
545
+ "step": 335
546
+ },
547
+ {
548
+ "epoch": 0.9239130434782609,
549
+ "grad_norm": 2.765625,
550
+ "learning_rate": 3.510526913114065e-07,
551
+ "loss": 0.2696,
552
+ "mean_token_accuracy": 0.9500232100486755,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 0.9375,
557
+ "grad_norm": 3.5,
558
+ "learning_rate": 2.3732532963024468e-07,
559
+ "loss": 0.2069,
560
+ "mean_token_accuracy": 0.9611208081245423,
561
+ "step": 345
562
+ },
563
+ {
564
+ "epoch": 0.9510869565217391,
565
+ "grad_norm": 5.6875,
566
+ "learning_rate": 1.4558016399181086e-07,
567
+ "loss": 0.1823,
568
+ "mean_token_accuracy": 0.965398907661438,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 0.9646739130434783,
573
+ "grad_norm": 3.359375,
574
+ "learning_rate": 7.602377305258479e-08,
575
+ "loss": 0.1987,
576
+ "mean_token_accuracy": 0.9622387170791626,
577
+ "step": 355
578
+ },
579
+ {
580
+ "epoch": 0.9782608695652174,
581
+ "grad_norm": 2.203125,
582
+ "learning_rate": 2.8812773956256034e-08,
583
+ "loss": 0.1906,
584
+ "mean_token_accuracy": 0.9647317886352539,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 0.9918478260869565,
589
+ "grad_norm": 4.09375,
590
+ "learning_rate": 4.053469685617595e-09,
591
+ "loss": 0.2464,
592
+ "mean_token_accuracy": 0.9541746735572815,
593
+ "step": 365
594
+ },
595
  {
596
  "epoch": 1.0,
597
+ "mean_token_accuracy": 0.9636613726615906,
598
+ "step": 368,
599
+ "total_flos": 2.6349158744595456e+16,
600
+ "train_loss": 0.35062233002289483,
601
+ "train_runtime": 5984.3116,
602
  "train_samples_per_second": 0.061,
603
  "train_steps_per_second": 0.061
604
  }
605
  ],
606
  "logging_steps": 5,
607
+ "max_steps": 368,
608
  "num_input_tokens_seen": 0,
609
  "num_train_epochs": 1,
610
  "save_steps": 500,
 
620
  "attributes": {}
621
  }
622
  },
623
+ "total_flos": 2.6349158744595456e+16,
624
  "train_batch_size": 1,
625
  "trial_name": null,
626
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd9bf3d606929546a022d892646573bcc3e970260585008b47f7aeae047c4361
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91230238ed1969a866d7d8c0c74c9ddc8fe5ca75f081aec08d689c59e078032f
3
  size 5816