Askinkaty committed on
Commit cf352fd · verified · 1 Parent(s): 9853722

Upload 11 files

config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "_name_or_path": "TurkuNLP/bert-base-finnish-uncased-v1",
+ "architectures": [
+ "CustomModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "classifier_dropout": null,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "position_embedding_type": "absolute",
+ "torch_dtype": "float32",
+ "transformers_version": "4.44.2",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 50101
+ }
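
This config describes a BERT-base sized encoder (12 layers, 12 heads, hidden size 768, vocab 50101) initialized from TurkuNLP/bert-base-finnish-uncased-v1, but "architectures" lists "CustomModel", a class that is not part of transformers, so AutoModel cannot instantiate it directly. A minimal sketch of inspecting the config and loading only the underlying encoder; the local directory name ./checkpoint-4000 is an assumption, not a path from this commit:

# Minimal sketch, assuming the uploaded files live in ./checkpoint-4000.
# Only the config and the base Finnish BERT encoder are loaded here; the
# custom head ("CustomModel", apparently a regression head given the
# eval_mse/eval_r2 metrics logged below) needs its own class definition.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("./checkpoint-4000")          # reads this config.json
print(config.model_type, config.hidden_size, config.vocab_size)   # bert 768 50101

encoder = AutoModel.from_pretrained("TurkuNLP/bert-base-finnish-uncased-v1")
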
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:315f1eb82f1131dbbac9aac43c9ff10d0306298104ca83400ed11956911c9a61
+ size 498493620
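
model.safetensors is committed as a Git LFS pointer: the three lines above record only the spec version, the SHA-256 of the real payload, and its size (~498 MB); the weights themselves live in LFS storage. A hedged sketch of fetching the resolved file with huggingface_hub; the repo_id is a placeholder, since the commit view does not show the repository name:

# Minimal sketch, assuming a repository id; hf_hub_download resolves the
# LFS pointer and returns a local cache path to the actual ~498 MB file.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(repo_id="Askinkaty/<repo-name>",   # placeholder repo id
                               filename="model.safetensors")
print(weights_path)
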
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d591d16b130eadc5ac4c7e1fd2ec3e8a0971657b041f970d259b64ade4c146e9
+ size 997109754
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4433083b78374fda17b67e9a3e865a50a40744ffdc2411a580ba008969a4e4d0
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26934ede0e8874c268bcb910767c4746fed0c4849c5de1c8634b58cd96143014
+ size 1064
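
optimizer.pt, scheduler.pt and rng_state.pth (together with trainer_state.json below) are the Trainer's resumable training state, also stored as LFS pointers. A quick sketch of inspecting them locally with PyTorch; the directory name is an assumption:

# Minimal sketch: peek at the saved optimizer and LR-scheduler state.
import torch

opt_state = torch.load("./checkpoint-4000/optimizer.pt", map_location="cpu")
sched_state = torch.load("./checkpoint-4000/scheduler.pt", map_location="cpu")
print(opt_state.keys())   # typically dict_keys(['state', 'param_groups'])
print(sched_state)        # scheduler state dict (e.g. last_epoch, _last_lr)
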
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "cls_token": "[CLS]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "101": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "102": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "103": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "104": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_lower_case": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "strip_accents": null,
+ "tokenize_chinese_chars": true,
+ "tokenizer_class": "BertTokenizer",
+ "unk_token": "[UNK]"
+ }
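
tokenizer_config.json declares a lower-casing BertTokenizer with model_max_length 512 and the five special tokens mapped to ids 0, 101, 102, 103 and 104. A small sketch of loading it and checking those ids; the local directory name is an assumption, and any folder holding tokenizer.json, tokenizer_config.json, special_tokens_map.json and vocab.txt from this commit would work:

# Minimal sketch: load the uploaded tokenizer and verify the special-token ids.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint-4000")
print(tok.pad_token_id, tok.unk_token_id, tok.cls_token_id,
      tok.sep_token_id, tok.mask_token_id)          # expected: 0 101 102 103 104
print(tok("Tämä on testi.")["input_ids"])           # ids framed by 102 ([CLS]) and 103 ([SEP])
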
trainer_state.json ADDED
@@ -0,0 +1,762 @@
+ {
+ "best_metric": 0.5330753326416016,
+ "best_model_checkpoint": "/scratch/project_2006600/dif_models/fi_bert_reg/diff_bert_base_sampled_unfrozen_early_real_test/checkpoint-3500",
+ "epoch": 10.666666666666666,
+ "eval_steps": 100,
+ "global_step": 4000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.26666666666666666,
+ "grad_norm": 37.87092208862305,
+ "learning_rate": 5.444444444444444e-07,
+ "loss": 13.2633,
+ "step": 100
+ },
+ {
+ "epoch": 0.26666666666666666,
+ "eval_loss": 10.746745109558105,
+ "eval_mse": 10.746745109558105,
+ "eval_r2": -3.070736885070801,
+ "eval_rmse": 3.2782227993011475,
+ "eval_runtime": 57.8905,
+ "eval_samples_per_second": 103.644,
+ "eval_steps_per_second": 25.911,
+ "step": 100
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "grad_norm": 11.965848922729492,
+ "learning_rate": 1.0944444444444445e-06,
+ "loss": 6.0044,
+ "step": 200
+ },
+ {
+ "epoch": 0.5333333333333333,
+ "eval_loss": 2.2986323833465576,
+ "eval_mse": 2.2986323833465576,
+ "eval_r2": 0.129305899143219,
+ "eval_rmse": 1.5161241292953491,
+ "eval_runtime": 57.8766,
+ "eval_samples_per_second": 103.669,
+ "eval_steps_per_second": 25.917,
+ "step": 200
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 16.900310516357422,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 1.9777,
+ "step": 300
+ },
+ {
+ "epoch": 0.8,
+ "eval_loss": 1.0734479427337646,
+ "eval_mse": 1.0734479427337646,
+ "eval_r2": 0.593390941619873,
+ "eval_rmse": 1.0360733270645142,
+ "eval_runtime": 57.8679,
+ "eval_samples_per_second": 103.684,
+ "eval_steps_per_second": 25.921,
+ "step": 300
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "grad_norm": 36.441951751708984,
+ "learning_rate": 2.2e-06,
+ "loss": 0.9918,
+ "step": 400
+ },
+ {
+ "epoch": 1.0666666666666667,
+ "eval_loss": 0.7707585692405701,
+ "eval_mse": 0.7707585692405701,
+ "eval_r2": 0.708046019077301,
+ "eval_rmse": 0.8779285550117493,
+ "eval_runtime": 57.8738,
+ "eval_samples_per_second": 103.674,
+ "eval_steps_per_second": 25.918,
+ "step": 400
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "grad_norm": 35.2764892578125,
+ "learning_rate": 2.755555555555556e-06,
+ "loss": 0.7404,
+ "step": 500
+ },
+ {
+ "epoch": 1.3333333333333333,
+ "eval_loss": 0.8621485829353333,
+ "eval_mse": 0.8621485829353333,
+ "eval_r2": 0.6734285354614258,
+ "eval_rmse": 0.9285195469856262,
+ "eval_runtime": 57.8776,
+ "eval_samples_per_second": 103.667,
+ "eval_steps_per_second": 25.917,
+ "step": 500
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 25.652307510375977,
+ "learning_rate": 3.3055555555555558e-06,
+ "loss": 0.5897,
+ "step": 600
+ },
+ {
+ "epoch": 1.6,
+ "eval_loss": 0.7726595997810364,
+ "eval_mse": 0.7726595997810364,
+ "eval_r2": 0.7073259353637695,
+ "eval_rmse": 0.8790105581283569,
+ "eval_runtime": 57.8605,
+ "eval_samples_per_second": 103.698,
+ "eval_steps_per_second": 25.924,
+ "step": 600
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "grad_norm": 20.405384063720703,
+ "learning_rate": 3.861111111111112e-06,
+ "loss": 0.5903,
+ "step": 700
+ },
+ {
+ "epoch": 1.8666666666666667,
+ "eval_loss": 0.7032095789909363,
+ "eval_mse": 0.7032095789909363,
+ "eval_r2": 0.7336327433586121,
+ "eval_rmse": 0.8385758996009827,
+ "eval_runtime": 57.8612,
+ "eval_samples_per_second": 103.696,
+ "eval_steps_per_second": 25.924,
+ "step": 700
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "grad_norm": 17.7819766998291,
+ "learning_rate": 4.416666666666667e-06,
+ "loss": 0.4654,
+ "step": 800
+ },
+ {
+ "epoch": 2.1333333333333333,
+ "eval_loss": 0.6557766199111938,
+ "eval_mse": 0.6557766199111938,
+ "eval_r2": 0.7515997886657715,
+ "eval_rmse": 0.809800386428833,
+ "eval_runtime": 57.8596,
+ "eval_samples_per_second": 103.699,
+ "eval_steps_per_second": 25.925,
+ "step": 800
+ },
+ {
+ "epoch": 2.4,
+ "grad_norm": 16.64554786682129,
+ "learning_rate": 4.9722222222222224e-06,
+ "loss": 0.4457,
+ "step": 900
+ },
+ {
+ "epoch": 2.4,
+ "eval_loss": 0.7730452418327332,
+ "eval_mse": 0.7730452418327332,
+ "eval_r2": 0.707179844379425,
+ "eval_rmse": 0.8792299032211304,
+ "eval_runtime": 57.8442,
+ "eval_samples_per_second": 103.727,
+ "eval_steps_per_second": 25.932,
+ "step": 900
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "grad_norm": 23.086116790771484,
+ "learning_rate": 5.527777777777779e-06,
+ "loss": 0.3555,
+ "step": 1000
+ },
+ {
+ "epoch": 2.6666666666666665,
+ "eval_loss": 0.6905973553657532,
+ "eval_mse": 0.6905973553657532,
+ "eval_r2": 0.7384101152420044,
+ "eval_rmse": 0.8310218453407288,
+ "eval_runtime": 57.8673,
+ "eval_samples_per_second": 103.685,
+ "eval_steps_per_second": 25.921,
+ "step": 1000
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "grad_norm": 12.183361053466797,
+ "learning_rate": 6.083333333333333e-06,
+ "loss": 0.3482,
+ "step": 1100
+ },
+ {
+ "epoch": 2.9333333333333336,
+ "eval_loss": 0.7757130265235901,
+ "eval_mse": 0.7757130265235901,
+ "eval_r2": 0.7061693072319031,
+ "eval_rmse": 0.8807457089424133,
+ "eval_runtime": 57.8693,
+ "eval_samples_per_second": 103.682,
+ "eval_steps_per_second": 25.921,
+ "step": 1100
+ },
+ {
+ "epoch": 3.2,
+ "grad_norm": 7.989579677581787,
+ "learning_rate": 6.6388888888888895e-06,
+ "loss": 0.2977,
+ "step": 1200
+ },
+ {
+ "epoch": 3.2,
+ "eval_loss": 0.756076991558075,
+ "eval_mse": 0.7560770511627197,
+ "eval_r2": 0.7136071920394897,
+ "eval_rmse": 0.8695269227027893,
+ "eval_runtime": 57.8756,
+ "eval_samples_per_second": 103.671,
+ "eval_steps_per_second": 25.918,
+ "step": 1200
+ },
+ {
+ "epoch": 3.466666666666667,
+ "grad_norm": 23.822872161865234,
+ "learning_rate": 7.194444444444445e-06,
+ "loss": 0.312,
+ "step": 1300
+ },
+ {
+ "epoch": 3.466666666666667,
+ "eval_loss": 0.9145962595939636,
+ "eval_mse": 0.9145963788032532,
+ "eval_r2": 0.6535619497299194,
+ "eval_rmse": 0.9563453197479248,
+ "eval_runtime": 57.8768,
+ "eval_samples_per_second": 103.669,
+ "eval_steps_per_second": 25.917,
+ "step": 1300
+ },
+ {
+ "epoch": 3.7333333333333334,
+ "grad_norm": 15.19206428527832,
+ "learning_rate": 7.75e-06,
+ "loss": 0.2681,
+ "step": 1400
+ },
+ {
+ "epoch": 3.7333333333333334,
+ "eval_loss": 0.6494566202163696,
+ "eval_mse": 0.6494566202163696,
+ "eval_r2": 0.7539936900138855,
+ "eval_rmse": 0.8058887124061584,
+ "eval_runtime": 57.883,
+ "eval_samples_per_second": 103.657,
+ "eval_steps_per_second": 25.914,
+ "step": 1400
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 10.404943466186523,
+ "learning_rate": 8.305555555555557e-06,
+ "loss": 0.2744,
+ "step": 1500
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.6918351054191589,
+ "eval_mse": 0.6918351054191589,
+ "eval_r2": 0.7379412651062012,
+ "eval_rmse": 0.8317662477493286,
+ "eval_runtime": 57.8842,
+ "eval_samples_per_second": 103.655,
+ "eval_steps_per_second": 25.914,
+ "step": 1500
+ },
+ {
+ "epoch": 4.266666666666667,
+ "grad_norm": 11.18205738067627,
+ "learning_rate": 8.861111111111111e-06,
+ "loss": 0.2142,
+ "step": 1600
+ },
+ {
+ "epoch": 4.266666666666667,
+ "eval_loss": 0.7790309190750122,
+ "eval_mse": 0.779030978679657,
+ "eval_r2": 0.7049124836921692,
+ "eval_rmse": 0.8826273083686829,
+ "eval_runtime": 58.0438,
+ "eval_samples_per_second": 103.37,
+ "eval_steps_per_second": 25.843,
+ "step": 1600
+ },
+ {
+ "epoch": 4.533333333333333,
+ "grad_norm": 15.371255874633789,
+ "learning_rate": 9.416666666666667e-06,
+ "loss": 0.2103,
+ "step": 1700
+ },
+ {
+ "epoch": 4.533333333333333,
+ "eval_loss": 0.6177367568016052,
+ "eval_mse": 0.6177367568016052,
+ "eval_r2": 0.7660087943077087,
+ "eval_rmse": 0.7859622836112976,
+ "eval_runtime": 57.8859,
+ "eval_samples_per_second": 103.652,
+ "eval_steps_per_second": 25.913,
+ "step": 1700
+ },
+ {
+ "epoch": 4.8,
+ "grad_norm": 26.033283233642578,
+ "learning_rate": 9.972222222222224e-06,
+ "loss": 0.2009,
+ "step": 1800
+ },
+ {
+ "epoch": 4.8,
+ "eval_loss": 0.4925681948661804,
+ "eval_mse": 0.4925681948661804,
+ "eval_r2": 0.8134211301803589,
+ "eval_rmse": 0.7018320560455322,
+ "eval_runtime": 57.8886,
+ "eval_samples_per_second": 103.647,
+ "eval_steps_per_second": 25.912,
+ "step": 1800
+ },
+ {
+ "epoch": 5.066666666666666,
+ "grad_norm": 14.781371116638184,
+ "learning_rate": 1.0527777777777778e-05,
+ "loss": 0.2077,
+ "step": 1900
+ },
+ {
+ "epoch": 5.066666666666666,
+ "eval_loss": 0.731107234954834,
+ "eval_mse": 0.7311073541641235,
+ "eval_r2": 0.7230653762817383,
+ "eval_rmse": 0.8550481796264648,
+ "eval_runtime": 57.8849,
+ "eval_samples_per_second": 103.654,
+ "eval_steps_per_second": 25.913,
+ "step": 1900
+ },
+ {
+ "epoch": 5.333333333333333,
+ "grad_norm": 10.876953125,
+ "learning_rate": 1.1083333333333335e-05,
+ "loss": 0.1621,
+ "step": 2000
+ },
+ {
+ "epoch": 5.333333333333333,
+ "eval_loss": 0.9120460748672485,
+ "eval_mse": 0.9120460748672485,
+ "eval_r2": 0.654528021812439,
+ "eval_rmse": 0.9550110101699829,
+ "eval_runtime": 57.8944,
+ "eval_samples_per_second": 103.637,
+ "eval_steps_per_second": 25.909,
+ "step": 2000
+ },
+ {
+ "epoch": 5.6,
+ "grad_norm": 9.433772087097168,
+ "learning_rate": 1.163888888888889e-05,
+ "loss": 0.172,
+ "step": 2100
+ },
+ {
+ "epoch": 5.6,
+ "eval_loss": 0.7684391140937805,
+ "eval_mse": 0.7684392333030701,
+ "eval_r2": 0.7089245319366455,
+ "eval_rmse": 0.8766066431999207,
+ "eval_runtime": 57.8919,
+ "eval_samples_per_second": 103.641,
+ "eval_steps_per_second": 25.91,
+ "step": 2100
+ },
+ {
+ "epoch": 5.866666666666667,
+ "grad_norm": 8.749499320983887,
+ "learning_rate": 1.2194444444444447e-05,
+ "loss": 0.205,
+ "step": 2200
+ },
+ {
+ "epoch": 5.866666666666667,
+ "eval_loss": 0.9115136861801147,
+ "eval_mse": 0.9115136861801147,
+ "eval_r2": 0.6547296643257141,
+ "eval_rmse": 0.9547322392463684,
+ "eval_runtime": 57.8967,
+ "eval_samples_per_second": 103.633,
+ "eval_steps_per_second": 25.908,
+ "step": 2200
+ },
+ {
+ "epoch": 6.133333333333334,
+ "grad_norm": 13.432199478149414,
+ "learning_rate": 1.275e-05,
+ "loss": 0.1548,
+ "step": 2300
+ },
+ {
+ "epoch": 6.133333333333334,
+ "eval_loss": 0.7451738119125366,
+ "eval_mse": 0.7451738119125366,
+ "eval_r2": 0.7177371978759766,
+ "eval_rmse": 0.8632345199584961,
+ "eval_runtime": 57.8927,
+ "eval_samples_per_second": 103.64,
+ "eval_steps_per_second": 25.91,
+ "step": 2300
+ },
+ {
+ "epoch": 6.4,
+ "grad_norm": 13.872139930725098,
+ "learning_rate": 1.3305555555555556e-05,
+ "loss": 0.1593,
+ "step": 2400
+ },
+ {
+ "epoch": 6.4,
+ "eval_loss": 0.6607070565223694,
+ "eval_mse": 0.6607070565223694,
+ "eval_r2": 0.7497321963310242,
+ "eval_rmse": 0.8128389120101929,
+ "eval_runtime": 57.8803,
+ "eval_samples_per_second": 103.662,
+ "eval_steps_per_second": 25.916,
+ "step": 2400
+ },
+ {
+ "epoch": 6.666666666666667,
+ "grad_norm": 10.801892280578613,
+ "learning_rate": 1.3855555555555558e-05,
+ "loss": 0.1684,
+ "step": 2500
+ },
+ {
+ "epoch": 6.666666666666667,
+ "eval_loss": 0.8161285519599915,
+ "eval_mse": 0.8161285519599915,
+ "eval_r2": 0.690860390663147,
+ "eval_rmse": 0.903398334980011,
+ "eval_runtime": 57.8953,
+ "eval_samples_per_second": 103.635,
+ "eval_steps_per_second": 25.909,
+ "step": 2500
+ },
+ {
+ "epoch": 6.933333333333334,
+ "grad_norm": 13.156838417053223,
+ "learning_rate": 1.441111111111111e-05,
+ "loss": 0.1823,
+ "step": 2600
+ },
+ {
+ "epoch": 6.933333333333334,
+ "eval_loss": 1.0073966979980469,
+ "eval_mse": 1.0073966979980469,
+ "eval_r2": 0.6184103488922119,
+ "eval_rmse": 1.003691554069519,
+ "eval_runtime": 57.9026,
+ "eval_samples_per_second": 103.622,
+ "eval_steps_per_second": 25.906,
+ "step": 2600
+ },
+ {
+ "epoch": 7.2,
+ "grad_norm": 16.002506256103516,
+ "learning_rate": 1.4966666666666667e-05,
+ "loss": 0.1936,
+ "step": 2700
+ },
+ {
+ "epoch": 7.2,
+ "eval_loss": 0.8159008622169495,
+ "eval_mse": 0.8159008622169495,
+ "eval_r2": 0.690946638584137,
+ "eval_rmse": 0.9032723307609558,
+ "eval_runtime": 57.9032,
+ "eval_samples_per_second": 103.621,
+ "eval_steps_per_second": 25.905,
+ "step": 2700
+ },
+ {
+ "epoch": 7.466666666666667,
+ "grad_norm": 12.294295310974121,
+ "learning_rate": 1.5522222222222223e-05,
+ "loss": 0.1506,
+ "step": 2800
+ },
+ {
+ "epoch": 7.466666666666667,
+ "eval_loss": 0.8324363827705383,
+ "eval_mse": 0.8324363827705383,
+ "eval_r2": 0.6846832036972046,
+ "eval_rmse": 0.9123795032501221,
+ "eval_runtime": 57.9057,
+ "eval_samples_per_second": 103.617,
+ "eval_steps_per_second": 25.904,
+ "step": 2800
+ },
+ {
+ "epoch": 7.733333333333333,
+ "grad_norm": 5.195356369018555,
+ "learning_rate": 1.607777777777778e-05,
+ "loss": 0.1499,
+ "step": 2900
+ },
+ {
+ "epoch": 7.733333333333333,
+ "eval_loss": 0.8613407611846924,
+ "eval_mse": 0.8613407611846924,
+ "eval_r2": 0.6737345457077026,
+ "eval_rmse": 0.9280844330787659,
+ "eval_runtime": 57.9123,
+ "eval_samples_per_second": 103.605,
+ "eval_steps_per_second": 25.901,
+ "step": 2900
+ },
+ {
+ "epoch": 8.0,
+ "grad_norm": 11.357749938964844,
+ "learning_rate": 1.6633333333333336e-05,
+ "loss": 0.1631,
+ "step": 3000
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 0.6052080988883972,
+ "eval_mse": 0.6052080988883972,
+ "eval_r2": 0.7707545161247253,
+ "eval_rmse": 0.7779512405395508,
+ "eval_runtime": 57.8963,
+ "eval_samples_per_second": 103.634,
+ "eval_steps_per_second": 25.908,
+ "step": 3000
+ },
+ {
+ "epoch": 8.266666666666667,
+ "grad_norm": 20.62454605102539,
+ "learning_rate": 1.7188888888888892e-05,
+ "loss": 0.1559,
+ "step": 3100
+ },
+ {
+ "epoch": 8.266666666666667,
+ "eval_loss": 0.584992527961731,
+ "eval_mse": 0.584992527961731,
+ "eval_r2": 0.7784119248390198,
+ "eval_rmse": 0.7648480534553528,
+ "eval_runtime": 57.8939,
+ "eval_samples_per_second": 103.638,
+ "eval_steps_per_second": 25.909,
+ "step": 3100
+ },
+ {
+ "epoch": 8.533333333333333,
+ "grad_norm": 5.493505954742432,
+ "learning_rate": 1.7744444444444445e-05,
+ "loss": 0.1434,
+ "step": 3200
+ },
+ {
+ "epoch": 8.533333333333333,
+ "eval_loss": 0.6216039061546326,
+ "eval_mse": 0.6216039061546326,
+ "eval_r2": 0.7645439505577087,
+ "eval_rmse": 0.7884185910224915,
+ "eval_runtime": 57.9149,
+ "eval_samples_per_second": 103.6,
+ "eval_steps_per_second": 25.9,
+ "step": 3200
+ },
+ {
+ "epoch": 8.8,
+ "grad_norm": 8.887247085571289,
+ "learning_rate": 1.83e-05,
+ "loss": 0.1485,
+ "step": 3300
+ },
+ {
+ "epoch": 8.8,
+ "eval_loss": 0.6688755750656128,
+ "eval_mse": 0.6688756346702576,
+ "eval_r2": 0.7466380000114441,
+ "eval_rmse": 0.8178481459617615,
+ "eval_runtime": 57.8949,
+ "eval_samples_per_second": 103.636,
+ "eval_steps_per_second": 25.909,
+ "step": 3300
+ },
+ {
+ "epoch": 9.066666666666666,
+ "grad_norm": 54.376461029052734,
+ "learning_rate": 1.8855555555555557e-05,
+ "loss": 0.1487,
+ "step": 3400
+ },
+ {
+ "epoch": 9.066666666666666,
+ "eval_loss": 0.6742914915084839,
+ "eval_mse": 0.6742914915084839,
+ "eval_r2": 0.7445865869522095,
+ "eval_rmse": 0.8211525678634644,
+ "eval_runtime": 57.9237,
+ "eval_samples_per_second": 103.584,
+ "eval_steps_per_second": 25.896,
+ "step": 3400
+ },
+ {
+ "epoch": 9.333333333333334,
+ "grad_norm": 2.6493732929229736,
+ "learning_rate": 1.941111111111111e-05,
+ "loss": 0.1248,
+ "step": 3500
+ },
+ {
+ "epoch": 9.333333333333334,
+ "eval_loss": 0.5330753326416016,
+ "eval_mse": 0.5330753326416016,
+ "eval_r2": 0.7980775237083435,
+ "eval_rmse": 0.730120062828064,
+ "eval_runtime": 57.9187,
+ "eval_samples_per_second": 103.593,
+ "eval_steps_per_second": 25.898,
+ "step": 3500
+ },
+ {
+ "epoch": 9.6,
+ "grad_norm": 22.65760040283203,
+ "learning_rate": 1.9966666666666666e-05,
+ "loss": 0.127,
+ "step": 3600
+ },
+ {
+ "epoch": 9.6,
+ "eval_loss": 0.760678231716156,
+ "eval_mse": 0.760678231716156,
+ "eval_r2": 0.7118643522262573,
+ "eval_rmse": 0.8721687197685242,
+ "eval_runtime": 57.9121,
+ "eval_samples_per_second": 103.605,
+ "eval_steps_per_second": 25.901,
+ "step": 3600
+ },
+ {
+ "epoch": 9.866666666666667,
+ "grad_norm": 11.861601829528809,
+ "learning_rate": 1.999958463257905e-05,
+ "loss": 0.1409,
+ "step": 3700
+ },
+ {
+ "epoch": 9.866666666666667,
+ "eval_loss": 0.6627658009529114,
+ "eval_mse": 0.6627658009529114,
+ "eval_r2": 0.748952329158783,
+ "eval_rmse": 0.8141043186187744,
+ "eval_runtime": 57.9148,
+ "eval_samples_per_second": 103.601,
+ "eval_steps_per_second": 25.9,
+ "step": 3700
+ },
+ {
+ "epoch": 10.133333333333333,
+ "grad_norm": 10.758604049682617,
+ "learning_rate": 1.999823082667316e-05,
+ "loss": 0.1352,
+ "step": 3800
+ },
+ {
+ "epoch": 10.133333333333333,
+ "eval_loss": 0.7061165571212769,
+ "eval_mse": 0.7061165571212769,
+ "eval_r2": 0.7325316071510315,
+ "eval_rmse": 0.8403074145317078,
+ "eval_runtime": 58.8761,
+ "eval_samples_per_second": 101.909,
+ "eval_steps_per_second": 25.477,
+ "step": 3800
+ },
+ {
+ "epoch": 10.4,
+ "grad_norm": 4.267393112182617,
+ "learning_rate": 1.999593701724414e-05,
+ "loss": 0.1418,
+ "step": 3900
+ },
+ {
+ "epoch": 10.4,
+ "eval_loss": 0.6572955846786499,
+ "eval_mse": 0.6572955846786499,
+ "eval_r2": 0.7510244250297546,
+ "eval_rmse": 0.810737669467926,
+ "eval_runtime": 57.921,
+ "eval_samples_per_second": 103.589,
+ "eval_steps_per_second": 25.897,
+ "step": 3900
+ },
+ {
+ "epoch": 10.666666666666666,
+ "grad_norm": 4.308215618133545,
+ "learning_rate": 1.9992703419949032e-05,
+ "loss": 0.138,
+ "step": 4000
+ },
+ {
+ "epoch": 10.666666666666666,
+ "eval_loss": 0.6316142678260803,
+ "eval_mse": 0.6316142678260803,
+ "eval_r2": 0.7607522010803223,
+ "eval_rmse": 0.7947416305541992,
+ "eval_runtime": 57.92,
+ "eval_samples_per_second": 103.591,
+ "eval_steps_per_second": 25.898,
+ "step": 4000
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 9000,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 24,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "EarlyStoppingCallback": {
+ "args": {
+ "early_stopping_patience": 6,
+ "early_stopping_threshold": 0.0
+ },
+ "attributes": {
+ "early_stopping_patience_counter": 0
+ }
+ },
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.6858183041024e+16,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
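
trainer_state.json records a regression fine-tuning run: MSE training loss, eval_mse/eval_r2/eval_rmse every 100 steps, checkpoints every 500 steps, train batch size 4, early stopping on eval_loss with patience 6, and the best checkpoint so far at step 3500 (eval_loss ≈ 0.533, R² ≈ 0.798). A hedged sketch of TrainingArguments and metrics consistent with these values; the output_dir is a placeholder, and the learning-rate schedule, model and datasets are not reconstructed here:

# Minimal sketch of settings matching trainer_state.json; this is not the
# author's original training script, only values that can be read off the log.
import numpy as np
from transformers import TrainingArguments, EarlyStoppingCallback

def compute_metrics(eval_pred):
    # mse / rmse / r2, matching the eval_mse, eval_rmse and eval_r2 fields above
    preds, labels = (np.squeeze(np.asarray(a)) for a in eval_pred)
    mse = float(np.mean((preds - labels) ** 2))
    r2 = 1.0 - float(np.sum((labels - preds) ** 2)) / float(np.sum((labels - labels.mean()) ** 2))
    return {"mse": mse, "rmse": mse ** 0.5, "r2": r2}

args = TrainingArguments(
    output_dir="diff_bert_base_sampled_unfrozen_early_real_test",  # placeholder name
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
    per_device_train_batch_size=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)
early_stopping = EarlyStoppingCallback(early_stopping_patience=6, early_stopping_threshold=0.0)
# args, compute_metrics and early_stopping would then be passed to transformers.Trainer
# together with the CustomModel and the train/eval datasets, which this commit does not include.
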
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea363f28b7ca133511cd9d0d648a88c4b88ac591bacff53c8f515319c80b8e4d
+ size 5176
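
training_args.bin is the pickled TrainingArguments object from the run (also an LFS pointer, ~5 KB). A quick sketch of restoring it for inspection; transformers must be importable so the unpickler can find the class, and on newer PyTorch weights_only=False is needed because the file is not a plain tensor archive:

# Minimal sketch: inspect the serialized training arguments.
import torch
import transformers  # makes the TrainingArguments class available to the unpickler

train_args = torch.load("training_args.bin", weights_only=False)
print(train_args.learning_rate, train_args.num_train_epochs, train_args.per_device_train_batch_size)
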
vocab.txt ADDED
The diff for this file is too large to render. See raw diff