leoho36 commited on
Commit
72b1cce
·
verified ·
1 Parent(s): e903b31

Upload 13 files

Browse files
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/layoutlmv3-base",
3
+ "architectures": [
4
+ "LayoutLMv3ForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "coordinate_size": 128,
10
+ "eos_token_id": 2,
11
+ "has_relative_attention_bias": true,
12
+ "has_spatial_attention_bias": true,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0.1,
15
+ "hidden_size": 768,
16
+ "id2label": {
17
+ "0": "B-Course",
18
+ "1": "I-Course",
19
+ "2": "GPA",
20
+ "3": "O"
21
+ },
22
+ "initializer_range": 0.02,
23
+ "input_size": 224,
24
+ "intermediate_size": 3072,
25
+ "label2id": {
26
+ "B-Course": 0,
27
+ "GPA": 2,
28
+ "I-Course": 1,
29
+ "O": 3
30
+ },
31
+ "layer_norm_eps": 1e-05,
32
+ "max_2d_position_embeddings": 1024,
33
+ "max_position_embeddings": 514,
34
+ "max_rel_2d_pos": 256,
35
+ "max_rel_pos": 128,
36
+ "model_type": "layoutlmv3",
37
+ "num_attention_heads": 12,
38
+ "num_channels": 3,
39
+ "num_hidden_layers": 12,
40
+ "pad_token_id": 1,
41
+ "patch_size": 16,
42
+ "rel_2d_pos_bins": 64,
43
+ "rel_pos_bins": 32,
44
+ "second_input_size": 112,
45
+ "shape_size": 128,
46
+ "text_embed": true,
47
+ "torch_dtype": "float32",
48
+ "transformers_version": "4.49.0.dev0",
49
+ "type_vocab_size": 1,
50
+ "visual_embed": true,
51
+ "vocab_size": 50265
52
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06eaab7c0a83cf04e312d85d1f3a67dc9717e46073d35ca620282c58d37aa29e
3
+ size 501346304
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61bf734f6fa01f8b11fb4f044257778edb2c9b61e7ba0cdda7d6543b36ed5d74
3
+ size 1002801594
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_ocr": false,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "LayoutLMv3ImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "ocr_lang": null,
18
+ "processor_class": "LayoutLMv3Processor",
19
+ "resample": 2,
20
+ "rescale_factor": 0.00392156862745098,
21
+ "size": {
22
+ "height": 224,
23
+ "width": 224
24
+ },
25
+ "tesseract_config": ""
26
+ }
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d8fb39e3828c3c150006a2da427e39d0a1b616782bb70378bdb3db112621fb3
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd60da393ff67ad2f08779377eeaa08189a6bf520f752442788825d9bce1449
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "apply_ocr": false,
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": false,
48
+ "cls_token": "<s>",
49
+ "cls_token_box": [
50
+ 0,
51
+ 0,
52
+ 0,
53
+ 0
54
+ ],
55
+ "eos_token": "</s>",
56
+ "errors": "replace",
57
+ "extra_special_tokens": {},
58
+ "mask_token": "<mask>",
59
+ "model_max_length": 512,
60
+ "only_label_first_subword": true,
61
+ "pad_token": "<pad>",
62
+ "pad_token_box": [
63
+ 0,
64
+ 0,
65
+ 0,
66
+ 0
67
+ ],
68
+ "pad_token_label": -100,
69
+ "processor_class": "LayoutLMv3Processor",
70
+ "sep_token": "</s>",
71
+ "sep_token_box": [
72
+ 0,
73
+ 0,
74
+ 0,
75
+ 0
76
+ ],
77
+ "tokenizer_class": "LayoutLMv3Tokenizer",
78
+ "trim_offsets": true,
79
+ "unk_token": "<unk>"
80
+ }
trainer_state.json ADDED
@@ -0,0 +1,913 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.0,
3
+ "best_model_checkpoint": "test_long_tokens/checkpoint-70",
4
+ "epoch": 14.285714285714286,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14285714285714285,
13
+ "grad_norm": 18.59529685974121,
14
+ "learning_rate": 5e-06,
15
+ "loss": 1.637,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.2857142857142857,
20
+ "grad_norm": 20.817951202392578,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.6372,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.42857142857142855,
27
+ "grad_norm": 19.524600982666016,
28
+ "learning_rate": 1.5e-05,
29
+ "loss": 1.4306,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.5714285714285714,
34
+ "grad_norm": 15.0996675491333,
35
+ "learning_rate": 2e-05,
36
+ "loss": 1.2657,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.7142857142857143,
41
+ "grad_norm": 12.507349967956543,
42
+ "learning_rate": 2.5e-05,
43
+ "loss": 0.8948,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.8571428571428571,
48
+ "grad_norm": 8.167460441589355,
49
+ "learning_rate": 3e-05,
50
+ "loss": 0.5404,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 1.0,
55
+ "grad_norm": 5.33146858215332,
56
+ "learning_rate": 3.5e-05,
57
+ "loss": 0.5374,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 1.0,
62
+ "eval_accuracy": 0.8645276292335116,
63
+ "eval_f1": 0.0,
64
+ "eval_loss": 0.7161440253257751,
65
+ "eval_precision": 0.0,
66
+ "eval_recall": 0.0,
67
+ "eval_runtime": 0.2378,
68
+ "eval_samples_per_second": 12.618,
69
+ "eval_steps_per_second": 12.618,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 1.1428571428571428,
74
+ "grad_norm": 2.557964324951172,
75
+ "learning_rate": 4e-05,
76
+ "loss": 0.2147,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 1.2857142857142856,
81
+ "grad_norm": 4.4479289054870605,
82
+ "learning_rate": 4.5e-05,
83
+ "loss": 0.5833,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 1.4285714285714286,
88
+ "grad_norm": 5.119785308837891,
89
+ "learning_rate": 5e-05,
90
+ "loss": 0.6062,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 1.5714285714285714,
95
+ "grad_norm": 3.064371347427368,
96
+ "learning_rate": 4.9444444444444446e-05,
97
+ "loss": 0.451,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 1.7142857142857144,
102
+ "grad_norm": 1.8660037517547607,
103
+ "learning_rate": 4.888888888888889e-05,
104
+ "loss": 0.3188,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 1.8571428571428572,
109
+ "grad_norm": 2.083087205886841,
110
+ "learning_rate": 4.8333333333333334e-05,
111
+ "loss": 0.2287,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 2.0,
116
+ "grad_norm": 2.0547335147857666,
117
+ "learning_rate": 4.7777777777777784e-05,
118
+ "loss": 0.3423,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 2.0,
123
+ "eval_accuracy": 0.8645276292335116,
124
+ "eval_f1": 0.0,
125
+ "eval_loss": 0.459845632314682,
126
+ "eval_precision": 0.0,
127
+ "eval_recall": 0.0,
128
+ "eval_runtime": 0.2226,
129
+ "eval_samples_per_second": 13.474,
130
+ "eval_steps_per_second": 13.474,
131
+ "step": 14
132
+ },
133
+ {
134
+ "epoch": 2.142857142857143,
135
+ "grad_norm": 2.206693410873413,
136
+ "learning_rate": 4.722222222222222e-05,
137
+ "loss": 0.2449,
138
+ "step": 15
139
+ },
140
+ {
141
+ "epoch": 2.2857142857142856,
142
+ "grad_norm": 0.9722664952278137,
143
+ "learning_rate": 4.666666666666667e-05,
144
+ "loss": 0.1875,
145
+ "step": 16
146
+ },
147
+ {
148
+ "epoch": 2.4285714285714284,
149
+ "grad_norm": 1.500380039215088,
150
+ "learning_rate": 4.6111111111111115e-05,
151
+ "loss": 0.1429,
152
+ "step": 17
153
+ },
154
+ {
155
+ "epoch": 2.571428571428571,
156
+ "grad_norm": 1.9801063537597656,
157
+ "learning_rate": 4.555555555555556e-05,
158
+ "loss": 0.1752,
159
+ "step": 18
160
+ },
161
+ {
162
+ "epoch": 2.7142857142857144,
163
+ "grad_norm": 2.65030837059021,
164
+ "learning_rate": 4.5e-05,
165
+ "loss": 0.0763,
166
+ "step": 19
167
+ },
168
+ {
169
+ "epoch": 2.857142857142857,
170
+ "grad_norm": 3.4686992168426514,
171
+ "learning_rate": 4.4444444444444447e-05,
172
+ "loss": 0.168,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 3.0,
177
+ "grad_norm": 6.0364298820495605,
178
+ "learning_rate": 4.388888888888889e-05,
179
+ "loss": 0.1813,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 3.0,
184
+ "eval_accuracy": 0.966131907308378,
185
+ "eval_f1": 0.9019607843137256,
186
+ "eval_loss": 0.14405852556228638,
187
+ "eval_precision": 0.8846153846153846,
188
+ "eval_recall": 0.92,
189
+ "eval_runtime": 0.2168,
190
+ "eval_samples_per_second": 13.839,
191
+ "eval_steps_per_second": 13.839,
192
+ "step": 21
193
+ },
194
+ {
195
+ "epoch": 3.142857142857143,
196
+ "grad_norm": 0.7663143873214722,
197
+ "learning_rate": 4.3333333333333334e-05,
198
+ "loss": 0.0552,
199
+ "step": 22
200
+ },
201
+ {
202
+ "epoch": 3.2857142857142856,
203
+ "grad_norm": 1.2191542387008667,
204
+ "learning_rate": 4.277777777777778e-05,
205
+ "loss": 0.1005,
206
+ "step": 23
207
+ },
208
+ {
209
+ "epoch": 3.4285714285714284,
210
+ "grad_norm": 7.446482181549072,
211
+ "learning_rate": 4.222222222222222e-05,
212
+ "loss": 0.0559,
213
+ "step": 24
214
+ },
215
+ {
216
+ "epoch": 3.571428571428571,
217
+ "grad_norm": 1.18604576587677,
218
+ "learning_rate": 4.166666666666667e-05,
219
+ "loss": 0.0494,
220
+ "step": 25
221
+ },
222
+ {
223
+ "epoch": 3.7142857142857144,
224
+ "grad_norm": 0.6873874664306641,
225
+ "learning_rate": 4.111111111111111e-05,
226
+ "loss": 0.0533,
227
+ "step": 26
228
+ },
229
+ {
230
+ "epoch": 3.857142857142857,
231
+ "grad_norm": 2.966907024383545,
232
+ "learning_rate": 4.055555555555556e-05,
233
+ "loss": 0.0625,
234
+ "step": 27
235
+ },
236
+ {
237
+ "epoch": 4.0,
238
+ "grad_norm": 3.066542863845825,
239
+ "learning_rate": 4e-05,
240
+ "loss": 0.0734,
241
+ "step": 28
242
+ },
243
+ {
244
+ "epoch": 4.0,
245
+ "eval_accuracy": 0.9732620320855615,
246
+ "eval_f1": 0.8000000000000002,
247
+ "eval_loss": 0.13226450979709625,
248
+ "eval_precision": 0.8,
249
+ "eval_recall": 0.8,
250
+ "eval_runtime": 0.2294,
251
+ "eval_samples_per_second": 13.077,
252
+ "eval_steps_per_second": 13.077,
253
+ "step": 28
254
+ },
255
+ {
256
+ "epoch": 4.142857142857143,
257
+ "grad_norm": 0.8713329434394836,
258
+ "learning_rate": 3.944444444444445e-05,
259
+ "loss": 0.0673,
260
+ "step": 29
261
+ },
262
+ {
263
+ "epoch": 4.285714285714286,
264
+ "grad_norm": 0.306024968624115,
265
+ "learning_rate": 3.888888888888889e-05,
266
+ "loss": 0.0146,
267
+ "step": 30
268
+ },
269
+ {
270
+ "epoch": 4.428571428571429,
271
+ "grad_norm": 0.3102547526359558,
272
+ "learning_rate": 3.8333333333333334e-05,
273
+ "loss": 0.0169,
274
+ "step": 31
275
+ },
276
+ {
277
+ "epoch": 4.571428571428571,
278
+ "grad_norm": 5.933145046234131,
279
+ "learning_rate": 3.777777777777778e-05,
280
+ "loss": 0.0753,
281
+ "step": 32
282
+ },
283
+ {
284
+ "epoch": 4.714285714285714,
285
+ "grad_norm": 1.7290794849395752,
286
+ "learning_rate": 3.722222222222222e-05,
287
+ "loss": 0.0668,
288
+ "step": 33
289
+ },
290
+ {
291
+ "epoch": 4.857142857142857,
292
+ "grad_norm": 8.295025825500488,
293
+ "learning_rate": 3.6666666666666666e-05,
294
+ "loss": 0.0786,
295
+ "step": 34
296
+ },
297
+ {
298
+ "epoch": 5.0,
299
+ "grad_norm": 0.47509345412254333,
300
+ "learning_rate": 3.611111111111111e-05,
301
+ "loss": 0.0124,
302
+ "step": 35
303
+ },
304
+ {
305
+ "epoch": 5.0,
306
+ "eval_accuracy": 0.9928698752228164,
307
+ "eval_f1": 0.8235294117647058,
308
+ "eval_loss": 0.0685623362660408,
309
+ "eval_precision": 0.8076923076923077,
310
+ "eval_recall": 0.84,
311
+ "eval_runtime": 0.2188,
312
+ "eval_samples_per_second": 13.709,
313
+ "eval_steps_per_second": 13.709,
314
+ "step": 35
315
+ },
316
+ {
317
+ "epoch": 5.142857142857143,
318
+ "grad_norm": 1.3178348541259766,
319
+ "learning_rate": 3.555555555555556e-05,
320
+ "loss": 0.0303,
321
+ "step": 36
322
+ },
323
+ {
324
+ "epoch": 5.285714285714286,
325
+ "grad_norm": 2.2010915279388428,
326
+ "learning_rate": 3.5e-05,
327
+ "loss": 0.045,
328
+ "step": 37
329
+ },
330
+ {
331
+ "epoch": 5.428571428571429,
332
+ "grad_norm": 1.1045790910720825,
333
+ "learning_rate": 3.444444444444445e-05,
334
+ "loss": 0.0145,
335
+ "step": 38
336
+ },
337
+ {
338
+ "epoch": 5.571428571428571,
339
+ "grad_norm": 1.0724176168441772,
340
+ "learning_rate": 3.388888888888889e-05,
341
+ "loss": 0.0063,
342
+ "step": 39
343
+ },
344
+ {
345
+ "epoch": 5.714285714285714,
346
+ "grad_norm": 0.7634330987930298,
347
+ "learning_rate": 3.3333333333333335e-05,
348
+ "loss": 0.0176,
349
+ "step": 40
350
+ },
351
+ {
352
+ "epoch": 5.857142857142857,
353
+ "grad_norm": 0.48025190830230713,
354
+ "learning_rate": 3.277777777777778e-05,
355
+ "loss": 0.0228,
356
+ "step": 41
357
+ },
358
+ {
359
+ "epoch": 6.0,
360
+ "grad_norm": 0.21735815703868866,
361
+ "learning_rate": 3.222222222222223e-05,
362
+ "loss": 0.0034,
363
+ "step": 42
364
+ },
365
+ {
366
+ "epoch": 6.0,
367
+ "eval_accuracy": 0.983957219251337,
368
+ "eval_f1": 0.7058823529411765,
369
+ "eval_loss": 0.16237980127334595,
370
+ "eval_precision": 0.6923076923076923,
371
+ "eval_recall": 0.72,
372
+ "eval_runtime": 0.2281,
373
+ "eval_samples_per_second": 13.15,
374
+ "eval_steps_per_second": 13.15,
375
+ "step": 42
376
+ },
377
+ {
378
+ "epoch": 6.142857142857143,
379
+ "grad_norm": 1.1637978553771973,
380
+ "learning_rate": 3.1666666666666666e-05,
381
+ "loss": 0.0137,
382
+ "step": 43
383
+ },
384
+ {
385
+ "epoch": 6.285714285714286,
386
+ "grad_norm": 0.21785947680473328,
387
+ "learning_rate": 3.111111111111111e-05,
388
+ "loss": 0.0039,
389
+ "step": 44
390
+ },
391
+ {
392
+ "epoch": 6.428571428571429,
393
+ "grad_norm": 20.79886245727539,
394
+ "learning_rate": 3.055555555555556e-05,
395
+ "loss": 0.1387,
396
+ "step": 45
397
+ },
398
+ {
399
+ "epoch": 6.571428571428571,
400
+ "grad_norm": 7.047394752502441,
401
+ "learning_rate": 3e-05,
402
+ "loss": 0.019,
403
+ "step": 46
404
+ },
405
+ {
406
+ "epoch": 6.714285714285714,
407
+ "grad_norm": 0.3814934194087982,
408
+ "learning_rate": 2.9444444444444448e-05,
409
+ "loss": 0.0059,
410
+ "step": 47
411
+ },
412
+ {
413
+ "epoch": 6.857142857142857,
414
+ "grad_norm": 0.4642070233821869,
415
+ "learning_rate": 2.8888888888888888e-05,
416
+ "loss": 0.005,
417
+ "step": 48
418
+ },
419
+ {
420
+ "epoch": 7.0,
421
+ "grad_norm": 0.06280206888914108,
422
+ "learning_rate": 2.8333333333333335e-05,
423
+ "loss": 0.0023,
424
+ "step": 49
425
+ },
426
+ {
427
+ "epoch": 7.0,
428
+ "eval_accuracy": 0.9946524064171123,
429
+ "eval_f1": 0.830188679245283,
430
+ "eval_loss": 0.018056461587548256,
431
+ "eval_precision": 0.7857142857142857,
432
+ "eval_recall": 0.88,
433
+ "eval_runtime": 0.2418,
434
+ "eval_samples_per_second": 12.409,
435
+ "eval_steps_per_second": 12.409,
436
+ "step": 49
437
+ },
438
+ {
439
+ "epoch": 7.142857142857143,
440
+ "grad_norm": 0.042585261166095734,
441
+ "learning_rate": 2.777777777777778e-05,
442
+ "loss": 0.002,
443
+ "step": 50
444
+ },
445
+ {
446
+ "epoch": 7.285714285714286,
447
+ "grad_norm": 3.1652538776397705,
448
+ "learning_rate": 2.7222222222222223e-05,
449
+ "loss": 0.0085,
450
+ "step": 51
451
+ },
452
+ {
453
+ "epoch": 7.428571428571429,
454
+ "grad_norm": 0.997791051864624,
455
+ "learning_rate": 2.6666666666666667e-05,
456
+ "loss": 0.0154,
457
+ "step": 52
458
+ },
459
+ {
460
+ "epoch": 7.571428571428571,
461
+ "grad_norm": 0.2884855568408966,
462
+ "learning_rate": 2.6111111111111114e-05,
463
+ "loss": 0.0021,
464
+ "step": 53
465
+ },
466
+ {
467
+ "epoch": 7.714285714285714,
468
+ "grad_norm": 0.39060717821121216,
469
+ "learning_rate": 2.5555555555555554e-05,
470
+ "loss": 0.0026,
471
+ "step": 54
472
+ },
473
+ {
474
+ "epoch": 7.857142857142857,
475
+ "grad_norm": 15.578678131103516,
476
+ "learning_rate": 2.5e-05,
477
+ "loss": 0.031,
478
+ "step": 55
479
+ },
480
+ {
481
+ "epoch": 8.0,
482
+ "grad_norm": 0.1004723384976387,
483
+ "learning_rate": 2.4444444444444445e-05,
484
+ "loss": 0.0023,
485
+ "step": 56
486
+ },
487
+ {
488
+ "epoch": 8.0,
489
+ "eval_accuracy": 0.9946524064171123,
490
+ "eval_f1": 0.8627450980392156,
491
+ "eval_loss": 0.054031386971473694,
492
+ "eval_precision": 0.8461538461538461,
493
+ "eval_recall": 0.88,
494
+ "eval_runtime": 0.3115,
495
+ "eval_samples_per_second": 9.631,
496
+ "eval_steps_per_second": 9.631,
497
+ "step": 56
498
+ },
499
+ {
500
+ "epoch": 8.142857142857142,
501
+ "grad_norm": 0.5730704665184021,
502
+ "learning_rate": 2.3888888888888892e-05,
503
+ "loss": 0.005,
504
+ "step": 57
505
+ },
506
+ {
507
+ "epoch": 8.285714285714286,
508
+ "grad_norm": 0.08043497800827026,
509
+ "learning_rate": 2.3333333333333336e-05,
510
+ "loss": 0.0023,
511
+ "step": 58
512
+ },
513
+ {
514
+ "epoch": 8.428571428571429,
515
+ "grad_norm": 0.03008083440363407,
516
+ "learning_rate": 2.277777777777778e-05,
517
+ "loss": 0.0012,
518
+ "step": 59
519
+ },
520
+ {
521
+ "epoch": 8.571428571428571,
522
+ "grad_norm": 0.03248002007603645,
523
+ "learning_rate": 2.2222222222222223e-05,
524
+ "loss": 0.0008,
525
+ "step": 60
526
+ },
527
+ {
528
+ "epoch": 8.714285714285714,
529
+ "grad_norm": 0.04838703200221062,
530
+ "learning_rate": 2.1666666666666667e-05,
531
+ "loss": 0.0013,
532
+ "step": 61
533
+ },
534
+ {
535
+ "epoch": 8.857142857142858,
536
+ "grad_norm": 0.33141571283340454,
537
+ "learning_rate": 2.111111111111111e-05,
538
+ "loss": 0.0022,
539
+ "step": 62
540
+ },
541
+ {
542
+ "epoch": 9.0,
543
+ "grad_norm": 1.7117937803268433,
544
+ "learning_rate": 2.0555555555555555e-05,
545
+ "loss": 0.0069,
546
+ "step": 63
547
+ },
548
+ {
549
+ "epoch": 9.0,
550
+ "eval_accuracy": 0.9269162210338681,
551
+ "eval_f1": 0.5,
552
+ "eval_loss": 0.5584124326705933,
553
+ "eval_precision": 0.5789473684210527,
554
+ "eval_recall": 0.44,
555
+ "eval_runtime": 0.2368,
556
+ "eval_samples_per_second": 12.67,
557
+ "eval_steps_per_second": 12.67,
558
+ "step": 63
559
+ },
560
+ {
561
+ "epoch": 9.142857142857142,
562
+ "grad_norm": 0.3541106581687927,
563
+ "learning_rate": 2e-05,
564
+ "loss": 0.003,
565
+ "step": 64
566
+ },
567
+ {
568
+ "epoch": 9.285714285714286,
569
+ "grad_norm": 6.2997145652771,
570
+ "learning_rate": 1.9444444444444445e-05,
571
+ "loss": 0.0145,
572
+ "step": 65
573
+ },
574
+ {
575
+ "epoch": 9.428571428571429,
576
+ "grad_norm": 3.1364083290100098,
577
+ "learning_rate": 1.888888888888889e-05,
578
+ "loss": 0.0043,
579
+ "step": 66
580
+ },
581
+ {
582
+ "epoch": 9.571428571428571,
583
+ "grad_norm": 0.018303165212273598,
584
+ "learning_rate": 1.8333333333333333e-05,
585
+ "loss": 0.0007,
586
+ "step": 67
587
+ },
588
+ {
589
+ "epoch": 9.714285714285714,
590
+ "grad_norm": 0.06226564571261406,
591
+ "learning_rate": 1.777777777777778e-05,
592
+ "loss": 0.0011,
593
+ "step": 68
594
+ },
595
+ {
596
+ "epoch": 9.857142857142858,
597
+ "grad_norm": 0.18134748935699463,
598
+ "learning_rate": 1.7222222222222224e-05,
599
+ "loss": 0.0014,
600
+ "step": 69
601
+ },
602
+ {
603
+ "epoch": 10.0,
604
+ "grad_norm": 0.013242754153907299,
605
+ "learning_rate": 1.6666666666666667e-05,
606
+ "loss": 0.0006,
607
+ "step": 70
608
+ },
609
+ {
610
+ "epoch": 10.0,
611
+ "eval_accuracy": 1.0,
612
+ "eval_f1": 1.0,
613
+ "eval_loss": 0.004223768133670092,
614
+ "eval_precision": 1.0,
615
+ "eval_recall": 1.0,
616
+ "eval_runtime": 0.2664,
617
+ "eval_samples_per_second": 11.263,
618
+ "eval_steps_per_second": 11.263,
619
+ "step": 70
620
+ },
621
+ {
622
+ "epoch": 10.142857142857142,
623
+ "grad_norm": 0.5124610662460327,
624
+ "learning_rate": 1.6111111111111115e-05,
625
+ "loss": 0.0017,
626
+ "step": 71
627
+ },
628
+ {
629
+ "epoch": 10.285714285714286,
630
+ "grad_norm": 0.014037979766726494,
631
+ "learning_rate": 1.5555555555555555e-05,
632
+ "loss": 0.0008,
633
+ "step": 72
634
+ },
635
+ {
636
+ "epoch": 10.428571428571429,
637
+ "grad_norm": 0.020040445029735565,
638
+ "learning_rate": 1.5e-05,
639
+ "loss": 0.001,
640
+ "step": 73
641
+ },
642
+ {
643
+ "epoch": 10.571428571428571,
644
+ "grad_norm": 0.0177422147244215,
645
+ "learning_rate": 1.4444444444444444e-05,
646
+ "loss": 0.0007,
647
+ "step": 74
648
+ },
649
+ {
650
+ "epoch": 10.714285714285714,
651
+ "grad_norm": 0.02175714634358883,
652
+ "learning_rate": 1.388888888888889e-05,
653
+ "loss": 0.0011,
654
+ "step": 75
655
+ },
656
+ {
657
+ "epoch": 10.857142857142858,
658
+ "grad_norm": 0.016904808580875397,
659
+ "learning_rate": 1.3333333333333333e-05,
660
+ "loss": 0.0009,
661
+ "step": 76
662
+ },
663
+ {
664
+ "epoch": 11.0,
665
+ "grad_norm": 0.7300955653190613,
666
+ "learning_rate": 1.2777777777777777e-05,
667
+ "loss": 0.0026,
668
+ "step": 77
669
+ },
670
+ {
671
+ "epoch": 11.0,
672
+ "eval_accuracy": 1.0,
673
+ "eval_f1": 1.0,
674
+ "eval_loss": 0.0016724281013011932,
675
+ "eval_precision": 1.0,
676
+ "eval_recall": 1.0,
677
+ "eval_runtime": 0.2244,
678
+ "eval_samples_per_second": 13.369,
679
+ "eval_steps_per_second": 13.369,
680
+ "step": 77
681
+ },
682
+ {
683
+ "epoch": 11.142857142857142,
684
+ "grad_norm": 0.01708046905696392,
685
+ "learning_rate": 1.2222222222222222e-05,
686
+ "loss": 0.001,
687
+ "step": 78
688
+ },
689
+ {
690
+ "epoch": 11.285714285714286,
691
+ "grad_norm": 0.025557199493050575,
692
+ "learning_rate": 1.1666666666666668e-05,
693
+ "loss": 0.0011,
694
+ "step": 79
695
+ },
696
+ {
697
+ "epoch": 11.428571428571429,
698
+ "grad_norm": 0.015760347247123718,
699
+ "learning_rate": 1.1111111111111112e-05,
700
+ "loss": 0.0008,
701
+ "step": 80
702
+ },
703
+ {
704
+ "epoch": 11.571428571428571,
705
+ "grad_norm": 0.4625304341316223,
706
+ "learning_rate": 1.0555555555555555e-05,
707
+ "loss": 0.002,
708
+ "step": 81
709
+ },
710
+ {
711
+ "epoch": 11.714285714285714,
712
+ "grad_norm": 0.02315388433635235,
713
+ "learning_rate": 1e-05,
714
+ "loss": 0.0009,
715
+ "step": 82
716
+ },
717
+ {
718
+ "epoch": 11.857142857142858,
719
+ "grad_norm": 0.013032798655331135,
720
+ "learning_rate": 9.444444444444445e-06,
721
+ "loss": 0.0006,
722
+ "step": 83
723
+ },
724
+ {
725
+ "epoch": 12.0,
726
+ "grad_norm": 0.01681060716509819,
727
+ "learning_rate": 8.88888888888889e-06,
728
+ "loss": 0.0007,
729
+ "step": 84
730
+ },
731
+ {
732
+ "epoch": 12.0,
733
+ "eval_accuracy": 0.9982174688057041,
734
+ "eval_f1": 0.9411764705882353,
735
+ "eval_loss": 0.006076255813241005,
736
+ "eval_precision": 0.9230769230769231,
737
+ "eval_recall": 0.96,
738
+ "eval_runtime": 0.2313,
739
+ "eval_samples_per_second": 12.968,
740
+ "eval_steps_per_second": 12.968,
741
+ "step": 84
742
+ },
743
+ {
744
+ "epoch": 12.142857142857142,
745
+ "grad_norm": 0.021738462150096893,
746
+ "learning_rate": 8.333333333333334e-06,
747
+ "loss": 0.0012,
748
+ "step": 85
749
+ },
750
+ {
751
+ "epoch": 12.285714285714286,
752
+ "grad_norm": 0.07413913309574127,
753
+ "learning_rate": 7.777777777777777e-06,
754
+ "loss": 0.001,
755
+ "step": 86
756
+ },
757
+ {
758
+ "epoch": 12.428571428571429,
759
+ "grad_norm": 0.015694081783294678,
760
+ "learning_rate": 7.222222222222222e-06,
761
+ "loss": 0.0007,
762
+ "step": 87
763
+ },
764
+ {
765
+ "epoch": 12.571428571428571,
766
+ "grad_norm": 0.07455814629793167,
767
+ "learning_rate": 6.666666666666667e-06,
768
+ "loss": 0.0011,
769
+ "step": 88
770
+ },
771
+ {
772
+ "epoch": 12.714285714285714,
773
+ "grad_norm": 2.512777805328369,
774
+ "learning_rate": 6.111111111111111e-06,
775
+ "loss": 0.002,
776
+ "step": 89
777
+ },
778
+ {
779
+ "epoch": 12.857142857142858,
780
+ "grad_norm": 0.020945778116583824,
781
+ "learning_rate": 5.555555555555556e-06,
782
+ "loss": 0.0008,
783
+ "step": 90
784
+ },
785
+ {
786
+ "epoch": 13.0,
787
+ "grad_norm": 0.014726142399013042,
788
+ "learning_rate": 5e-06,
789
+ "loss": 0.0006,
790
+ "step": 91
791
+ },
792
+ {
793
+ "epoch": 13.0,
794
+ "eval_accuracy": 0.9982174688057041,
795
+ "eval_f1": 0.9411764705882353,
796
+ "eval_loss": 0.010821976698935032,
797
+ "eval_precision": 0.9230769230769231,
798
+ "eval_recall": 0.96,
799
+ "eval_runtime": 0.2328,
800
+ "eval_samples_per_second": 12.888,
801
+ "eval_steps_per_second": 12.888,
802
+ "step": 91
803
+ },
804
+ {
805
+ "epoch": 13.142857142857142,
806
+ "grad_norm": 0.022652102634310722,
807
+ "learning_rate": 4.444444444444445e-06,
808
+ "loss": 0.0007,
809
+ "step": 92
810
+ },
811
+ {
812
+ "epoch": 13.285714285714286,
813
+ "grad_norm": 0.0192624032497406,
814
+ "learning_rate": 3.888888888888889e-06,
815
+ "loss": 0.0009,
816
+ "step": 93
817
+ },
818
+ {
819
+ "epoch": 13.428571428571429,
820
+ "grad_norm": 0.03646668419241905,
821
+ "learning_rate": 3.3333333333333333e-06,
822
+ "loss": 0.0009,
823
+ "step": 94
824
+ },
825
+ {
826
+ "epoch": 13.571428571428571,
827
+ "grad_norm": 0.02297368273139,
828
+ "learning_rate": 2.777777777777778e-06,
829
+ "loss": 0.0006,
830
+ "step": 95
831
+ },
832
+ {
833
+ "epoch": 13.714285714285714,
834
+ "grad_norm": 0.013909080997109413,
835
+ "learning_rate": 2.2222222222222225e-06,
836
+ "loss": 0.0006,
837
+ "step": 96
838
+ },
839
+ {
840
+ "epoch": 13.857142857142858,
841
+ "grad_norm": 0.021894708275794983,
842
+ "learning_rate": 1.6666666666666667e-06,
843
+ "loss": 0.0008,
844
+ "step": 97
845
+ },
846
+ {
847
+ "epoch": 14.0,
848
+ "grad_norm": 0.019053662195801735,
849
+ "learning_rate": 1.1111111111111112e-06,
850
+ "loss": 0.0011,
851
+ "step": 98
852
+ },
853
+ {
854
+ "epoch": 14.0,
855
+ "eval_accuracy": 0.9982174688057041,
856
+ "eval_f1": 0.9411764705882353,
857
+ "eval_loss": 0.011927765794098377,
858
+ "eval_precision": 0.9230769230769231,
859
+ "eval_recall": 0.96,
860
+ "eval_runtime": 0.2367,
861
+ "eval_samples_per_second": 12.675,
862
+ "eval_steps_per_second": 12.675,
863
+ "step": 98
864
+ },
865
+ {
866
+ "epoch": 14.142857142857142,
867
+ "grad_norm": 0.015299996361136436,
868
+ "learning_rate": 5.555555555555556e-07,
869
+ "loss": 0.0007,
870
+ "step": 99
871
+ },
872
+ {
873
+ "epoch": 14.285714285714286,
874
+ "grad_norm": 0.06155708432197571,
875
+ "learning_rate": 0.0,
876
+ "loss": 0.0009,
877
+ "step": 100
878
+ },
879
+ {
880
+ "epoch": 14.285714285714286,
881
+ "eval_accuracy": 0.9982174688057041,
882
+ "eval_f1": 0.9411764705882353,
883
+ "eval_loss": 0.011938165873289108,
884
+ "eval_precision": 0.9230769230769231,
885
+ "eval_recall": 0.96,
886
+ "eval_runtime": 0.459,
887
+ "eval_samples_per_second": 6.535,
888
+ "eval_steps_per_second": 6.535,
889
+ "step": 100
890
+ }
891
+ ],
892
+ "logging_steps": 1,
893
+ "max_steps": 100,
894
+ "num_input_tokens_seen": 0,
895
+ "num_train_epochs": 15,
896
+ "save_steps": 500,
897
+ "stateful_callbacks": {
898
+ "TrainerControl": {
899
+ "args": {
900
+ "should_epoch_stop": false,
901
+ "should_evaluate": false,
902
+ "should_log": false,
903
+ "should_save": true,
904
+ "should_training_stop": true
905
+ },
906
+ "attributes": {}
907
+ }
908
+ },
909
+ "total_flos": 52719651225600.0,
910
+ "train_batch_size": 1,
911
+ "trial_name": null,
912
+ "trial_params": null
913
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81f1ef4de374f0e50f7535fc7aef0393aeb38851bfb2b5f2a7bad6c19b9a6811
3
+ size 5304
vocab.json ADDED
The diff for this file is too large to render. See raw diff