lkw2024 commited on
Commit
7646221
·
verified ·
1 Parent(s): 5f33521

Upload 11 files

Browse files
config.json CHANGED
@@ -1,12 +1,11 @@
1
  {
2
- "_name_or_path": "skt/kobert-base-v1",
3
  "architectures": [
4
- "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
- "author": "Heewon Jeon([email protected])",
8
  "classifier_dropout": null,
9
- "gradient_checkpointing": false,
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
12
  "hidden_size": 768,
@@ -17,7 +16,6 @@
17
  },
18
  "initializer_range": 0.02,
19
  "intermediate_size": 3072,
20
- "kobert_version": 1.0,
21
  "label2id": {
22
  "LABEL_0": 0,
23
  "LABEL_1": 1,
@@ -25,15 +23,19 @@
25
  },
26
  "layer_norm_eps": 1e-12,
27
  "max_position_embeddings": 512,
28
- "model_type": "bert",
29
  "num_attention_heads": 12,
30
  "num_hidden_layers": 12,
31
- "pad_token_id": 1,
32
  "position_embedding_type": "absolute",
33
  "problem_type": "single_label_classification",
 
 
 
 
34
  "torch_dtype": "float32",
35
  "transformers_version": "4.45.1",
36
  "type_vocab_size": 2,
37
  "use_cache": true,
38
- "vocab_size": 8002
39
  }
 
1
  {
2
+ "_name_or_path": "monologg/koelectra-base-v3-discriminator",
3
  "architectures": [
4
+ "ElectraForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
 
7
  "classifier_dropout": null,
8
+ "embedding_size": 768,
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 768,
 
16
  },
17
  "initializer_range": 0.02,
18
  "intermediate_size": 3072,
 
19
  "label2id": {
20
  "LABEL_0": 0,
21
  "LABEL_1": 1,
 
23
  },
24
  "layer_norm_eps": 1e-12,
25
  "max_position_embeddings": 512,
26
+ "model_type": "electra",
27
  "num_attention_heads": 12,
28
  "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
  "position_embedding_type": "absolute",
31
  "problem_type": "single_label_classification",
32
+ "summary_activation": "gelu",
33
+ "summary_last_dropout": 0.1,
34
+ "summary_type": "first",
35
+ "summary_use_proj": true,
36
  "torch_dtype": "float32",
37
  "transformers_version": "4.45.1",
38
  "type_vocab_size": 2,
39
  "use_cache": true,
40
+ "vocab_size": 35000
41
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f82d7a65656d1912f2914baa8290c48b9c55044f1f05bcafb8e1ee2590b9dd0a
3
- size 368780204
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ce6384be92f1c7f441e7d36892fd4af2d8942446e7ec62ab1300075d6eead1
3
+ size 451718748
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5080a9a6edc7288b55de6518de49d2b65f4886fd9e8e3d9c400a0b0a9920cdda
3
+ size 903551738
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bdd138de6a56a2479d7804e54bf21693ad07775ca65e9e18d4fd68b3b67efc
3
+ size 13990
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65fe7bfd1f7a265858dbc13c464e7304a5cf563b67c7f8c8477bb66cb89c2811
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -41,7 +41,7 @@
41
  "special": true
42
  }
43
  },
44
- "clean_up_tokenization_spaces": false,
45
  "cls_token": "[CLS]",
46
  "do_basic_tokenize": true,
47
  "do_lower_case": false,
@@ -53,6 +53,6 @@
53
  "strip_accents": null,
54
  "timeout": 60,
55
  "tokenize_chinese_chars": true,
56
- "tokenizer_class": "BertTokenizer",
57
  "unk_token": "[UNK]"
58
  }
 
41
  "special": true
42
  }
43
  },
44
+ "clean_up_tokenization_spaces": true,
45
  "cls_token": "[CLS]",
46
  "do_basic_tokenize": true,
47
  "do_lower_case": false,
 
53
  "strip_accents": null,
54
  "timeout": 60,
55
  "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "ElectraTokenizer",
57
  "unk_token": "[UNK]"
58
  }
trainer_state.json ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.9008317338451696,
3
+ "best_model_checkpoint": "./checkpoints/monologg_koelectra-base-v3-discriminator\\checkpoint-4689",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4689,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03198976327575176,
13
+ "grad_norm": 1.8438289165496826,
14
+ "learning_rate": 1.9786734911494988e-05,
15
+ "loss": 0.9998,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.06397952655150352,
20
+ "grad_norm": 4.708126544952393,
21
+ "learning_rate": 1.9573469822989978e-05,
22
+ "loss": 0.8181,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.09596928982725528,
27
+ "grad_norm": 7.408482551574707,
28
+ "learning_rate": 1.9360204734484968e-05,
29
+ "loss": 0.6265,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.12795905310300704,
34
+ "grad_norm": 6.211757183074951,
35
+ "learning_rate": 1.9146939645979955e-05,
36
+ "loss": 0.5046,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.1599488163787588,
41
+ "grad_norm": 7.470839500427246,
42
+ "learning_rate": 1.8933674557474945e-05,
43
+ "loss": 0.5358,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.19193857965451055,
48
+ "grad_norm": 2.0797882080078125,
49
+ "learning_rate": 1.872040946896993e-05,
50
+ "loss": 0.483,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.22392834293026231,
55
+ "grad_norm": 14.55879020690918,
56
+ "learning_rate": 1.8507144380464918e-05,
57
+ "loss": 0.4462,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.2559181062060141,
62
+ "grad_norm": 3.5867443084716797,
63
+ "learning_rate": 1.8293879291959908e-05,
64
+ "loss": 0.4348,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.28790786948176583,
69
+ "grad_norm": 11.995755195617676,
70
+ "learning_rate": 1.8080614203454897e-05,
71
+ "loss": 0.4236,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.3198976327575176,
76
+ "grad_norm": 7.211252689361572,
77
+ "learning_rate": 1.7867349114949884e-05,
78
+ "loss": 0.4099,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.35188739603326935,
83
+ "grad_norm": 5.884838581085205,
84
+ "learning_rate": 1.7654084026444874e-05,
85
+ "loss": 0.4265,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.3838771593090211,
90
+ "grad_norm": 4.443331241607666,
91
+ "learning_rate": 1.744081893793986e-05,
92
+ "loss": 0.3863,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.41586692258477287,
97
+ "grad_norm": 3.68717622756958,
98
+ "learning_rate": 1.7227553849434847e-05,
99
+ "loss": 0.4244,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.44785668586052463,
104
+ "grad_norm": 4.4258599281311035,
105
+ "learning_rate": 1.7014288760929837e-05,
106
+ "loss": 0.3701,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.4798464491362764,
111
+ "grad_norm": 8.484159469604492,
112
+ "learning_rate": 1.6801023672424827e-05,
113
+ "loss": 0.4079,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.5118362124120281,
118
+ "grad_norm": 8.976361274719238,
119
+ "learning_rate": 1.6587758583919813e-05,
120
+ "loss": 0.3413,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5438259756877799,
125
+ "grad_norm": 7.113468170166016,
126
+ "learning_rate": 1.6374493495414803e-05,
127
+ "loss": 0.3928,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5758157389635317,
132
+ "grad_norm": 7.242956161499023,
133
+ "learning_rate": 1.616122840690979e-05,
134
+ "loss": 0.4013,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.6078055022392834,
139
+ "grad_norm": 6.204492092132568,
140
+ "learning_rate": 1.5947963318404776e-05,
141
+ "loss": 0.3838,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6397952655150352,
146
+ "grad_norm": 5.312352180480957,
147
+ "learning_rate": 1.5734698229899766e-05,
148
+ "loss": 0.3838,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6717850287907869,
153
+ "grad_norm": 22.331918716430664,
154
+ "learning_rate": 1.5521433141394756e-05,
155
+ "loss": 0.3618,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.7037747920665387,
160
+ "grad_norm": 11.372283935546875,
161
+ "learning_rate": 1.5308168052889743e-05,
162
+ "loss": 0.3486,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7357645553422905,
167
+ "grad_norm": 31.921972274780273,
168
+ "learning_rate": 1.5094902964384733e-05,
169
+ "loss": 0.3807,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.7677543186180422,
174
+ "grad_norm": 6.360013484954834,
175
+ "learning_rate": 1.488163787587972e-05,
176
+ "loss": 0.3903,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.799744081893794,
181
+ "grad_norm": 5.0191330909729,
182
+ "learning_rate": 1.4668372787374708e-05,
183
+ "loss": 0.3477,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8317338451695457,
188
+ "grad_norm": 20.319231033325195,
189
+ "learning_rate": 1.4455107698869698e-05,
190
+ "loss": 0.3193,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.8637236084452975,
195
+ "grad_norm": 3.2279179096221924,
196
+ "learning_rate": 1.4241842610364684e-05,
197
+ "loss": 0.3583,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.8957133717210493,
202
+ "grad_norm": 5.339118480682373,
203
+ "learning_rate": 1.4028577521859672e-05,
204
+ "loss": 0.3343,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.927703134996801,
209
+ "grad_norm": 3.4250569343566895,
210
+ "learning_rate": 1.3815312433354662e-05,
211
+ "loss": 0.3756,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9596928982725528,
216
+ "grad_norm": 7.723750591278076,
217
+ "learning_rate": 1.3602047344849649e-05,
218
+ "loss": 0.3342,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.9916826615483045,
223
+ "grad_norm": 9.071324348449707,
224
+ "learning_rate": 1.3388782256344637e-05,
225
+ "loss": 0.3282,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 1.0,
230
+ "eval_accuracy": 0.8848368522072937,
231
+ "eval_f1": 0.8749968140479337,
232
+ "eval_loss": 0.3197398781776428,
233
+ "eval_runtime": 69.8877,
234
+ "eval_samples_per_second": 44.729,
235
+ "eval_steps_per_second": 2.804,
236
+ "step": 1563
237
+ },
238
+ {
239
+ "epoch": 1.0236724248240563,
240
+ "grad_norm": 3.9245243072509766,
241
+ "learning_rate": 1.3175517167839627e-05,
242
+ "loss": 0.2311,
243
+ "step": 1600
244
+ },
245
+ {
246
+ "epoch": 1.055662188099808,
247
+ "grad_norm": 13.951614379882812,
248
+ "learning_rate": 1.2962252079334613e-05,
249
+ "loss": 0.2844,
250
+ "step": 1650
251
+ },
252
+ {
253
+ "epoch": 1.0876519513755598,
254
+ "grad_norm": 8.549053192138672,
255
+ "learning_rate": 1.2748986990829602e-05,
256
+ "loss": 0.2573,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 1.1196417146513116,
261
+ "grad_norm": 7.1960835456848145,
262
+ "learning_rate": 1.2535721902324592e-05,
263
+ "loss": 0.2515,
264
+ "step": 1750
265
+ },
266
+ {
267
+ "epoch": 1.1516314779270633,
268
+ "grad_norm": 3.4280333518981934,
269
+ "learning_rate": 1.2322456813819578e-05,
270
+ "loss": 0.3032,
271
+ "step": 1800
272
+ },
273
+ {
274
+ "epoch": 1.183621241202815,
275
+ "grad_norm": 7.896517753601074,
276
+ "learning_rate": 1.2109191725314566e-05,
277
+ "loss": 0.2503,
278
+ "step": 1850
279
+ },
280
+ {
281
+ "epoch": 1.2156110044785668,
282
+ "grad_norm": 3.302367687225342,
283
+ "learning_rate": 1.1895926636809556e-05,
284
+ "loss": 0.2815,
285
+ "step": 1900
286
+ },
287
+ {
288
+ "epoch": 1.2476007677543186,
289
+ "grad_norm": 7.449910640716553,
290
+ "learning_rate": 1.1682661548304543e-05,
291
+ "loss": 0.2483,
292
+ "step": 1950
293
+ },
294
+ {
295
+ "epoch": 1.2795905310300704,
296
+ "grad_norm": 6.277085304260254,
297
+ "learning_rate": 1.1469396459799531e-05,
298
+ "loss": 0.2709,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 1.3115802943058221,
303
+ "grad_norm": 4.672796726226807,
304
+ "learning_rate": 1.1256131371294521e-05,
305
+ "loss": 0.2555,
306
+ "step": 2050
307
+ },
308
+ {
309
+ "epoch": 1.3435700575815739,
310
+ "grad_norm": 11.666603088378906,
311
+ "learning_rate": 1.1042866282789508e-05,
312
+ "loss": 0.2381,
313
+ "step": 2100
314
+ },
315
+ {
316
+ "epoch": 1.3755598208573256,
317
+ "grad_norm": 9.201951026916504,
318
+ "learning_rate": 1.0829601194284496e-05,
319
+ "loss": 0.2762,
320
+ "step": 2150
321
+ },
322
+ {
323
+ "epoch": 1.4075495841330774,
324
+ "grad_norm": 2.2189481258392334,
325
+ "learning_rate": 1.0616336105779486e-05,
326
+ "loss": 0.2466,
327
+ "step": 2200
328
+ },
329
+ {
330
+ "epoch": 1.4395393474088292,
331
+ "grad_norm": 8.02763557434082,
332
+ "learning_rate": 1.0403071017274472e-05,
333
+ "loss": 0.2556,
334
+ "step": 2250
335
+ },
336
+ {
337
+ "epoch": 1.471529110684581,
338
+ "grad_norm": 2.0652530193328857,
339
+ "learning_rate": 1.018980592876946e-05,
340
+ "loss": 0.2789,
341
+ "step": 2300
342
+ },
343
+ {
344
+ "epoch": 1.5035188739603327,
345
+ "grad_norm": 13.113646507263184,
346
+ "learning_rate": 9.976540840264449e-06,
347
+ "loss": 0.2235,
348
+ "step": 2350
349
+ },
350
+ {
351
+ "epoch": 1.5355086372360844,
352
+ "grad_norm": 10.362279891967773,
353
+ "learning_rate": 9.763275751759437e-06,
354
+ "loss": 0.2482,
355
+ "step": 2400
356
+ },
357
+ {
358
+ "epoch": 1.5674984005118362,
359
+ "grad_norm": 1.568915843963623,
360
+ "learning_rate": 9.550010663254427e-06,
361
+ "loss": 0.2622,
362
+ "step": 2450
363
+ },
364
+ {
365
+ "epoch": 1.599488163787588,
366
+ "grad_norm": 10.095715522766113,
367
+ "learning_rate": 9.336745574749414e-06,
368
+ "loss": 0.2668,
369
+ "step": 2500
370
+ },
371
+ {
372
+ "epoch": 1.6314779270633397,
373
+ "grad_norm": 12.999874114990234,
374
+ "learning_rate": 9.123480486244403e-06,
375
+ "loss": 0.2361,
376
+ "step": 2550
377
+ },
378
+ {
379
+ "epoch": 1.6634676903390915,
380
+ "grad_norm": 7.512565612792969,
381
+ "learning_rate": 8.910215397739392e-06,
382
+ "loss": 0.2646,
383
+ "step": 2600
384
+ },
385
+ {
386
+ "epoch": 1.6954574536148432,
387
+ "grad_norm": 6.645097732543945,
388
+ "learning_rate": 8.696950309234378e-06,
389
+ "loss": 0.2478,
390
+ "step": 2650
391
+ },
392
+ {
393
+ "epoch": 1.727447216890595,
394
+ "grad_norm": 4.22629976272583,
395
+ "learning_rate": 8.483685220729368e-06,
396
+ "loss": 0.2686,
397
+ "step": 2700
398
+ },
399
+ {
400
+ "epoch": 1.7594369801663468,
401
+ "grad_norm": 8.55104923248291,
402
+ "learning_rate": 8.270420132224356e-06,
403
+ "loss": 0.2615,
404
+ "step": 2750
405
+ },
406
+ {
407
+ "epoch": 1.7914267434420985,
408
+ "grad_norm": 4.150308132171631,
409
+ "learning_rate": 8.057155043719343e-06,
410
+ "loss": 0.2669,
411
+ "step": 2800
412
+ },
413
+ {
414
+ "epoch": 1.8234165067178503,
415
+ "grad_norm": 15.736576080322266,
416
+ "learning_rate": 7.843889955214333e-06,
417
+ "loss": 0.252,
418
+ "step": 2850
419
+ },
420
+ {
421
+ "epoch": 1.855406269993602,
422
+ "grad_norm": 1.219308853149414,
423
+ "learning_rate": 7.630624866709321e-06,
424
+ "loss": 0.2619,
425
+ "step": 2900
426
+ },
427
+ {
428
+ "epoch": 1.8873960332693538,
429
+ "grad_norm": 4.5792999267578125,
430
+ "learning_rate": 7.4173597782043085e-06,
431
+ "loss": 0.2516,
432
+ "step": 2950
433
+ },
434
+ {
435
+ "epoch": 1.9193857965451055,
436
+ "grad_norm": 7.346240043640137,
437
+ "learning_rate": 7.204094689699297e-06,
438
+ "loss": 0.2374,
439
+ "step": 3000
440
+ },
441
+ {
442
+ "epoch": 1.9513755598208573,
443
+ "grad_norm": 15.958673477172852,
444
+ "learning_rate": 6.990829601194286e-06,
445
+ "loss": 0.2568,
446
+ "step": 3050
447
+ },
448
+ {
449
+ "epoch": 1.983365323096609,
450
+ "grad_norm": 8.143491744995117,
451
+ "learning_rate": 6.777564512689273e-06,
452
+ "loss": 0.2196,
453
+ "step": 3100
454
+ },
455
+ {
456
+ "epoch": 2.0,
457
+ "eval_accuracy": 0.8902751119641714,
458
+ "eval_f1": 0.8821468217693207,
459
+ "eval_loss": 0.35885584354400635,
460
+ "eval_runtime": 70.4611,
461
+ "eval_samples_per_second": 44.365,
462
+ "eval_steps_per_second": 2.782,
463
+ "step": 3126
464
+ },
465
+ {
466
+ "epoch": 2.015355086372361,
467
+ "grad_norm": 2.4161245822906494,
468
+ "learning_rate": 6.5642994241842614e-06,
469
+ "loss": 0.211,
470
+ "step": 3150
471
+ },
472
+ {
473
+ "epoch": 2.0473448496481126,
474
+ "grad_norm": 6.2580156326293945,
475
+ "learning_rate": 6.3510343356792505e-06,
476
+ "loss": 0.1819,
477
+ "step": 3200
478
+ },
479
+ {
480
+ "epoch": 2.0793346129238643,
481
+ "grad_norm": 17.149974822998047,
482
+ "learning_rate": 6.137769247174238e-06,
483
+ "loss": 0.188,
484
+ "step": 3250
485
+ },
486
+ {
487
+ "epoch": 2.111324376199616,
488
+ "grad_norm": 4.2034382820129395,
489
+ "learning_rate": 5.924504158669226e-06,
490
+ "loss": 0.1653,
491
+ "step": 3300
492
+ },
493
+ {
494
+ "epoch": 2.143314139475368,
495
+ "grad_norm": 7.357328414916992,
496
+ "learning_rate": 5.711239070164215e-06,
497
+ "loss": 0.1748,
498
+ "step": 3350
499
+ },
500
+ {
501
+ "epoch": 2.1753039027511196,
502
+ "grad_norm": 13.677566528320312,
503
+ "learning_rate": 5.497973981659203e-06,
504
+ "loss": 0.2177,
505
+ "step": 3400
506
+ },
507
+ {
508
+ "epoch": 2.2072936660268714,
509
+ "grad_norm": 8.67300033569336,
510
+ "learning_rate": 5.284708893154191e-06,
511
+ "loss": 0.1571,
512
+ "step": 3450
513
+ },
514
+ {
515
+ "epoch": 2.239283429302623,
516
+ "grad_norm": 22.893770217895508,
517
+ "learning_rate": 5.07144380464918e-06,
518
+ "loss": 0.2009,
519
+ "step": 3500
520
+ },
521
+ {
522
+ "epoch": 2.271273192578375,
523
+ "grad_norm": 3.602328300476074,
524
+ "learning_rate": 4.858178716144167e-06,
525
+ "loss": 0.1329,
526
+ "step": 3550
527
+ },
528
+ {
529
+ "epoch": 2.3032629558541267,
530
+ "grad_norm": 18.8084659576416,
531
+ "learning_rate": 4.644913627639156e-06,
532
+ "loss": 0.1814,
533
+ "step": 3600
534
+ },
535
+ {
536
+ "epoch": 2.3352527191298784,
537
+ "grad_norm": 0.4535435736179352,
538
+ "learning_rate": 4.431648539134144e-06,
539
+ "loss": 0.2069,
540
+ "step": 3650
541
+ },
542
+ {
543
+ "epoch": 2.36724248240563,
544
+ "grad_norm": 11.971237182617188,
545
+ "learning_rate": 4.218383450629132e-06,
546
+ "loss": 0.188,
547
+ "step": 3700
548
+ },
549
+ {
550
+ "epoch": 2.399232245681382,
551
+ "grad_norm": 5.598160266876221,
552
+ "learning_rate": 4.005118362124121e-06,
553
+ "loss": 0.1943,
554
+ "step": 3750
555
+ },
556
+ {
557
+ "epoch": 2.4312220089571337,
558
+ "grad_norm": 11.97499942779541,
559
+ "learning_rate": 3.791853273619109e-06,
560
+ "loss": 0.2155,
561
+ "step": 3800
562
+ },
563
+ {
564
+ "epoch": 2.4632117722328855,
565
+ "grad_norm": 9.149175643920898,
566
+ "learning_rate": 3.5785881851140968e-06,
567
+ "loss": 0.2025,
568
+ "step": 3850
569
+ },
570
+ {
571
+ "epoch": 2.495201535508637,
572
+ "grad_norm": 7.965933799743652,
573
+ "learning_rate": 3.3653230966090854e-06,
574
+ "loss": 0.2121,
575
+ "step": 3900
576
+ },
577
+ {
578
+ "epoch": 2.527191298784389,
579
+ "grad_norm": 9.397235870361328,
580
+ "learning_rate": 3.1520580081040737e-06,
581
+ "loss": 0.1724,
582
+ "step": 3950
583
+ },
584
+ {
585
+ "epoch": 2.5591810620601407,
586
+ "grad_norm": 14.580031394958496,
587
+ "learning_rate": 2.9387929195990615e-06,
588
+ "loss": 0.1636,
589
+ "step": 4000
590
+ },
591
+ {
592
+ "epoch": 2.5911708253358925,
593
+ "grad_norm": 2.2896785736083984,
594
+ "learning_rate": 2.72552783109405e-06,
595
+ "loss": 0.1603,
596
+ "step": 4050
597
+ },
598
+ {
599
+ "epoch": 2.6231605886116443,
600
+ "grad_norm": 22.936344146728516,
601
+ "learning_rate": 2.5122627425890384e-06,
602
+ "loss": 0.1944,
603
+ "step": 4100
604
+ },
605
+ {
606
+ "epoch": 2.655150351887396,
607
+ "grad_norm": 37.436187744140625,
608
+ "learning_rate": 2.2989976540840266e-06,
609
+ "loss": 0.2059,
610
+ "step": 4150
611
+ },
612
+ {
613
+ "epoch": 2.6871401151631478,
614
+ "grad_norm": 26.6770076751709,
615
+ "learning_rate": 2.085732565579015e-06,
616
+ "loss": 0.1448,
617
+ "step": 4200
618
+ },
619
+ {
620
+ "epoch": 2.7191298784388995,
621
+ "grad_norm": 20.561311721801758,
622
+ "learning_rate": 1.872467477074003e-06,
623
+ "loss": 0.2094,
624
+ "step": 4250
625
+ },
626
+ {
627
+ "epoch": 2.7511196417146513,
628
+ "grad_norm": 4.114218711853027,
629
+ "learning_rate": 1.6592023885689915e-06,
630
+ "loss": 0.1682,
631
+ "step": 4300
632
+ },
633
+ {
634
+ "epoch": 2.783109404990403,
635
+ "grad_norm": 27.9166202545166,
636
+ "learning_rate": 1.4459373000639796e-06,
637
+ "loss": 0.1931,
638
+ "step": 4350
639
+ },
640
+ {
641
+ "epoch": 2.815099168266155,
642
+ "grad_norm": 13.576592445373535,
643
+ "learning_rate": 1.2326722115589678e-06,
644
+ "loss": 0.1989,
645
+ "step": 4400
646
+ },
647
+ {
648
+ "epoch": 2.8470889315419066,
649
+ "grad_norm": 4.428896427154541,
650
+ "learning_rate": 1.019407123053956e-06,
651
+ "loss": 0.18,
652
+ "step": 4450
653
+ },
654
+ {
655
+ "epoch": 2.8790786948176583,
656
+ "grad_norm": 10.087977409362793,
657
+ "learning_rate": 8.061420345489445e-07,
658
+ "loss": 0.2102,
659
+ "step": 4500
660
+ },
661
+ {
662
+ "epoch": 2.91106845809341,
663
+ "grad_norm": 16.517681121826172,
664
+ "learning_rate": 5.928769460439326e-07,
665
+ "loss": 0.1662,
666
+ "step": 4550
667
+ },
668
+ {
669
+ "epoch": 2.943058221369162,
670
+ "grad_norm": 15.329133987426758,
671
+ "learning_rate": 3.796118575389209e-07,
672
+ "loss": 0.1997,
673
+ "step": 4600
674
+ },
675
+ {
676
+ "epoch": 2.9750479846449136,
677
+ "grad_norm": 18.751319885253906,
678
+ "learning_rate": 1.6634676903390917e-07,
679
+ "loss": 0.1924,
680
+ "step": 4650
681
+ },
682
+ {
683
+ "epoch": 3.0,
684
+ "eval_accuracy": 0.9008317338451696,
685
+ "eval_f1": 0.892446816336566,
686
+ "eval_loss": 0.3720133900642395,
687
+ "eval_runtime": 71.8708,
688
+ "eval_samples_per_second": 43.495,
689
+ "eval_steps_per_second": 2.727,
690
+ "step": 4689
691
+ }
692
+ ],
693
+ "logging_steps": 50,
694
+ "max_steps": 4689,
695
+ "num_input_tokens_seen": 0,
696
+ "num_train_epochs": 3,
697
+ "save_steps": 500,
698
+ "stateful_callbacks": {
699
+ "TrainerControl": {
700
+ "args": {
701
+ "should_epoch_stop": false,
702
+ "should_evaluate": false,
703
+ "should_log": false,
704
+ "should_save": true,
705
+ "should_training_stop": true
706
+ },
707
+ "attributes": {}
708
+ }
709
+ },
710
+ "total_flos": 837402132857850.0,
711
+ "train_batch_size": 16,
712
+ "trial_name": null,
713
+ "trial_params": null
714
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80719dde8ee12368c548b9d0fd820c908d79ccc5612eb348afb50e76be1caccc
3
+ size 5240
vocab.txt CHANGED
The diff for this file is too large to render. See raw diff