lkw2024 commited on
Commit
1a82a3e
·
verified ·
1 Parent(s): 7646221

Upload 11 files

Browse files
config.json CHANGED
@@ -1,11 +1,12 @@
1
  {
2
- "_name_or_path": "monologg/koelectra-base-v3-discriminator",
3
  "architectures": [
4
- "ElectraForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
 
7
  "classifier_dropout": null,
8
- "embedding_size": 768,
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 768,
@@ -16,6 +17,7 @@
16
  },
17
  "initializer_range": 0.02,
18
  "intermediate_size": 3072,
 
19
  "label2id": {
20
  "LABEL_0": 0,
21
  "LABEL_1": 1,
@@ -23,19 +25,15 @@
23
  },
24
  "layer_norm_eps": 1e-12,
25
  "max_position_embeddings": 512,
26
- "model_type": "electra",
27
  "num_attention_heads": 12,
28
  "num_hidden_layers": 12,
29
- "pad_token_id": 0,
30
  "position_embedding_type": "absolute",
31
  "problem_type": "single_label_classification",
32
- "summary_activation": "gelu",
33
- "summary_last_dropout": 0.1,
34
- "summary_type": "first",
35
- "summary_use_proj": true,
36
  "torch_dtype": "float32",
37
  "transformers_version": "4.45.1",
38
  "type_vocab_size": 2,
39
  "use_cache": true,
40
- "vocab_size": 35000
41
  }
 
1
  {
2
+ "_name_or_path": "skt/kobert-base-v1",
3
  "architectures": [
4
+ "BertForSequenceClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
+ "author": "Heewon Jeon([email protected])",
8
  "classifier_dropout": null,
9
+ "gradient_checkpointing": false,
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
12
  "hidden_size": 768,
 
17
  },
18
  "initializer_range": 0.02,
19
  "intermediate_size": 3072,
20
+ "kobert_version": 1.0,
21
  "label2id": {
22
  "LABEL_0": 0,
23
  "LABEL_1": 1,
 
25
  },
26
  "layer_norm_eps": 1e-12,
27
  "max_position_embeddings": 512,
28
+ "model_type": "bert",
29
  "num_attention_heads": 12,
30
  "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
  "position_embedding_type": "absolute",
33
  "problem_type": "single_label_classification",
 
 
 
 
34
  "torch_dtype": "float32",
35
  "transformers_version": "4.45.1",
36
  "type_vocab_size": 2,
37
  "use_cache": true,
38
+ "vocab_size": 8002
39
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7ce6384be92f1c7f441e7d36892fd4af2d8942446e7ec62ab1300075d6eead1
3
- size 451718748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f82d7a65656d1912f2914baa8290c48b9c55044f1f05bcafb8e1ee2590b9dd0a
3
+ size 368780204
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5080a9a6edc7288b55de6518de49d2b65f4886fd9e8e3d9c400a0b0a9920cdda
3
- size 903551738
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af29b42c6fce25f4772e217cb873f5bcb9dff8b526dd14d37818bb174701e2cb
3
+ size 737676026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38bdd138de6a56a2479d7804e54bf21693ad07775ca65e9e18d4fd68b3b67efc
3
  size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e2a646ca72077b2f4e495817913734817a4d0b59c43adf947fc829113f0db1
3
  size 13990
special_tokens_map.json CHANGED
@@ -1,6 +1,14 @@
1
  {
 
2
  "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
 
 
 
 
 
 
 
4
  "pad_token": "[PAD]",
5
  "sep_token": "[SEP]",
6
  "unk_token": "[UNK]"
 
1
  {
2
+ "bos_token": "[CLS]",
3
  "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": {
6
+ "content": "[MASK]",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
  "pad_token": "[PAD]",
13
  "sep_token": "[SEP]",
14
  "unk_token": "[UNK]"
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17dc471055592d3cc9e0a5831e769246a8a001a4d27551c9ed79668173c7b407
3
+ size 371427
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "[PAD]",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
@@ -9,12 +9,12 @@
9
  "special": true
10
  },
11
  "1": {
12
- "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
- "special": true
18
  },
19
  "2": {
20
  "content": "[CLS]",
@@ -22,7 +22,7 @@
22
  "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
- "special": true
26
  },
27
  "3": {
28
  "content": "[SEP]",
@@ -30,29 +30,31 @@
30
  "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
- "special": true
34
  },
35
  "4": {
36
  "content": "[MASK]",
37
- "lstrip": false,
38
- "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  }
43
  },
44
- "clean_up_tokenization_spaces": true,
 
 
45
  "cls_token": "[CLS]",
46
- "do_basic_tokenize": true,
47
  "do_lower_case": false,
 
 
48
  "mask_token": "[MASK]",
49
- "model_max_length": 512,
50
- "never_split": null,
51
  "pad_token": "[PAD]",
 
52
  "sep_token": "[SEP]",
53
- "strip_accents": null,
54
  "timeout": 60,
55
- "tokenize_chinese_chars": true,
56
- "tokenizer_class": "ElectraTokenizer",
57
  "unk_token": "[UNK]"
58
  }
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "[UNK]",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "1": {
12
+ "content": "[PAD]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
+ "special": false
18
  },
19
  "2": {
20
  "content": "[CLS]",
 
22
  "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
+ "special": false
26
  },
27
  "3": {
28
  "content": "[SEP]",
 
30
  "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
+ "special": false
34
  },
35
  "4": {
36
  "content": "[MASK]",
37
+ "lstrip": true,
38
+ "normalized": true,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  }
43
  },
44
+ "additional_special_tokens": [],
45
+ "bos_token": "[CLS]",
46
+ "clean_up_tokenization_spaces": false,
47
  "cls_token": "[CLS]",
 
48
  "do_lower_case": false,
49
+ "eos_token": "[SEP]",
50
+ "keep_accents": false,
51
  "mask_token": "[MASK]",
52
+ "model_max_length": 1000000000000000019884624838656,
 
53
  "pad_token": "[PAD]",
54
+ "remove_space": true,
55
  "sep_token": "[SEP]",
56
+ "sp_model_kwargs": {},
57
  "timeout": 60,
58
+ "tokenizer_class": "XLNetTokenizer",
 
59
  "unk_token": "[UNK]"
60
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.9008317338451696,
3
- "best_model_checkpoint": "./checkpoints/monologg_koelectra-base-v3-discriminator\\checkpoint-4689",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
  "global_step": 4689,
@@ -10,683 +10,683 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03198976327575176,
13
- "grad_norm": 1.8438289165496826,
14
  "learning_rate": 1.9786734911494988e-05,
15
- "loss": 0.9998,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.06397952655150352,
20
- "grad_norm": 4.708126544952393,
21
  "learning_rate": 1.9573469822989978e-05,
22
- "loss": 0.8181,
23
  "step": 100
24
  },
25
  {
26
  "epoch": 0.09596928982725528,
27
- "grad_norm": 7.408482551574707,
28
  "learning_rate": 1.9360204734484968e-05,
29
- "loss": 0.6265,
30
  "step": 150
31
  },
32
  {
33
  "epoch": 0.12795905310300704,
34
- "grad_norm": 6.211757183074951,
35
  "learning_rate": 1.9146939645979955e-05,
36
- "loss": 0.5046,
37
  "step": 200
38
  },
39
  {
40
  "epoch": 0.1599488163787588,
41
- "grad_norm": 7.470839500427246,
42
  "learning_rate": 1.8933674557474945e-05,
43
- "loss": 0.5358,
44
  "step": 250
45
  },
46
  {
47
  "epoch": 0.19193857965451055,
48
- "grad_norm": 2.0797882080078125,
49
  "learning_rate": 1.872040946896993e-05,
50
- "loss": 0.483,
51
  "step": 300
52
  },
53
  {
54
  "epoch": 0.22392834293026231,
55
- "grad_norm": 14.55879020690918,
56
  "learning_rate": 1.8507144380464918e-05,
57
- "loss": 0.4462,
58
  "step": 350
59
  },
60
  {
61
  "epoch": 0.2559181062060141,
62
- "grad_norm": 3.5867443084716797,
63
  "learning_rate": 1.8293879291959908e-05,
64
- "loss": 0.4348,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 0.28790786948176583,
69
- "grad_norm": 11.995755195617676,
70
  "learning_rate": 1.8080614203454897e-05,
71
- "loss": 0.4236,
72
  "step": 450
73
  },
74
  {
75
  "epoch": 0.3198976327575176,
76
- "grad_norm": 7.211252689361572,
77
  "learning_rate": 1.7867349114949884e-05,
78
- "loss": 0.4099,
79
  "step": 500
80
  },
81
  {
82
  "epoch": 0.35188739603326935,
83
- "grad_norm": 5.884838581085205,
84
  "learning_rate": 1.7654084026444874e-05,
85
- "loss": 0.4265,
86
  "step": 550
87
  },
88
  {
89
  "epoch": 0.3838771593090211,
90
- "grad_norm": 4.443331241607666,
91
  "learning_rate": 1.744081893793986e-05,
92
- "loss": 0.3863,
93
  "step": 600
94
  },
95
  {
96
  "epoch": 0.41586692258477287,
97
- "grad_norm": 3.68717622756958,
98
  "learning_rate": 1.7227553849434847e-05,
99
- "loss": 0.4244,
100
  "step": 650
101
  },
102
  {
103
  "epoch": 0.44785668586052463,
104
- "grad_norm": 4.4258599281311035,
105
  "learning_rate": 1.7014288760929837e-05,
106
- "loss": 0.3701,
107
  "step": 700
108
  },
109
  {
110
  "epoch": 0.4798464491362764,
111
- "grad_norm": 8.484159469604492,
112
  "learning_rate": 1.6801023672424827e-05,
113
- "loss": 0.4079,
114
  "step": 750
115
  },
116
  {
117
  "epoch": 0.5118362124120281,
118
- "grad_norm": 8.976361274719238,
119
  "learning_rate": 1.6587758583919813e-05,
120
- "loss": 0.3413,
121
  "step": 800
122
  },
123
  {
124
  "epoch": 0.5438259756877799,
125
- "grad_norm": 7.113468170166016,
126
  "learning_rate": 1.6374493495414803e-05,
127
- "loss": 0.3928,
128
  "step": 850
129
  },
130
  {
131
  "epoch": 0.5758157389635317,
132
- "grad_norm": 7.242956161499023,
133
  "learning_rate": 1.616122840690979e-05,
134
- "loss": 0.4013,
135
  "step": 900
136
  },
137
  {
138
  "epoch": 0.6078055022392834,
139
- "grad_norm": 6.204492092132568,
140
  "learning_rate": 1.5947963318404776e-05,
141
- "loss": 0.3838,
142
  "step": 950
143
  },
144
  {
145
  "epoch": 0.6397952655150352,
146
- "grad_norm": 5.312352180480957,
147
  "learning_rate": 1.5734698229899766e-05,
148
- "loss": 0.3838,
149
  "step": 1000
150
  },
151
  {
152
  "epoch": 0.6717850287907869,
153
- "grad_norm": 22.331918716430664,
154
  "learning_rate": 1.5521433141394756e-05,
155
- "loss": 0.3618,
156
  "step": 1050
157
  },
158
  {
159
  "epoch": 0.7037747920665387,
160
- "grad_norm": 11.372283935546875,
161
  "learning_rate": 1.5308168052889743e-05,
162
- "loss": 0.3486,
163
  "step": 1100
164
  },
165
  {
166
  "epoch": 0.7357645553422905,
167
- "grad_norm": 31.921972274780273,
168
  "learning_rate": 1.5094902964384733e-05,
169
- "loss": 0.3807,
170
  "step": 1150
171
  },
172
  {
173
  "epoch": 0.7677543186180422,
174
- "grad_norm": 6.360013484954834,
175
  "learning_rate": 1.488163787587972e-05,
176
- "loss": 0.3903,
177
  "step": 1200
178
  },
179
  {
180
  "epoch": 0.799744081893794,
181
- "grad_norm": 5.0191330909729,
182
  "learning_rate": 1.4668372787374708e-05,
183
- "loss": 0.3477,
184
  "step": 1250
185
  },
186
  {
187
  "epoch": 0.8317338451695457,
188
- "grad_norm": 20.319231033325195,
189
  "learning_rate": 1.4455107698869698e-05,
190
- "loss": 0.3193,
191
  "step": 1300
192
  },
193
  {
194
  "epoch": 0.8637236084452975,
195
- "grad_norm": 3.2279179096221924,
196
  "learning_rate": 1.4241842610364684e-05,
197
- "loss": 0.3583,
198
  "step": 1350
199
  },
200
  {
201
  "epoch": 0.8957133717210493,
202
- "grad_norm": 5.339118480682373,
203
  "learning_rate": 1.4028577521859672e-05,
204
- "loss": 0.3343,
205
  "step": 1400
206
  },
207
  {
208
  "epoch": 0.927703134996801,
209
- "grad_norm": 3.4250569343566895,
210
  "learning_rate": 1.3815312433354662e-05,
211
- "loss": 0.3756,
212
  "step": 1450
213
  },
214
  {
215
  "epoch": 0.9596928982725528,
216
- "grad_norm": 7.723750591278076,
217
  "learning_rate": 1.3602047344849649e-05,
218
- "loss": 0.3342,
219
  "step": 1500
220
  },
221
  {
222
  "epoch": 0.9916826615483045,
223
- "grad_norm": 9.071324348449707,
224
  "learning_rate": 1.3388782256344637e-05,
225
- "loss": 0.3282,
226
  "step": 1550
227
  },
228
  {
229
  "epoch": 1.0,
230
- "eval_accuracy": 0.8848368522072937,
231
- "eval_f1": 0.8749968140479337,
232
- "eval_loss": 0.3197398781776428,
233
- "eval_runtime": 69.8877,
234
- "eval_samples_per_second": 44.729,
235
- "eval_steps_per_second": 2.804,
236
  "step": 1563
237
  },
238
  {
239
  "epoch": 1.0236724248240563,
240
- "grad_norm": 3.9245243072509766,
241
  "learning_rate": 1.3175517167839627e-05,
242
- "loss": 0.2311,
243
  "step": 1600
244
  },
245
  {
246
  "epoch": 1.055662188099808,
247
- "grad_norm": 13.951614379882812,
248
  "learning_rate": 1.2962252079334613e-05,
249
- "loss": 0.2844,
250
  "step": 1650
251
  },
252
  {
253
  "epoch": 1.0876519513755598,
254
- "grad_norm": 8.549053192138672,
255
  "learning_rate": 1.2748986990829602e-05,
256
- "loss": 0.2573,
257
  "step": 1700
258
  },
259
  {
260
  "epoch": 1.1196417146513116,
261
- "grad_norm": 7.1960835456848145,
262
  "learning_rate": 1.2535721902324592e-05,
263
- "loss": 0.2515,
264
  "step": 1750
265
  },
266
  {
267
  "epoch": 1.1516314779270633,
268
- "grad_norm": 3.4280333518981934,
269
  "learning_rate": 1.2322456813819578e-05,
270
- "loss": 0.3032,
271
  "step": 1800
272
  },
273
  {
274
  "epoch": 1.183621241202815,
275
- "grad_norm": 7.896517753601074,
276
  "learning_rate": 1.2109191725314566e-05,
277
- "loss": 0.2503,
278
  "step": 1850
279
  },
280
  {
281
  "epoch": 1.2156110044785668,
282
- "grad_norm": 3.302367687225342,
283
  "learning_rate": 1.1895926636809556e-05,
284
- "loss": 0.2815,
285
  "step": 1900
286
  },
287
  {
288
  "epoch": 1.2476007677543186,
289
- "grad_norm": 7.449910640716553,
290
  "learning_rate": 1.1682661548304543e-05,
291
- "loss": 0.2483,
292
  "step": 1950
293
  },
294
  {
295
  "epoch": 1.2795905310300704,
296
- "grad_norm": 6.277085304260254,
297
  "learning_rate": 1.1469396459799531e-05,
298
- "loss": 0.2709,
299
  "step": 2000
300
  },
301
  {
302
  "epoch": 1.3115802943058221,
303
- "grad_norm": 4.672796726226807,
304
  "learning_rate": 1.1256131371294521e-05,
305
- "loss": 0.2555,
306
  "step": 2050
307
  },
308
  {
309
  "epoch": 1.3435700575815739,
310
- "grad_norm": 11.666603088378906,
311
  "learning_rate": 1.1042866282789508e-05,
312
- "loss": 0.2381,
313
  "step": 2100
314
  },
315
  {
316
  "epoch": 1.3755598208573256,
317
- "grad_norm": 9.201951026916504,
318
  "learning_rate": 1.0829601194284496e-05,
319
- "loss": 0.2762,
320
  "step": 2150
321
  },
322
  {
323
  "epoch": 1.4075495841330774,
324
- "grad_norm": 2.2189481258392334,
325
  "learning_rate": 1.0616336105779486e-05,
326
- "loss": 0.2466,
327
  "step": 2200
328
  },
329
  {
330
  "epoch": 1.4395393474088292,
331
- "grad_norm": 8.02763557434082,
332
  "learning_rate": 1.0403071017274472e-05,
333
- "loss": 0.2556,
334
  "step": 2250
335
  },
336
  {
337
  "epoch": 1.471529110684581,
338
- "grad_norm": 2.0652530193328857,
339
  "learning_rate": 1.018980592876946e-05,
340
- "loss": 0.2789,
341
  "step": 2300
342
  },
343
  {
344
  "epoch": 1.5035188739603327,
345
- "grad_norm": 13.113646507263184,
346
  "learning_rate": 9.976540840264449e-06,
347
- "loss": 0.2235,
348
  "step": 2350
349
  },
350
  {
351
  "epoch": 1.5355086372360844,
352
- "grad_norm": 10.362279891967773,
353
  "learning_rate": 9.763275751759437e-06,
354
- "loss": 0.2482,
355
  "step": 2400
356
  },
357
  {
358
  "epoch": 1.5674984005118362,
359
- "grad_norm": 1.568915843963623,
360
  "learning_rate": 9.550010663254427e-06,
361
- "loss": 0.2622,
362
  "step": 2450
363
  },
364
  {
365
  "epoch": 1.599488163787588,
366
- "grad_norm": 10.095715522766113,
367
  "learning_rate": 9.336745574749414e-06,
368
- "loss": 0.2668,
369
  "step": 2500
370
  },
371
  {
372
  "epoch": 1.6314779270633397,
373
- "grad_norm": 12.999874114990234,
374
  "learning_rate": 9.123480486244403e-06,
375
- "loss": 0.2361,
376
  "step": 2550
377
  },
378
  {
379
  "epoch": 1.6634676903390915,
380
- "grad_norm": 7.512565612792969,
381
  "learning_rate": 8.910215397739392e-06,
382
- "loss": 0.2646,
383
  "step": 2600
384
  },
385
  {
386
  "epoch": 1.6954574536148432,
387
- "grad_norm": 6.645097732543945,
388
  "learning_rate": 8.696950309234378e-06,
389
- "loss": 0.2478,
390
  "step": 2650
391
  },
392
  {
393
  "epoch": 1.727447216890595,
394
- "grad_norm": 4.22629976272583,
395
  "learning_rate": 8.483685220729368e-06,
396
- "loss": 0.2686,
397
  "step": 2700
398
  },
399
  {
400
  "epoch": 1.7594369801663468,
401
- "grad_norm": 8.55104923248291,
402
  "learning_rate": 8.270420132224356e-06,
403
- "loss": 0.2615,
404
  "step": 2750
405
  },
406
  {
407
  "epoch": 1.7914267434420985,
408
- "grad_norm": 4.150308132171631,
409
  "learning_rate": 8.057155043719343e-06,
410
- "loss": 0.2669,
411
  "step": 2800
412
  },
413
  {
414
  "epoch": 1.8234165067178503,
415
- "grad_norm": 15.736576080322266,
416
  "learning_rate": 7.843889955214333e-06,
417
- "loss": 0.252,
418
  "step": 2850
419
  },
420
  {
421
  "epoch": 1.855406269993602,
422
- "grad_norm": 1.219308853149414,
423
  "learning_rate": 7.630624866709321e-06,
424
- "loss": 0.2619,
425
  "step": 2900
426
  },
427
  {
428
  "epoch": 1.8873960332693538,
429
- "grad_norm": 4.5792999267578125,
430
  "learning_rate": 7.4173597782043085e-06,
431
- "loss": 0.2516,
432
  "step": 2950
433
  },
434
  {
435
  "epoch": 1.9193857965451055,
436
- "grad_norm": 7.346240043640137,
437
  "learning_rate": 7.204094689699297e-06,
438
- "loss": 0.2374,
439
  "step": 3000
440
  },
441
  {
442
  "epoch": 1.9513755598208573,
443
- "grad_norm": 15.958673477172852,
444
  "learning_rate": 6.990829601194286e-06,
445
- "loss": 0.2568,
446
  "step": 3050
447
  },
448
  {
449
  "epoch": 1.983365323096609,
450
- "grad_norm": 8.143491744995117,
451
  "learning_rate": 6.777564512689273e-06,
452
- "loss": 0.2196,
453
  "step": 3100
454
  },
455
  {
456
  "epoch": 2.0,
457
- "eval_accuracy": 0.8902751119641714,
458
- "eval_f1": 0.8821468217693207,
459
- "eval_loss": 0.35885584354400635,
460
- "eval_runtime": 70.4611,
461
- "eval_samples_per_second": 44.365,
462
- "eval_steps_per_second": 2.782,
463
  "step": 3126
464
  },
465
  {
466
  "epoch": 2.015355086372361,
467
- "grad_norm": 2.4161245822906494,
468
  "learning_rate": 6.5642994241842614e-06,
469
- "loss": 0.211,
470
  "step": 3150
471
  },
472
  {
473
  "epoch": 2.0473448496481126,
474
- "grad_norm": 6.2580156326293945,
475
  "learning_rate": 6.3510343356792505e-06,
476
- "loss": 0.1819,
477
  "step": 3200
478
  },
479
  {
480
  "epoch": 2.0793346129238643,
481
- "grad_norm": 17.149974822998047,
482
  "learning_rate": 6.137769247174238e-06,
483
- "loss": 0.188,
484
  "step": 3250
485
  },
486
  {
487
  "epoch": 2.111324376199616,
488
- "grad_norm": 4.2034382820129395,
489
  "learning_rate": 5.924504158669226e-06,
490
- "loss": 0.1653,
491
  "step": 3300
492
  },
493
  {
494
  "epoch": 2.143314139475368,
495
- "grad_norm": 7.357328414916992,
496
  "learning_rate": 5.711239070164215e-06,
497
- "loss": 0.1748,
498
  "step": 3350
499
  },
500
  {
501
  "epoch": 2.1753039027511196,
502
- "grad_norm": 13.677566528320312,
503
  "learning_rate": 5.497973981659203e-06,
504
- "loss": 0.2177,
505
  "step": 3400
506
  },
507
  {
508
  "epoch": 2.2072936660268714,
509
- "grad_norm": 8.67300033569336,
510
  "learning_rate": 5.284708893154191e-06,
511
- "loss": 0.1571,
512
  "step": 3450
513
  },
514
  {
515
  "epoch": 2.239283429302623,
516
- "grad_norm": 22.893770217895508,
517
  "learning_rate": 5.07144380464918e-06,
518
- "loss": 0.2009,
519
  "step": 3500
520
  },
521
  {
522
  "epoch": 2.271273192578375,
523
- "grad_norm": 3.602328300476074,
524
  "learning_rate": 4.858178716144167e-06,
525
- "loss": 0.1329,
526
  "step": 3550
527
  },
528
  {
529
  "epoch": 2.3032629558541267,
530
- "grad_norm": 18.8084659576416,
531
  "learning_rate": 4.644913627639156e-06,
532
- "loss": 0.1814,
533
  "step": 3600
534
  },
535
  {
536
  "epoch": 2.3352527191298784,
537
- "grad_norm": 0.4535435736179352,
538
  "learning_rate": 4.431648539134144e-06,
539
- "loss": 0.2069,
540
  "step": 3650
541
  },
542
  {
543
  "epoch": 2.36724248240563,
544
- "grad_norm": 11.971237182617188,
545
  "learning_rate": 4.218383450629132e-06,
546
- "loss": 0.188,
547
  "step": 3700
548
  },
549
  {
550
  "epoch": 2.399232245681382,
551
- "grad_norm": 5.598160266876221,
552
  "learning_rate": 4.005118362124121e-06,
553
- "loss": 0.1943,
554
  "step": 3750
555
  },
556
  {
557
  "epoch": 2.4312220089571337,
558
- "grad_norm": 11.97499942779541,
559
  "learning_rate": 3.791853273619109e-06,
560
- "loss": 0.2155,
561
  "step": 3800
562
  },
563
  {
564
  "epoch": 2.4632117722328855,
565
- "grad_norm": 9.149175643920898,
566
  "learning_rate": 3.5785881851140968e-06,
567
- "loss": 0.2025,
568
  "step": 3850
569
  },
570
  {
571
  "epoch": 2.495201535508637,
572
- "grad_norm": 7.965933799743652,
573
  "learning_rate": 3.3653230966090854e-06,
574
- "loss": 0.2121,
575
  "step": 3900
576
  },
577
  {
578
  "epoch": 2.527191298784389,
579
- "grad_norm": 9.397235870361328,
580
  "learning_rate": 3.1520580081040737e-06,
581
- "loss": 0.1724,
582
  "step": 3950
583
  },
584
  {
585
  "epoch": 2.5591810620601407,
586
- "grad_norm": 14.580031394958496,
587
  "learning_rate": 2.9387929195990615e-06,
588
- "loss": 0.1636,
589
  "step": 4000
590
  },
591
  {
592
  "epoch": 2.5911708253358925,
593
- "grad_norm": 2.2896785736083984,
594
  "learning_rate": 2.72552783109405e-06,
595
- "loss": 0.1603,
596
  "step": 4050
597
  },
598
  {
599
  "epoch": 2.6231605886116443,
600
- "grad_norm": 22.936344146728516,
601
  "learning_rate": 2.5122627425890384e-06,
602
- "loss": 0.1944,
603
  "step": 4100
604
  },
605
  {
606
  "epoch": 2.655150351887396,
607
- "grad_norm": 37.436187744140625,
608
  "learning_rate": 2.2989976540840266e-06,
609
- "loss": 0.2059,
610
  "step": 4150
611
  },
612
  {
613
  "epoch": 2.6871401151631478,
614
- "grad_norm": 26.6770076751709,
615
  "learning_rate": 2.085732565579015e-06,
616
- "loss": 0.1448,
617
  "step": 4200
618
  },
619
  {
620
  "epoch": 2.7191298784388995,
621
- "grad_norm": 20.561311721801758,
622
  "learning_rate": 1.872467477074003e-06,
623
- "loss": 0.2094,
624
  "step": 4250
625
  },
626
  {
627
  "epoch": 2.7511196417146513,
628
- "grad_norm": 4.114218711853027,
629
  "learning_rate": 1.6592023885689915e-06,
630
- "loss": 0.1682,
631
  "step": 4300
632
  },
633
  {
634
  "epoch": 2.783109404990403,
635
- "grad_norm": 27.9166202545166,
636
  "learning_rate": 1.4459373000639796e-06,
637
- "loss": 0.1931,
638
  "step": 4350
639
  },
640
  {
641
  "epoch": 2.815099168266155,
642
- "grad_norm": 13.576592445373535,
643
  "learning_rate": 1.2326722115589678e-06,
644
- "loss": 0.1989,
645
  "step": 4400
646
  },
647
  {
648
  "epoch": 2.8470889315419066,
649
- "grad_norm": 4.428896427154541,
650
  "learning_rate": 1.019407123053956e-06,
651
- "loss": 0.18,
652
  "step": 4450
653
  },
654
  {
655
  "epoch": 2.8790786948176583,
656
- "grad_norm": 10.087977409362793,
657
  "learning_rate": 8.061420345489445e-07,
658
- "loss": 0.2102,
659
  "step": 4500
660
  },
661
  {
662
  "epoch": 2.91106845809341,
663
- "grad_norm": 16.517681121826172,
664
  "learning_rate": 5.928769460439326e-07,
665
- "loss": 0.1662,
666
  "step": 4550
667
  },
668
  {
669
  "epoch": 2.943058221369162,
670
- "grad_norm": 15.329133987426758,
671
  "learning_rate": 3.796118575389209e-07,
672
- "loss": 0.1997,
673
  "step": 4600
674
  },
675
  {
676
  "epoch": 2.9750479846449136,
677
- "grad_norm": 18.751319885253906,
678
  "learning_rate": 1.6634676903390917e-07,
679
- "loss": 0.1924,
680
  "step": 4650
681
  },
682
  {
683
  "epoch": 3.0,
684
- "eval_accuracy": 0.9008317338451696,
685
- "eval_f1": 0.892446816336566,
686
- "eval_loss": 0.3720133900642395,
687
- "eval_runtime": 71.8708,
688
- "eval_samples_per_second": 43.495,
689
- "eval_steps_per_second": 2.727,
690
  "step": 4689
691
  }
692
  ],
@@ -707,7 +707,7 @@
707
  "attributes": {}
708
  }
709
  },
710
- "total_flos": 837402132857850.0,
711
  "train_batch_size": 16,
712
  "trial_name": null,
713
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6842610364683301,
3
+ "best_model_checkpoint": "./checkpoints/skt_kobert-base-v1\\checkpoint-3126",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
  "global_step": 4689,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03198976327575176,
13
+ "grad_norm": 2.0306639671325684,
14
  "learning_rate": 1.9786734911494988e-05,
15
+ "loss": 1.013,
16
  "step": 50
17
  },
18
  {
19
  "epoch": 0.06397952655150352,
20
+ "grad_norm": 2.591745138168335,
21
  "learning_rate": 1.9573469822989978e-05,
22
+ "loss": 0.9463,
23
  "step": 100
24
  },
25
  {
26
  "epoch": 0.09596928982725528,
27
+ "grad_norm": 2.405771017074585,
28
  "learning_rate": 1.9360204734484968e-05,
29
+ "loss": 0.912,
30
  "step": 150
31
  },
32
  {
33
  "epoch": 0.12795905310300704,
34
+ "grad_norm": 3.699129104614258,
35
  "learning_rate": 1.9146939645979955e-05,
36
+ "loss": 0.8349,
37
  "step": 200
38
  },
39
  {
40
  "epoch": 0.1599488163787588,
41
+ "grad_norm": 3.0695576667785645,
42
  "learning_rate": 1.8933674557474945e-05,
43
+ "loss": 0.8967,
44
  "step": 250
45
  },
46
  {
47
  "epoch": 0.19193857965451055,
48
+ "grad_norm": 2.762423038482666,
49
  "learning_rate": 1.872040946896993e-05,
50
+ "loss": 0.8609,
51
  "step": 300
52
  },
53
  {
54
  "epoch": 0.22392834293026231,
55
+ "grad_norm": 5.626348495483398,
56
  "learning_rate": 1.8507144380464918e-05,
57
+ "loss": 0.8619,
58
  "step": 350
59
  },
60
  {
61
  "epoch": 0.2559181062060141,
62
+ "grad_norm": 2.3783187866210938,
63
  "learning_rate": 1.8293879291959908e-05,
64
+ "loss": 0.847,
65
  "step": 400
66
  },
67
  {
68
  "epoch": 0.28790786948176583,
69
+ "grad_norm": 2.9742441177368164,
70
  "learning_rate": 1.8080614203454897e-05,
71
+ "loss": 0.8161,
72
  "step": 450
73
  },
74
  {
75
  "epoch": 0.3198976327575176,
76
+ "grad_norm": 1.8092529773712158,
77
  "learning_rate": 1.7867349114949884e-05,
78
+ "loss": 0.7752,
79
  "step": 500
80
  },
81
  {
82
  "epoch": 0.35188739603326935,
83
+ "grad_norm": 2.72413969039917,
84
  "learning_rate": 1.7654084026444874e-05,
85
+ "loss": 0.8281,
86
  "step": 550
87
  },
88
  {
89
  "epoch": 0.3838771593090211,
90
+ "grad_norm": 3.9095618724823,
91
  "learning_rate": 1.744081893793986e-05,
92
+ "loss": 0.7984,
93
  "step": 600
94
  },
95
  {
96
  "epoch": 0.41586692258477287,
97
+ "grad_norm": 3.821829319000244,
98
  "learning_rate": 1.7227553849434847e-05,
99
+ "loss": 0.845,
100
  "step": 650
101
  },
102
  {
103
  "epoch": 0.44785668586052463,
104
+ "grad_norm": 6.882596492767334,
105
  "learning_rate": 1.7014288760929837e-05,
106
+ "loss": 0.8218,
107
  "step": 700
108
  },
109
  {
110
  "epoch": 0.4798464491362764,
111
+ "grad_norm": 3.819617986679077,
112
  "learning_rate": 1.6801023672424827e-05,
113
+ "loss": 0.8316,
114
  "step": 750
115
  },
116
  {
117
  "epoch": 0.5118362124120281,
118
+ "grad_norm": 2.8060302734375,
119
  "learning_rate": 1.6587758583919813e-05,
120
+ "loss": 0.8009,
121
  "step": 800
122
  },
123
  {
124
  "epoch": 0.5438259756877799,
125
+ "grad_norm": 6.112952709197998,
126
  "learning_rate": 1.6374493495414803e-05,
127
+ "loss": 0.7959,
128
  "step": 850
129
  },
130
  {
131
  "epoch": 0.5758157389635317,
132
+ "grad_norm": 6.098555564880371,
133
  "learning_rate": 1.616122840690979e-05,
134
+ "loss": 0.822,
135
  "step": 900
136
  },
137
  {
138
  "epoch": 0.6078055022392834,
139
+ "grad_norm": 8.28978157043457,
140
  "learning_rate": 1.5947963318404776e-05,
141
+ "loss": 0.7781,
142
  "step": 950
143
  },
144
  {
145
  "epoch": 0.6397952655150352,
146
+ "grad_norm": 3.776757001876831,
147
  "learning_rate": 1.5734698229899766e-05,
148
+ "loss": 0.8017,
149
  "step": 1000
150
  },
151
  {
152
  "epoch": 0.6717850287907869,
153
+ "grad_norm": 2.7059381008148193,
154
  "learning_rate": 1.5521433141394756e-05,
155
+ "loss": 0.7822,
156
  "step": 1050
157
  },
158
  {
159
  "epoch": 0.7037747920665387,
160
+ "grad_norm": 6.732589244842529,
161
  "learning_rate": 1.5308168052889743e-05,
162
+ "loss": 0.7965,
163
  "step": 1100
164
  },
165
  {
166
  "epoch": 0.7357645553422905,
167
+ "grad_norm": 8.803948402404785,
168
  "learning_rate": 1.5094902964384733e-05,
169
+ "loss": 0.772,
170
  "step": 1150
171
  },
172
  {
173
  "epoch": 0.7677543186180422,
174
+ "grad_norm": 3.7579758167266846,
175
  "learning_rate": 1.488163787587972e-05,
176
+ "loss": 0.877,
177
  "step": 1200
178
  },
179
  {
180
  "epoch": 0.799744081893794,
181
+ "grad_norm": 3.5114402770996094,
182
  "learning_rate": 1.4668372787374708e-05,
183
+ "loss": 0.7834,
184
  "step": 1250
185
  },
186
  {
187
  "epoch": 0.8317338451695457,
188
+ "grad_norm": 3.5781891345977783,
189
  "learning_rate": 1.4455107698869698e-05,
190
+ "loss": 0.7746,
191
  "step": 1300
192
  },
193
  {
194
  "epoch": 0.8637236084452975,
195
+ "grad_norm": 4.3330278396606445,
196
  "learning_rate": 1.4241842610364684e-05,
197
+ "loss": 0.8138,
198
  "step": 1350
199
  },
200
  {
201
  "epoch": 0.8957133717210493,
202
+ "grad_norm": 3.0052335262298584,
203
  "learning_rate": 1.4028577521859672e-05,
204
+ "loss": 0.7651,
205
  "step": 1400
206
  },
207
  {
208
  "epoch": 0.927703134996801,
209
+ "grad_norm": 3.8835694789886475,
210
  "learning_rate": 1.3815312433354662e-05,
211
+ "loss": 0.7984,
212
  "step": 1450
213
  },
214
  {
215
  "epoch": 0.9596928982725528,
216
+ "grad_norm": 5.093181133270264,
217
  "learning_rate": 1.3602047344849649e-05,
218
+ "loss": 0.7943,
219
  "step": 1500
220
  },
221
  {
222
  "epoch": 0.9916826615483045,
223
+ "grad_norm": 2.615415096282959,
224
  "learning_rate": 1.3388782256344637e-05,
225
+ "loss": 0.7568,
226
  "step": 1550
227
  },
228
  {
229
  "epoch": 1.0,
230
+ "eval_accuracy": 0.673064619321817,
231
+ "eval_f1": 0.5774602220992764,
232
+ "eval_loss": 0.755181610584259,
233
+ "eval_runtime": 146.1897,
234
+ "eval_samples_per_second": 21.383,
235
+ "eval_steps_per_second": 1.341,
236
  "step": 1563
237
  },
238
  {
239
  "epoch": 1.0236724248240563,
240
+ "grad_norm": 3.9289746284484863,
241
  "learning_rate": 1.3175517167839627e-05,
242
+ "loss": 0.7968,
243
  "step": 1600
244
  },
245
  {
246
  "epoch": 1.055662188099808,
247
+ "grad_norm": 3.1284472942352295,
248
  "learning_rate": 1.2962252079334613e-05,
249
+ "loss": 0.7693,
250
  "step": 1650
251
  },
252
  {
253
  "epoch": 1.0876519513755598,
254
+ "grad_norm": 6.5303544998168945,
255
  "learning_rate": 1.2748986990829602e-05,
256
+ "loss": 0.7763,
257
  "step": 1700
258
  },
259
  {
260
  "epoch": 1.1196417146513116,
261
+ "grad_norm": 4.240009784698486,
262
  "learning_rate": 1.2535721902324592e-05,
263
+ "loss": 0.7499,
264
  "step": 1750
265
  },
266
  {
267
  "epoch": 1.1516314779270633,
268
+ "grad_norm": 3.231553554534912,
269
  "learning_rate": 1.2322456813819578e-05,
270
+ "loss": 0.8081,
271
  "step": 1800
272
  },
273
  {
274
  "epoch": 1.183621241202815,
275
+ "grad_norm": 3.248556613922119,
276
  "learning_rate": 1.2109191725314566e-05,
277
+ "loss": 0.7372,
278
  "step": 1850
279
  },
280
  {
281
  "epoch": 1.2156110044785668,
282
+ "grad_norm": 4.725924968719482,
283
  "learning_rate": 1.1895926636809556e-05,
284
+ "loss": 0.7564,
285
  "step": 1900
286
  },
287
  {
288
  "epoch": 1.2476007677543186,
289
+ "grad_norm": 7.154216766357422,
290
  "learning_rate": 1.1682661548304543e-05,
291
+ "loss": 0.7358,
292
  "step": 1950
293
  },
294
  {
295
  "epoch": 1.2795905310300704,
296
+ "grad_norm": 5.529367923736572,
297
  "learning_rate": 1.1469396459799531e-05,
298
+ "loss": 0.7921,
299
  "step": 2000
300
  },
301
  {
302
  "epoch": 1.3115802943058221,
303
+ "grad_norm": 2.6566271781921387,
304
  "learning_rate": 1.1256131371294521e-05,
305
+ "loss": 0.7684,
306
  "step": 2050
307
  },
308
  {
309
  "epoch": 1.3435700575815739,
310
+ "grad_norm": 3.7145371437072754,
311
  "learning_rate": 1.1042866282789508e-05,
312
+ "loss": 0.7612,
313
  "step": 2100
314
  },
315
  {
316
  "epoch": 1.3755598208573256,
317
+ "grad_norm": 4.2554707527160645,
318
  "learning_rate": 1.0829601194284496e-05,
319
+ "loss": 0.7622,
320
  "step": 2150
321
  },
322
  {
323
  "epoch": 1.4075495841330774,
324
+ "grad_norm": 3.207913398742676,
325
  "learning_rate": 1.0616336105779486e-05,
326
+ "loss": 0.7331,
327
  "step": 2200
328
  },
329
  {
330
  "epoch": 1.4395393474088292,
331
+ "grad_norm": 7.0538153648376465,
332
  "learning_rate": 1.0403071017274472e-05,
333
+ "loss": 0.7578,
334
  "step": 2250
335
  },
336
  {
337
  "epoch": 1.471529110684581,
338
+ "grad_norm": 4.054543972015381,
339
  "learning_rate": 1.018980592876946e-05,
340
+ "loss": 0.7711,
341
  "step": 2300
342
  },
343
  {
344
  "epoch": 1.5035188739603327,
345
+ "grad_norm": 5.265899658203125,
346
  "learning_rate": 9.976540840264449e-06,
347
+ "loss": 0.742,
348
  "step": 2350
349
  },
350
  {
351
  "epoch": 1.5355086372360844,
352
+ "grad_norm": 5.515292167663574,
353
  "learning_rate": 9.763275751759437e-06,
354
+ "loss": 0.763,
355
  "step": 2400
356
  },
357
  {
358
  "epoch": 1.5674984005118362,
359
+ "grad_norm": 3.4068896770477295,
360
  "learning_rate": 9.550010663254427e-06,
361
+ "loss": 0.7787,
362
  "step": 2450
363
  },
364
  {
365
  "epoch": 1.599488163787588,
366
+ "grad_norm": 3.8786113262176514,
367
  "learning_rate": 9.336745574749414e-06,
368
+ "loss": 0.7665,
369
  "step": 2500
370
  },
371
  {
372
  "epoch": 1.6314779270633397,
373
+ "grad_norm": 7.389112949371338,
374
  "learning_rate": 9.123480486244403e-06,
375
+ "loss": 0.7324,
376
  "step": 2550
377
  },
378
  {
379
  "epoch": 1.6634676903390915,
380
+ "grad_norm": 3.790463924407959,
381
  "learning_rate": 8.910215397739392e-06,
382
+ "loss": 0.7479,
383
  "step": 2600
384
  },
385
  {
386
  "epoch": 1.6954574536148432,
387
+ "grad_norm": 3.296069622039795,
388
  "learning_rate": 8.696950309234378e-06,
389
+ "loss": 0.6861,
390
  "step": 2650
391
  },
392
  {
393
  "epoch": 1.727447216890595,
394
+ "grad_norm": 3.9884438514709473,
395
  "learning_rate": 8.483685220729368e-06,
396
+ "loss": 0.7621,
397
  "step": 2700
398
  },
399
  {
400
  "epoch": 1.7594369801663468,
401
+ "grad_norm": 4.0090532302856445,
402
  "learning_rate": 8.270420132224356e-06,
403
+ "loss": 0.7455,
404
  "step": 2750
405
  },
406
  {
407
  "epoch": 1.7914267434420985,
408
+ "grad_norm": 5.72037935256958,
409
  "learning_rate": 8.057155043719343e-06,
410
+ "loss": 0.7615,
411
  "step": 2800
412
  },
413
  {
414
  "epoch": 1.8234165067178503,
415
+ "grad_norm": 5.367597579956055,
416
  "learning_rate": 7.843889955214333e-06,
417
+ "loss": 0.7478,
418
  "step": 2850
419
  },
420
  {
421
  "epoch": 1.855406269993602,
422
+ "grad_norm": 4.492830753326416,
423
  "learning_rate": 7.630624866709321e-06,
424
+ "loss": 0.7644,
425
  "step": 2900
426
  },
427
  {
428
  "epoch": 1.8873960332693538,
429
+ "grad_norm": 4.395884990692139,
430
  "learning_rate": 7.4173597782043085e-06,
431
+ "loss": 0.726,
432
  "step": 2950
433
  },
434
  {
435
  "epoch": 1.9193857965451055,
436
+ "grad_norm": 5.388432025909424,
437
  "learning_rate": 7.204094689699297e-06,
438
+ "loss": 0.7305,
439
  "step": 3000
440
  },
441
  {
442
  "epoch": 1.9513755598208573,
443
+ "grad_norm": 3.6554412841796875,
444
  "learning_rate": 6.990829601194286e-06,
445
+ "loss": 0.7312,
446
  "step": 3050
447
  },
448
  {
449
  "epoch": 1.983365323096609,
450
+ "grad_norm": 2.9544596672058105,
451
  "learning_rate": 6.777564512689273e-06,
452
+ "loss": 0.7535,
453
  "step": 3100
454
  },
455
  {
456
  "epoch": 2.0,
457
+ "eval_accuracy": 0.6842610364683301,
458
+ "eval_f1": 0.5914024960556081,
459
+ "eval_loss": 0.71613609790802,
460
+ "eval_runtime": 145.4665,
461
+ "eval_samples_per_second": 21.489,
462
+ "eval_steps_per_second": 1.347,
463
  "step": 3126
464
  },
465
  {
466
  "epoch": 2.015355086372361,
467
+ "grad_norm": 2.3898167610168457,
468
  "learning_rate": 6.5642994241842614e-06,
469
+ "loss": 0.7209,
470
  "step": 3150
471
  },
472
  {
473
  "epoch": 2.0473448496481126,
474
+ "grad_norm": 3.216728448867798,
475
  "learning_rate": 6.3510343356792505e-06,
476
+ "loss": 0.7328,
477
  "step": 3200
478
  },
479
  {
480
  "epoch": 2.0793346129238643,
481
+ "grad_norm": 5.2315673828125,
482
  "learning_rate": 6.137769247174238e-06,
483
+ "loss": 0.6979,
484
  "step": 3250
485
  },
486
  {
487
  "epoch": 2.111324376199616,
488
+ "grad_norm": 5.086841583251953,
489
  "learning_rate": 5.924504158669226e-06,
490
+ "loss": 0.7401,
491
  "step": 3300
492
  },
493
  {
494
  "epoch": 2.143314139475368,
495
+ "grad_norm": 3.975651502609253,
496
  "learning_rate": 5.711239070164215e-06,
497
+ "loss": 0.7695,
498
  "step": 3350
499
  },
500
  {
501
  "epoch": 2.1753039027511196,
502
+ "grad_norm": 5.14952278137207,
503
  "learning_rate": 5.497973981659203e-06,
504
+ "loss": 0.7259,
505
  "step": 3400
506
  },
507
  {
508
  "epoch": 2.2072936660268714,
509
+ "grad_norm": 10.398428916931152,
510
  "learning_rate": 5.284708893154191e-06,
511
+ "loss": 0.7099,
512
  "step": 3450
513
  },
514
  {
515
  "epoch": 2.239283429302623,
516
+ "grad_norm": 5.051026821136475,
517
  "learning_rate": 5.07144380464918e-06,
518
+ "loss": 0.7256,
519
  "step": 3500
520
  },
521
  {
522
  "epoch": 2.271273192578375,
523
+ "grad_norm": 5.900022506713867,
524
  "learning_rate": 4.858178716144167e-06,
525
+ "loss": 0.6803,
526
  "step": 3550
527
  },
528
  {
529
  "epoch": 2.3032629558541267,
530
+ "grad_norm": 3.650153636932373,
531
  "learning_rate": 4.644913627639156e-06,
532
+ "loss": 0.6936,
533
  "step": 3600
534
  },
535
  {
536
  "epoch": 2.3352527191298784,
537
+ "grad_norm": 3.192567825317383,
538
  "learning_rate": 4.431648539134144e-06,
539
+ "loss": 0.7388,
540
  "step": 3650
541
  },
542
  {
543
  "epoch": 2.36724248240563,
544
+ "grad_norm": 2.792283058166504,
545
  "learning_rate": 4.218383450629132e-06,
546
+ "loss": 0.7199,
547
  "step": 3700
548
  },
549
  {
550
  "epoch": 2.399232245681382,
551
+ "grad_norm": 4.723822593688965,
552
  "learning_rate": 4.005118362124121e-06,
553
+ "loss": 0.7536,
554
  "step": 3750
555
  },
556
  {
557
  "epoch": 2.4312220089571337,
558
+ "grad_norm": 6.793713092803955,
559
  "learning_rate": 3.791853273619109e-06,
560
+ "loss": 0.7043,
561
  "step": 3800
562
  },
563
  {
564
  "epoch": 2.4632117722328855,
565
+ "grad_norm": 4.855713367462158,
566
  "learning_rate": 3.5785881851140968e-06,
567
+ "loss": 0.7587,
568
  "step": 3850
569
  },
570
  {
571
  "epoch": 2.495201535508637,
572
+ "grad_norm": 3.8809781074523926,
573
  "learning_rate": 3.3653230966090854e-06,
574
+ "loss": 0.7007,
575
  "step": 3900
576
  },
577
  {
578
  "epoch": 2.527191298784389,
579
+ "grad_norm": 3.591956377029419,
580
  "learning_rate": 3.1520580081040737e-06,
581
+ "loss": 0.6805,
582
  "step": 3950
583
  },
584
  {
585
  "epoch": 2.5591810620601407,
586
+ "grad_norm": 4.135153293609619,
587
  "learning_rate": 2.9387929195990615e-06,
588
+ "loss": 0.7628,
589
  "step": 4000
590
  },
591
  {
592
  "epoch": 2.5911708253358925,
593
+ "grad_norm": 4.887539386749268,
594
  "learning_rate": 2.72552783109405e-06,
595
+ "loss": 0.7203,
596
  "step": 4050
597
  },
598
  {
599
  "epoch": 2.6231605886116443,
600
+ "grad_norm": 5.000283241271973,
601
  "learning_rate": 2.5122627425890384e-06,
602
+ "loss": 0.709,
603
  "step": 4100
604
  },
605
  {
606
  "epoch": 2.655150351887396,
607
+ "grad_norm": 5.802310943603516,
608
  "learning_rate": 2.2989976540840266e-06,
609
+ "loss": 0.6938,
610
  "step": 4150
611
  },
612
  {
613
  "epoch": 2.6871401151631478,
614
+ "grad_norm": 2.909243583679199,
615
  "learning_rate": 2.085732565579015e-06,
616
+ "loss": 0.7236,
617
  "step": 4200
618
  },
619
  {
620
  "epoch": 2.7191298784388995,
621
+ "grad_norm": 3.879182815551758,
622
  "learning_rate": 1.872467477074003e-06,
623
+ "loss": 0.7327,
624
  "step": 4250
625
  },
626
  {
627
  "epoch": 2.7511196417146513,
628
+ "grad_norm": 5.3820295333862305,
629
  "learning_rate": 1.6592023885689915e-06,
630
+ "loss": 0.6762,
631
  "step": 4300
632
  },
633
  {
634
  "epoch": 2.783109404990403,
635
+ "grad_norm": 4.583397388458252,
636
  "learning_rate": 1.4459373000639796e-06,
637
+ "loss": 0.7111,
638
  "step": 4350
639
  },
640
  {
641
  "epoch": 2.815099168266155,
642
+ "grad_norm": 3.990649938583374,
643
  "learning_rate": 1.2326722115589678e-06,
644
+ "loss": 0.7177,
645
  "step": 4400
646
  },
647
  {
648
  "epoch": 2.8470889315419066,
649
+ "grad_norm": 5.5702433586120605,
650
  "learning_rate": 1.019407123053956e-06,
651
+ "loss": 0.7068,
652
  "step": 4450
653
  },
654
  {
655
  "epoch": 2.8790786948176583,
656
+ "grad_norm": 4.175040245056152,
657
  "learning_rate": 8.061420345489445e-07,
658
+ "loss": 0.7092,
659
  "step": 4500
660
  },
661
  {
662
  "epoch": 2.91106845809341,
663
+ "grad_norm": 4.912069797515869,
664
  "learning_rate": 5.928769460439326e-07,
665
+ "loss": 0.7022,
666
  "step": 4550
667
  },
668
  {
669
  "epoch": 2.943058221369162,
670
+ "grad_norm": 4.595305442810059,
671
  "learning_rate": 3.796118575389209e-07,
672
+ "loss": 0.7228,
673
  "step": 4600
674
  },
675
  {
676
  "epoch": 2.9750479846449136,
677
+ "grad_norm": 3.835669755935669,
678
  "learning_rate": 1.6634676903390917e-07,
679
+ "loss": 0.6946,
680
  "step": 4650
681
  },
682
  {
683
  "epoch": 3.0,
684
+ "eval_accuracy": 0.6829814459373,
685
+ "eval_f1": 0.5863851194804445,
686
+ "eval_loss": 0.7020198106765747,
687
+ "eval_runtime": 144.2948,
688
+ "eval_samples_per_second": 21.664,
689
+ "eval_steps_per_second": 1.358,
690
  "step": 4689
691
  }
692
  ],
 
707
  "attributes": {}
708
  }
709
  },
710
+ "total_flos": 1757912647073562.0,
711
  "train_batch_size": 16,
712
  "trial_name": null,
713
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80719dde8ee12368c548b9d0fd820c908d79ccc5612eb348afb50e76be1caccc
3
- size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b1a2694e34f5fe6ee1f92624820bc26f69ac4d9b0f9439ad2d5b4381e756508
3
+ size 5176