Training in progress, epoch 1
Browse files- logs/events.out.tfevents.1709875287.6c1c016b0e3e.7263.10 +2 -2
- logs/events.out.tfevents.1709876067.6c1c016b0e3e.7263.11 +3 -0
- model.safetensors +1 -1
- run-10/checkpoint-288/model.safetensors +1 -1
- run-10/checkpoint-288/optimizer.pt +1 -1
- run-10/checkpoint-288/rng_state.pth +1 -1
- run-10/checkpoint-288/scheduler.pt +1 -1
- run-10/checkpoint-288/trainer_state.json +104 -47
- run-10/checkpoint-288/training_args.bin +1 -1
- run-10/checkpoint-336/config.json +34 -0
- run-10/checkpoint-336/model.safetensors +3 -0
- run-10/checkpoint-336/optimizer.pt +3 -0
- run-10/checkpoint-336/rng_state.pth +3 -0
- run-10/checkpoint-336/scheduler.pt +3 -0
- run-10/checkpoint-336/special_tokens_map.json +7 -0
- run-10/checkpoint-336/tokenizer.json +0 -0
- run-10/checkpoint-336/tokenizer_config.json +57 -0
- run-10/checkpoint-336/trainer_state.json +159 -0
- run-10/checkpoint-336/training_args.bin +3 -0
- run-10/checkpoint-336/vocab.txt +0 -0
- run-11/checkpoint-48/config.json +34 -0
- run-11/checkpoint-48/model.safetensors +3 -0
- run-11/checkpoint-48/optimizer.pt +3 -0
- run-11/checkpoint-48/rng_state.pth +3 -0
- run-11/checkpoint-48/scheduler.pt +3 -0
- run-11/checkpoint-48/special_tokens_map.json +7 -0
- run-11/checkpoint-48/tokenizer.json +0 -0
- run-11/checkpoint-48/tokenizer_config.json +57 -0
- run-11/checkpoint-48/trainer_state.json +45 -0
- run-11/checkpoint-48/training_args.bin +3 -0
- run-11/checkpoint-48/vocab.txt +0 -0
- training_args.bin +1 -1
logs/events.out.tfevents.1709875287.6c1c016b0e3e.7263.10
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4327b87919dbbe94f8e79a4891c578c3a1b7e1ea3caa619c6a632bd7c54f00b9
|
3 |
+
size 9753
|
logs/events.out.tfevents.1709876067.6c1c016b0e3e.7263.11
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e6d98aed345117a010060b30bf0c50ede7b829e269a95a23a3f2d664634f193
|
3 |
+
size 5314
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 17549312
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:361284d6cbbbf0747d1d2496b9712460a833e47055619c1ce3d78588bf871550
|
3 |
size 17549312
|
run-10/checkpoint-288/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 17549312
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:16fde600bac51204144e3af5549aadff6874a092e8bba58ee66b59cf759a6f1c
|
3 |
size 17549312
|
run-10/checkpoint-288/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 35122746
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2537703686efc188dccb3cb6511447099a3ded3581554b578e100ed57db6d64d
|
3 |
size 35122746
|
run-10/checkpoint-288/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14054
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dcc277e7d1d83522a853f43efa19a0dd29e8896ab414cd166ac88116bce74f64
|
3 |
size 14054
|
run-10/checkpoint-288/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:733baf3bbf0bc4e04a501520755b8d276b20695f85c3038a12c240464c860b0c
|
3 |
size 1064
|
run-10/checkpoint-288/trainer_state.json
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
{
|
2 |
-
"best_metric": 0.
|
3 |
"best_model_checkpoint": "tiny-bert-sst2-distilled/run-10/checkpoint-288",
|
4 |
-
"epoch":
|
5 |
"eval_steps": 500,
|
6 |
"global_step": 288,
|
7 |
"is_hyper_param_search": true,
|
@@ -10,74 +10,131 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
-
"grad_norm": 1.
|
14 |
-
"learning_rate": 0.
|
15 |
-
"loss": 0.
|
16 |
-
"step":
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_f1": 0.
|
22 |
-
"eval_loss": 0.
|
23 |
-
"eval_precision": 0.
|
24 |
-
"eval_recall": 0.
|
25 |
-
"eval_runtime":
|
26 |
-
"eval_samples_per_second":
|
27 |
-
"eval_steps_per_second":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
"step": 96
|
29 |
},
|
30 |
{
|
31 |
"epoch": 2.0,
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"step": 192
|
36 |
},
|
37 |
{
|
38 |
-
"epoch":
|
39 |
-
"eval_accuracy": 0.
|
40 |
-
"eval_f1": 0.
|
41 |
-
"eval_loss": 0.
|
42 |
-
"eval_precision": 0.
|
43 |
-
"eval_recall": 0.
|
44 |
-
"eval_runtime":
|
45 |
-
"eval_samples_per_second":
|
46 |
-
"eval_steps_per_second":
|
47 |
"step": 192
|
48 |
},
|
49 |
{
|
50 |
-
"epoch":
|
51 |
-
"grad_norm":
|
52 |
-
"learning_rate": 0.
|
53 |
-
"loss": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
"step": 288
|
55 |
},
|
56 |
{
|
57 |
-
"epoch":
|
58 |
-
"eval_accuracy": 0.
|
59 |
-
"eval_f1": 0.
|
60 |
-
"eval_loss": 0.
|
61 |
-
"eval_precision": 0.
|
62 |
-
"eval_recall": 0.
|
63 |
-
"eval_runtime":
|
64 |
-
"eval_samples_per_second":
|
65 |
-
"eval_steps_per_second": 0.
|
66 |
"step": 288
|
67 |
}
|
68 |
],
|
69 |
"logging_steps": 500,
|
70 |
-
"max_steps":
|
71 |
"num_input_tokens_seen": 0,
|
72 |
-
"num_train_epochs":
|
73 |
"save_steps": 500,
|
74 |
-
"total_flos":
|
75 |
-
"train_batch_size":
|
76 |
"trial_name": null,
|
77 |
"trial_params": {
|
78 |
-
"alpha": 0.
|
79 |
-
"learning_rate": 0.
|
80 |
-
"num_train_epochs":
|
81 |
-
"temperature":
|
82 |
}
|
83 |
}
|
|
|
1 |
{
|
2 |
+
"best_metric": 0.8140900195694716,
|
3 |
"best_model_checkpoint": "tiny-bert-sst2-distilled/run-10/checkpoint-288",
|
4 |
+
"epoch": 6.0,
|
5 |
"eval_steps": 500,
|
6 |
"global_step": 288,
|
7 |
"is_hyper_param_search": true,
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 1.0,
|
13 |
+
"grad_norm": 1.5607351064682007,
|
14 |
+
"learning_rate": 0.0008549093212842401,
|
15 |
+
"loss": 0.5996,
|
16 |
+
"step": 48
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7632093933463796,
|
21 |
+
"eval_f1": 0.7949152542372881,
|
22 |
+
"eval_loss": 0.502604067325592,
|
23 |
+
"eval_precision": 0.7010463378176383,
|
24 |
+
"eval_recall": 0.9178082191780822,
|
25 |
+
"eval_runtime": 27.7883,
|
26 |
+
"eval_samples_per_second": 36.778,
|
27 |
+
"eval_steps_per_second": 0.576,
|
28 |
+
"step": 48
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"epoch": 2.0,
|
32 |
+
"grad_norm": 3.883798122406006,
|
33 |
+
"learning_rate": 0.0007124244344035335,
|
34 |
+
"loss": 0.4919,
|
35 |
"step": 96
|
36 |
},
|
37 |
{
|
38 |
"epoch": 2.0,
|
39 |
+
"eval_accuracy": 0.7847358121330724,
|
40 |
+
"eval_f1": 0.8066783831282953,
|
41 |
+
"eval_loss": 0.4441834092140198,
|
42 |
+
"eval_precision": 0.7320574162679426,
|
43 |
+
"eval_recall": 0.898238747553816,
|
44 |
+
"eval_runtime": 27.0257,
|
45 |
+
"eval_samples_per_second": 37.816,
|
46 |
+
"eval_steps_per_second": 0.592,
|
47 |
+
"step": 96
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"epoch": 3.0,
|
51 |
+
"grad_norm": 3.711146116256714,
|
52 |
+
"learning_rate": 0.0005699395475228268,
|
53 |
+
"loss": 0.449,
|
54 |
+
"step": 144
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"epoch": 3.0,
|
58 |
+
"eval_accuracy": 0.799412915851272,
|
59 |
+
"eval_f1": 0.8222029488291415,
|
60 |
+
"eval_loss": 0.46845388412475586,
|
61 |
+
"eval_precision": 0.7383177570093458,
|
62 |
+
"eval_recall": 0.9275929549902152,
|
63 |
+
"eval_runtime": 27.3696,
|
64 |
+
"eval_samples_per_second": 37.341,
|
65 |
+
"eval_steps_per_second": 0.585,
|
66 |
+
"step": 144
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"epoch": 4.0,
|
70 |
+
"grad_norm": 4.223331451416016,
|
71 |
+
"learning_rate": 0.0004274546606421201,
|
72 |
+
"loss": 0.4144,
|
73 |
"step": 192
|
74 |
},
|
75 |
{
|
76 |
+
"epoch": 4.0,
|
77 |
+
"eval_accuracy": 0.8003913894324853,
|
78 |
+
"eval_f1": 0.796812749003984,
|
79 |
+
"eval_loss": 0.45222949981689453,
|
80 |
+
"eval_precision": 0.8113590263691683,
|
81 |
+
"eval_recall": 0.7827788649706457,
|
82 |
+
"eval_runtime": 27.0074,
|
83 |
+
"eval_samples_per_second": 37.841,
|
84 |
+
"eval_steps_per_second": 0.592,
|
85 |
"step": 192
|
86 |
},
|
87 |
{
|
88 |
+
"epoch": 5.0,
|
89 |
+
"grad_norm": 2.6906895637512207,
|
90 |
+
"learning_rate": 0.0002849697737614134,
|
91 |
+
"loss": 0.3883,
|
92 |
+
"step": 240
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"epoch": 5.0,
|
96 |
+
"eval_accuracy": 0.8033268101761253,
|
97 |
+
"eval_f1": 0.8280581693755347,
|
98 |
+
"eval_loss": 0.4314705431461334,
|
99 |
+
"eval_precision": 0.7355623100303952,
|
100 |
+
"eval_recall": 0.9471624266144814,
|
101 |
+
"eval_runtime": 27.3713,
|
102 |
+
"eval_samples_per_second": 37.338,
|
103 |
+
"eval_steps_per_second": 0.585,
|
104 |
+
"step": 240
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"epoch": 6.0,
|
108 |
+
"grad_norm": 2.0857512950897217,
|
109 |
+
"learning_rate": 0.0001424848868807067,
|
110 |
+
"loss": 0.3715,
|
111 |
"step": 288
|
112 |
},
|
113 |
{
|
114 |
+
"epoch": 6.0,
|
115 |
+
"eval_accuracy": 0.8140900195694716,
|
116 |
+
"eval_f1": 0.8327464788732395,
|
117 |
+
"eval_loss": 0.4203811585903168,
|
118 |
+
"eval_precision": 0.7568,
|
119 |
+
"eval_recall": 0.9256360078277887,
|
120 |
+
"eval_runtime": 27.3476,
|
121 |
+
"eval_samples_per_second": 37.371,
|
122 |
+
"eval_steps_per_second": 0.585,
|
123 |
"step": 288
|
124 |
}
|
125 |
],
|
126 |
"logging_steps": 500,
|
127 |
+
"max_steps": 336,
|
128 |
"num_input_tokens_seen": 0,
|
129 |
+
"num_train_epochs": 7,
|
130 |
"save_steps": 500,
|
131 |
+
"total_flos": 1414171183680.0,
|
132 |
+
"train_batch_size": 64,
|
133 |
"trial_name": null,
|
134 |
"trial_params": {
|
135 |
+
"alpha": 0.9873851004059778,
|
136 |
+
"learning_rate": 0.0009973942081649468,
|
137 |
+
"num_train_epochs": 7,
|
138 |
+
"temperature": 16
|
139 |
}
|
140 |
}
|
run-10/checkpoint-288/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4920
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8cb76553ba5d5fd16067059456c55e07ddbd2f0f51720f91f3d3e0bb7e3a6405
|
3 |
size 4920
|
run-10/checkpoint-336/config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 128,
|
11 |
+
"id2label": {
|
12 |
+
"0": "negative",
|
13 |
+
"1": "positive"
|
14 |
+
},
|
15 |
+
"initializer_range": 0.02,
|
16 |
+
"intermediate_size": 512,
|
17 |
+
"label2id": {
|
18 |
+
"negative": "0",
|
19 |
+
"positive": "1"
|
20 |
+
},
|
21 |
+
"layer_norm_eps": 1e-12,
|
22 |
+
"max_position_embeddings": 512,
|
23 |
+
"model_type": "bert",
|
24 |
+
"num_attention_heads": 2,
|
25 |
+
"num_hidden_layers": 2,
|
26 |
+
"pad_token_id": 0,
|
27 |
+
"position_embedding_type": "absolute",
|
28 |
+
"problem_type": "single_label_classification",
|
29 |
+
"torch_dtype": "float32",
|
30 |
+
"transformers_version": "4.38.2",
|
31 |
+
"type_vocab_size": 2,
|
32 |
+
"use_cache": true,
|
33 |
+
"vocab_size": 30522
|
34 |
+
}
|
run-10/checkpoint-336/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db8fa0c6189436d5f7ed954aecc452f28c14a2f60bae8edc6e3af80084c3c2eb
|
3 |
+
size 17549312
|
run-10/checkpoint-336/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:05006656683bf23068bad41417e56cd397242a22b1831ffd9d753f9ee90bcb0e
|
3 |
+
size 35122746
|
run-10/checkpoint-336/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:30720e851cc54860df5aa89a6892389c4264a6061ff42a4eef7ff950f504b083
|
3 |
+
size 14054
|
run-10/checkpoint-336/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f8a700295948ab1cf174647dfa8e4b564d6690fbaaafa7ad04ce093ef700034
|
3 |
+
size 1064
|
run-10/checkpoint-336/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
run-10/checkpoint-336/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
run-10/checkpoint-336/tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"mask_token": "[MASK]",
|
49 |
+
"model_max_length": 512,
|
50 |
+
"never_split": null,
|
51 |
+
"pad_token": "[PAD]",
|
52 |
+
"sep_token": "[SEP]",
|
53 |
+
"strip_accents": null,
|
54 |
+
"tokenize_chinese_chars": true,
|
55 |
+
"tokenizer_class": "BertTokenizer",
|
56 |
+
"unk_token": "[UNK]"
|
57 |
+
}
|
run-10/checkpoint-336/trainer_state.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.824853228962818,
|
3 |
+
"best_model_checkpoint": "tiny-bert-sst2-distilled/run-10/checkpoint-336",
|
4 |
+
"epoch": 7.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 336,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"grad_norm": 1.5607351064682007,
|
14 |
+
"learning_rate": 0.0008549093212842401,
|
15 |
+
"loss": 0.5996,
|
16 |
+
"step": 48
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7632093933463796,
|
21 |
+
"eval_f1": 0.7949152542372881,
|
22 |
+
"eval_loss": 0.502604067325592,
|
23 |
+
"eval_precision": 0.7010463378176383,
|
24 |
+
"eval_recall": 0.9178082191780822,
|
25 |
+
"eval_runtime": 27.7883,
|
26 |
+
"eval_samples_per_second": 36.778,
|
27 |
+
"eval_steps_per_second": 0.576,
|
28 |
+
"step": 48
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"epoch": 2.0,
|
32 |
+
"grad_norm": 3.883798122406006,
|
33 |
+
"learning_rate": 0.0007124244344035335,
|
34 |
+
"loss": 0.4919,
|
35 |
+
"step": 96
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"epoch": 2.0,
|
39 |
+
"eval_accuracy": 0.7847358121330724,
|
40 |
+
"eval_f1": 0.8066783831282953,
|
41 |
+
"eval_loss": 0.4441834092140198,
|
42 |
+
"eval_precision": 0.7320574162679426,
|
43 |
+
"eval_recall": 0.898238747553816,
|
44 |
+
"eval_runtime": 27.0257,
|
45 |
+
"eval_samples_per_second": 37.816,
|
46 |
+
"eval_steps_per_second": 0.592,
|
47 |
+
"step": 96
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"epoch": 3.0,
|
51 |
+
"grad_norm": 3.711146116256714,
|
52 |
+
"learning_rate": 0.0005699395475228268,
|
53 |
+
"loss": 0.449,
|
54 |
+
"step": 144
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"epoch": 3.0,
|
58 |
+
"eval_accuracy": 0.799412915851272,
|
59 |
+
"eval_f1": 0.8222029488291415,
|
60 |
+
"eval_loss": 0.46845388412475586,
|
61 |
+
"eval_precision": 0.7383177570093458,
|
62 |
+
"eval_recall": 0.9275929549902152,
|
63 |
+
"eval_runtime": 27.3696,
|
64 |
+
"eval_samples_per_second": 37.341,
|
65 |
+
"eval_steps_per_second": 0.585,
|
66 |
+
"step": 144
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"epoch": 4.0,
|
70 |
+
"grad_norm": 4.223331451416016,
|
71 |
+
"learning_rate": 0.0004274546606421201,
|
72 |
+
"loss": 0.4144,
|
73 |
+
"step": 192
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"epoch": 4.0,
|
77 |
+
"eval_accuracy": 0.8003913894324853,
|
78 |
+
"eval_f1": 0.796812749003984,
|
79 |
+
"eval_loss": 0.45222949981689453,
|
80 |
+
"eval_precision": 0.8113590263691683,
|
81 |
+
"eval_recall": 0.7827788649706457,
|
82 |
+
"eval_runtime": 27.0074,
|
83 |
+
"eval_samples_per_second": 37.841,
|
84 |
+
"eval_steps_per_second": 0.592,
|
85 |
+
"step": 192
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"epoch": 5.0,
|
89 |
+
"grad_norm": 2.6906895637512207,
|
90 |
+
"learning_rate": 0.0002849697737614134,
|
91 |
+
"loss": 0.3883,
|
92 |
+
"step": 240
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"epoch": 5.0,
|
96 |
+
"eval_accuracy": 0.8033268101761253,
|
97 |
+
"eval_f1": 0.8280581693755347,
|
98 |
+
"eval_loss": 0.4314705431461334,
|
99 |
+
"eval_precision": 0.7355623100303952,
|
100 |
+
"eval_recall": 0.9471624266144814,
|
101 |
+
"eval_runtime": 27.3713,
|
102 |
+
"eval_samples_per_second": 37.338,
|
103 |
+
"eval_steps_per_second": 0.585,
|
104 |
+
"step": 240
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"epoch": 6.0,
|
108 |
+
"grad_norm": 2.0857512950897217,
|
109 |
+
"learning_rate": 0.0001424848868807067,
|
110 |
+
"loss": 0.3715,
|
111 |
+
"step": 288
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"epoch": 6.0,
|
115 |
+
"eval_accuracy": 0.8140900195694716,
|
116 |
+
"eval_f1": 0.8327464788732395,
|
117 |
+
"eval_loss": 0.4203811585903168,
|
118 |
+
"eval_precision": 0.7568,
|
119 |
+
"eval_recall": 0.9256360078277887,
|
120 |
+
"eval_runtime": 27.3476,
|
121 |
+
"eval_samples_per_second": 37.371,
|
122 |
+
"eval_steps_per_second": 0.585,
|
123 |
+
"step": 288
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"epoch": 7.0,
|
127 |
+
"grad_norm": 2.2891719341278076,
|
128 |
+
"learning_rate": 0.0,
|
129 |
+
"loss": 0.3626,
|
130 |
+
"step": 336
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"epoch": 7.0,
|
134 |
+
"eval_accuracy": 0.824853228962818,
|
135 |
+
"eval_f1": 0.8359303391384051,
|
136 |
+
"eval_loss": 0.4121144413948059,
|
137 |
+
"eval_precision": 0.7862068965517242,
|
138 |
+
"eval_recall": 0.8923679060665362,
|
139 |
+
"eval_runtime": 30.2756,
|
140 |
+
"eval_samples_per_second": 33.757,
|
141 |
+
"eval_steps_per_second": 0.528,
|
142 |
+
"step": 336
|
143 |
+
}
|
144 |
+
],
|
145 |
+
"logging_steps": 500,
|
146 |
+
"max_steps": 336,
|
147 |
+
"num_input_tokens_seen": 0,
|
148 |
+
"num_train_epochs": 7,
|
149 |
+
"save_steps": 500,
|
150 |
+
"total_flos": 1649866380960.0,
|
151 |
+
"train_batch_size": 64,
|
152 |
+
"trial_name": null,
|
153 |
+
"trial_params": {
|
154 |
+
"alpha": 0.9873851004059778,
|
155 |
+
"learning_rate": 0.0009973942081649468,
|
156 |
+
"num_train_epochs": 7,
|
157 |
+
"temperature": 16
|
158 |
+
}
|
159 |
+
}
|
run-10/checkpoint-336/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8cb76553ba5d5fd16067059456c55e07ddbd2f0f51720f91f3d3e0bb7e3a6405
|
3 |
+
size 4920
|
run-10/checkpoint-336/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
run-11/checkpoint-48/config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
|
3 |
+
"architectures": [
|
4 |
+
"BertForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 128,
|
11 |
+
"id2label": {
|
12 |
+
"0": "negative",
|
13 |
+
"1": "positive"
|
14 |
+
},
|
15 |
+
"initializer_range": 0.02,
|
16 |
+
"intermediate_size": 512,
|
17 |
+
"label2id": {
|
18 |
+
"negative": "0",
|
19 |
+
"positive": "1"
|
20 |
+
},
|
21 |
+
"layer_norm_eps": 1e-12,
|
22 |
+
"max_position_embeddings": 512,
|
23 |
+
"model_type": "bert",
|
24 |
+
"num_attention_heads": 2,
|
25 |
+
"num_hidden_layers": 2,
|
26 |
+
"pad_token_id": 0,
|
27 |
+
"position_embedding_type": "absolute",
|
28 |
+
"problem_type": "single_label_classification",
|
29 |
+
"torch_dtype": "float32",
|
30 |
+
"transformers_version": "4.38.2",
|
31 |
+
"type_vocab_size": 2,
|
32 |
+
"use_cache": true,
|
33 |
+
"vocab_size": 30522
|
34 |
+
}
|
run-11/checkpoint-48/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:361284d6cbbbf0747d1d2496b9712460a833e47055619c1ce3d78588bf871550
|
3 |
+
size 17549312
|
run-11/checkpoint-48/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c25644a65bdf41de7ee0309c13f675afc49f167008ba62ccb37e4e30760af0c
|
3 |
+
size 35122746
|
run-11/checkpoint-48/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f731098665929ed6a815501631b54240caaac6508207c5c55cf0fe36ad39b17
|
3 |
+
size 14054
|
run-11/checkpoint-48/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:678e7fd168382ae4d233368ea59a020ab00987ed0913f4b94fa8eb4e73fa2007
|
3 |
+
size 1064
|
run-11/checkpoint-48/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
run-11/checkpoint-48/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
run-11/checkpoint-48/tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": true,
|
48 |
+
"mask_token": "[MASK]",
|
49 |
+
"model_max_length": 512,
|
50 |
+
"never_split": null,
|
51 |
+
"pad_token": "[PAD]",
|
52 |
+
"sep_token": "[SEP]",
|
53 |
+
"strip_accents": null,
|
54 |
+
"tokenize_chinese_chars": true,
|
55 |
+
"tokenizer_class": "BertTokenizer",
|
56 |
+
"unk_token": "[UNK]"
|
57 |
+
}
|
run-11/checkpoint-48/trainer_state.json
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"best_metric": 0.7524461839530333,
|
3 |
+
"best_model_checkpoint": "tiny-bert-sst2-distilled/run-11/checkpoint-48",
|
4 |
+
"epoch": 1.0,
|
5 |
+
"eval_steps": 500,
|
6 |
+
"global_step": 48,
|
7 |
+
"is_hyper_param_search": true,
|
8 |
+
"is_local_process_zero": true,
|
9 |
+
"is_world_process_zero": true,
|
10 |
+
"log_history": [
|
11 |
+
{
|
12 |
+
"epoch": 1.0,
|
13 |
+
"grad_norm": 2.152926206588745,
|
14 |
+
"learning_rate": 0.0007816128679887146,
|
15 |
+
"loss": 0.591,
|
16 |
+
"step": 48
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.7524461839530333,
|
21 |
+
"eval_f1": 0.7944760357432982,
|
22 |
+
"eval_loss": 0.4880000948905945,
|
23 |
+
"eval_precision": 0.6791666666666667,
|
24 |
+
"eval_recall": 0.9569471624266145,
|
25 |
+
"eval_runtime": 29.551,
|
26 |
+
"eval_samples_per_second": 34.584,
|
27 |
+
"eval_steps_per_second": 0.541,
|
28 |
+
"step": 48
|
29 |
+
}
|
30 |
+
],
|
31 |
+
"logging_steps": 500,
|
32 |
+
"max_steps": 336,
|
33 |
+
"num_input_tokens_seen": 0,
|
34 |
+
"num_train_epochs": 7,
|
35 |
+
"save_steps": 500,
|
36 |
+
"total_flos": 235695197280.0,
|
37 |
+
"train_batch_size": 64,
|
38 |
+
"trial_name": null,
|
39 |
+
"trial_params": {
|
40 |
+
"alpha": 0.9971844266637861,
|
41 |
+
"learning_rate": 0.0009118816793201671,
|
42 |
+
"num_train_epochs": 7,
|
43 |
+
"temperature": 16
|
44 |
+
}
|
45 |
+
}
|
run-11/checkpoint-48/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a283f7ca8fa712756294b7fd6d3247bcb44ee70c9bb9ae961204f34de91039c
|
3 |
+
size 4920
|
run-11/checkpoint-48/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4920
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a283f7ca8fa712756294b7fd6d3247bcb44ee70c9bb9ae961204f34de91039c
|
3 |
size 4920
|