lapp0 committed on
Commit
f56b68f
·
verified ·
1 Parent(s): 7da8b46

End of training

Browse files
README.md CHANGED
@@ -41,38 +41,38 @@ More information needed
41
 
42
  # Benchmark Metrics Comparison
43
 
44
- | Metric | attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles | teacher |
45
- | :--- | :--- | :--- |
46
- | ai2_arc (acc) | 0.256 | 0.304 |
47
- | ai2_arc (acc_norm) | 0.267 | 0.309 |
48
- | arc_challenge (acc) | 0.177 | 0.184 |
49
- | arc_challenge (acc_norm) | 0.202 | 0.214 |
50
- | arc_easy (acc) | 0.335 | 0.424 |
51
- | arc_easy (acc_norm) | 0.332 | 0.405 |
52
- | boolq (acc) | 0.377 | 0.541 |
53
- | cola (mcc) | 0.0 | 0.009 |
54
- | glue (acc) | 0.444 | 0.41 |
55
- | glue (f1) | 0.279 | 0.526 |
56
- | glue (mcc) | 0.0 | 0.009 |
57
- | hellaswag (acc) | 0.302 | 0.337 |
58
- | hellaswag (acc_norm) | 0.308 | 0.384 |
59
- | mnli (acc) | 0.331 | 0.323 |
60
- | mnli_mismatch (acc) | 0.367 | 0.344 |
61
- | mrpc (acc) | 0.336 | 0.515 |
62
- | mrpc (f1) | 0.075 | 0.631 |
63
- | qnli (acc) | 0.519 | 0.472 |
64
- | qqp (acc) | 0.515 | 0.34 |
65
- | qqp (f1) | 0.363 | 0.483 |
66
- | rte (acc) | 0.57 | 0.516 |
67
- | sst2 (acc) | 0.498 | 0.511 |
68
- | wikitext (bits_per_byte) | 1.273 | 0.98 |
69
- | wikitext (byte_perplexity) | 2.416 | 1.973 |
70
- | wikitext (word_perplexity) | 111.9 | 37.82 |
71
- | wnli (acc) | 0.521 | 0.451 |
72
 
73
  # Resource Usage Comparison
74
 
75
- - VRAM Use: 7.7854 GB
76
 
77
  # Distillation (Teacher -> Student) Architecture Difference:
78
 
@@ -92,7 +92,7 @@ More information needed
92
  <br/>
93
 
94
  # Train Dataset
95
- Trained on 145,722,156 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
96
 
97
  - Num Samples: `247,500`
98
  - Subset: `20231101.en`
@@ -102,7 +102,7 @@ Trained on 145,722,156 tokens from the [wikimedia/wikipedia](https://huggingface
102
  # Training Objective
103
 
104
  ```
105
- DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=all, projector=miles))
106
  ```
107
 
108
  # Hyperparameters
@@ -119,9 +119,9 @@ The following hyperparameters were used during training:
119
  - lr_scheduler_type: `cosine_with_min_lr`
120
  - lr_scheduler_warmup_ratio: `0.5`
121
  - num_epochs: `1.0`
122
- - distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=all, projector=miles))`
123
  - train_embeddings: `True`
124
- - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6efc464fd0>`
125
  - student_model_name_or_path: `None`
126
  - student_config_name_or_path: `None`
127
  - student_model_config: `None`
 
41
 
42
  # Benchmark Metrics Comparison
43
 
44
+ | Metric | attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles | attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles | teacher |
45
+ | :--- | :--- | :--- | :--- |
46
+ | ai2_arc (acc) | 0.228 | 0.256 | 0.304 |
47
+ | ai2_arc (acc_norm) | 0.258 | 0.267 | 0.309 |
48
+ | arc_challenge (acc) | 0.186 | 0.177 | 0.184 |
49
+ | arc_challenge (acc_norm) | 0.227 | 0.202 | 0.214 |
50
+ | arc_easy (acc) | 0.27 | 0.335 | 0.424 |
51
+ | arc_easy (acc_norm) | 0.288 | 0.332 | 0.405 |
52
+ | boolq (acc) | 0.375 | 0.377 | 0.541 |
53
+ | cola (mcc) | 0.0 | 0.0 | 0.009 |
54
+ | glue (acc) | 0.454 | 0.444 | 0.41 |
55
+ | glue (f1) | 0.0 | 0.279 | 0.526 |
56
+ | glue (mcc) | 0.0 | 0.0 | 0.009 |
57
+ | hellaswag (acc) | 0.282 | 0.302 | 0.337 |
58
+ | hellaswag (acc_norm) | 0.275 | 0.308 | 0.384 |
59
+ | mnli (acc) | 0.326 | 0.331 | 0.323 |
60
+ | mnli_mismatch (acc) | 0.295 | 0.367 | 0.344 |
61
+ | mrpc (acc) | 0.316 | 0.336 | 0.515 |
62
+ | mrpc (f1) | 0.0 | 0.075 | 0.631 |
63
+ | qnli (acc) | 0.527 | 0.519 | 0.472 |
64
+ | qqp (acc) | 0.673 | 0.515 | 0.34 |
65
+ | qqp (f1) | 0.0 | 0.363 | 0.483 |
66
+ | rte (acc) | 0.52 | 0.57 | 0.516 |
67
+ | sst2 (acc) | 0.492 | 0.498 | 0.511 |
68
+ | wikitext (bits_per_byte) | 1.888 | 1.273 | 0.98 |
69
+ | wikitext (byte_perplexity) | 3.701 | 2.416 | 1.973 |
70
+ | wikitext (word_perplexity) | 1094.0 | 111.9 | 37.82 |
71
+ | wnli (acc) | 0.437 | 0.521 | 0.451 |
72
 
73
  # Resource Usage Comparison
74
 
75
+ - VRAM Use: 8.2920 GB
76
 
77
  # Distillation (Teacher -> Student) Architecture Difference:
78
 
 
92
  <br/>
93
 
94
  # Train Dataset
95
+ Trained on 145,724,804 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
96
 
97
  - Num Samples: `247,500`
98
  - Subset: `20231101.en`
 
102
  # Training Objective
103
 
104
  ```
105
+ DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=logsum, layer_mapper=all, projector=miles))
106
  ```
107
 
108
  # Hyperparameters
 
119
  - lr_scheduler_type: `cosine_with_min_lr`
120
  - lr_scheduler_warmup_ratio: `0.5`
121
  - num_epochs: `1.0`
122
+ - distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=logsum, layer_mapper=all, projector=miles))`
123
  - train_embeddings: `True`
124
+ - lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6927719540>`
125
  - student_model_name_or_path: `None`
126
  - student_config_name_or_path: `None`
127
  - student_model_config: `None`
benchmarks.shelve.bak CHANGED
@@ -1,2 +1,3 @@
1
  'teacher', (0, 14412556)
2
  'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
 
 
1
  'teacher', (0, 14412556)
2
  'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
3
+ 'attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles', (28825600, 14412543)
benchmarks.shelve.dat CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15752df966654d7744eed260668d6b9896db6cad1636bbc6ea69f57a26dbf0c1
3
- size 28825343
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8a09705a037029d69c5e81224eeebac8dc59482639c6426a73ca1da1a1c699f
3
+ size 43238143
benchmarks.shelve.dir CHANGED
@@ -1,2 +1,3 @@
1
  'teacher', (0, 14412556)
2
  'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
 
 
1
  'teacher', (0, 14412556)
2
  'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
3
+ 'attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles', (28825600, 14412543)
tokenizer.json CHANGED
@@ -1,19 +1,7 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 1023,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
- "padding": {
10
- "strategy": "BatchLongest",
11
- "direction": "Right",
12
- "pad_to_multiple_of": null,
13
- "pad_id": 50256,
14
- "pad_type_id": 0,
15
- "pad_token": "<|endoftext|>"
16
- },
17
  "added_tokens": [
18
  {
19
  "id": 50256,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 50256,