End of training

Browse files

Files changed (5) hide show

README.md +33 -33
benchmarks.shelve.bak +1 -0
benchmarks.shelve.dat +2 -2
benchmarks.shelve.dir +1 -0
tokenizer.json +2 -14

README.md CHANGED Viewed

@@ -41,38 +41,38 @@ More information needed
 # Benchmark Metrics Comparison
-| Metric | attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles | teacher |
-| :--- | :--- | :--- |
-| ai2_arc (acc) | 0.256 | 0.304 |
-| ai2_arc (acc_norm) | 0.267 | 0.309 |
-| arc_challenge (acc) | 0.177 | 0.184 |
-| arc_challenge (acc_norm) | 0.202 | 0.214 |
-| arc_easy (acc) | 0.335 | 0.424 |
-| arc_easy (acc_norm) | 0.332 | 0.405 |
-| boolq (acc) | 0.377 | 0.541 |
-| cola (mcc) | 0.0 | 0.009 |
-| glue (acc) | 0.444 | 0.41 |
-| glue (f1) | 0.279 | 0.526 |
-| glue (mcc) | 0.0 | 0.009 |
-| hellaswag (acc) | 0.302 | 0.337 |
-| hellaswag (acc_norm) | 0.308 | 0.384 |
-| mnli (acc) | 0.331 | 0.323 |
-| mnli_mismatch (acc) | 0.367 | 0.344 |
-| mrpc (acc) | 0.336 | 0.515 |
-| mrpc (f1) | 0.075 | 0.631 |
-| qnli (acc) | 0.519 | 0.472 |
-| qqp (acc) | 0.515 | 0.34 |
-| qqp (f1) | 0.363 | 0.483 |
-| rte (acc) | 0.57 | 0.516 |
-| sst2 (acc) | 0.498 | 0.511 |
-| wikitext (bits_per_byte) | 1.273 | 0.98 |
-| wikitext (byte_perplexity) | 2.416 | 1.973 |
-| wikitext (word_perplexity) | 111.9 | 37.82 |
-| wnli (acc) | 0.521 | 0.451 |
 # Resource Usage Comparison
-- VRAM Use: 7.7854 GB
 # Distillation (Teacher -> Student) Architecture Difference:
@@ -92,7 +92,7 @@ More information needed
 <br/>
 # Train Dataset
-Trained on 145,722,156 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 - Num Samples: `247,500`
 - Subset: `20231101.en`
@@ -102,7 +102,7 @@ Trained on 145,722,156 tokens from the [wikimedia/wikipedia](https://huggingface
 # Training Objective
 ```
-DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=all, projector=miles))
 ```
 # Hyperparameters
@@ -119,9 +119,9 @@ The following hyperparameters were used during training:
 - lr_scheduler_type: `cosine_with_min_lr`
 - lr_scheduler_warmup_ratio: `0.5`
 - num_epochs: `1.0`
-- distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=all, projector=miles))`
 - train_embeddings: `True`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6efc464fd0>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `None`

 # Benchmark Metrics Comparison
+| Metric | attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles | attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles | teacher |
+| :--- | :--- | :--- | :--- |
+| ai2_arc (acc) | 0.228 | 0.256 | 0.304 |
+| ai2_arc (acc_norm) | 0.258 | 0.267 | 0.309 |
+| arc_challenge (acc) | 0.186 | 0.177 | 0.184 |
+| arc_challenge (acc_norm) | 0.227 | 0.202 | 0.214 |
+| arc_easy (acc) | 0.27 | 0.335 | 0.424 |
+| arc_easy (acc_norm) | 0.288 | 0.332 | 0.405 |
+| boolq (acc) | 0.375 | 0.377 | 0.541 |
+| cola (mcc) | 0.0 | 0.0 | 0.009 |
+| glue (acc) | 0.454 | 0.444 | 0.41 |
+| glue (f1) | 0.0 | 0.279 | 0.526 |
+| glue (mcc) | 0.0 | 0.0 | 0.009 |
+| hellaswag (acc) | 0.282 | 0.302 | 0.337 |
+| hellaswag (acc_norm) | 0.275 | 0.308 | 0.384 |
+| mnli (acc) | 0.326 | 0.331 | 0.323 |
+| mnli_mismatch (acc) | 0.295 | 0.367 | 0.344 |
+| mrpc (acc) | 0.316 | 0.336 | 0.515 |
+| mrpc (f1) | 0.0 | 0.075 | 0.631 |
+| qnli (acc) | 0.527 | 0.519 | 0.472 |
+| qqp (acc) | 0.673 | 0.515 | 0.34 |
+| qqp (f1) | 0.0 | 0.363 | 0.483 |
+| rte (acc) | 0.52 | 0.57 | 0.516 |
+| sst2 (acc) | 0.492 | 0.498 | 0.511 |
+| wikitext (bits_per_byte) | 1.888 | 1.273 | 0.98 |
+| wikitext (byte_perplexity) | 3.701 | 2.416 | 1.973 |
+| wikitext (word_perplexity) | 1094.0 | 111.9 | 37.82 |
+| wnli (acc) | 0.437 | 0.521 | 0.451 |
 # Resource Usage Comparison
+- VRAM Use: 8.2920 GB
 # Distillation (Teacher -> Student) Architecture Difference:
 <br/>
 # Train Dataset
+Trained on 145,724,804 tokens from the [wikimedia/wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset.
 - Num Samples: `247,500`
 - Subset: `20231101.en`
 # Training Objective
 ```
+DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=logsum, layer_mapper=all, projector=miles))
 ```
 # Hyperparameters
 - lr_scheduler_type: `cosine_with_min_lr`
 - lr_scheduler_warmup_ratio: `0.5`
 - num_epochs: `1.0`
+- distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=logsum, layer_mapper=all, projector=miles))`
 - train_embeddings: `True`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f6927719540>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `None`

benchmarks.shelve.bak CHANGED Viewed

@@ -1,2 +1,3 @@
 'teacher', (0, 14412556)
 'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)

 'teacher', (0, 14412556)
 'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
+'attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles', (28825600, 14412543)

benchmarks.shelve.dat CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15752df966654d7744eed260668d6b9896db6cad1636bbc6ea69f57a26dbf0c1
-size 28825343

 version https://git-lfs.github.com/spec/v1
+oid sha256:e8a09705a037029d69c5e81224eeebac8dc59482639c6426a73ca1da1a1c699f
+size 43238143

benchmarks.shelve.dir CHANGED Viewed

@@ -1,2 +1,3 @@
 'teacher', (0, 14412556)
 'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)

 'teacher', (0, 14412556)
 'attn_layer_mapper=all, attn_loss_fn=raw_mse, attn_projector=miles', (14412800, 14412543)
+'attn_layer_mapper=all, attn_loss_fn=logsum, attn_projector=miles', (28825600, 14412543)

tokenizer.json CHANGED Viewed

@@ -1,19 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 1023,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": "BatchLongest",
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 50256,
-    "pad_type_id": 0,
-    "pad_token": "<|endoftext|>"
-  },
   "added_tokens": [
     {
       "id": 50256,

 {
   "version": "1.0",
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 50256,