Model save

Changed files:
- README.md +14 -36
- adapter_config.json +4 -4
- adapter_model.safetensors +2 -2
- training_args.bin +2 -2
README.md
CHANGED
@@ -2,11 +2,10 @@
 license: apache-2.0
 library_name: peft
 tags:
-- axolotl
 - generated_from_trainer
 base_model: mistralai/Mixtral-8x7B-Instruct-v0.1
 model-index:
-- name: mixtral-lora
+- name: mixtral-lora-less-modules
 results: []
 ---
 
@@ -41,7 +40,7 @@ datasets:
 
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
-output_dir:
+output_dir: ./mixtral-qlora-1-epochs-r64
 
 adapter: qlora
 lora_model_dir:
@@ -50,21 +49,18 @@ sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 
-lora_r:
+lora_r: 64
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_linear: true
 lora_fan_in_fan_out:
-hub_model_id: liuylhf/mixtral-lora
+hub_model_id: liuylhf/mixtral-lora-less-modules
+hub_strategy: end
 
-
-
-
-
-
-# - v_proj
-# - k_proj
-# - o_proj
+lora_target_modules:
+- q_proj
+- v_proj
+- k_proj
+- o_proj
 
 wandb_project: function-call
 wandb_name: mixtral-instruct-raw-data-v3
@@ -72,7 +68,7 @@ wandb_log_model: end
 
 gradient_accumulation_steps: 4
 micro_batch_size: 2
-num_epochs:
+num_epochs: 1
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.001
@@ -113,11 +109,9 @@ fsdp_config:
 
 </details><br>
 
-# mixtral-lora
+# mixtral-lora-less-modules
 
-This model is a fine-tuned version of [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) on
-It achieves the following results on the evaluation set:
-- Loss: 0.1923
+This model is a fine-tuned version of [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) on an unknown dataset.
 
 ## Model description
 
@@ -148,23 +142,7 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-05
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- num_epochs:
+- num_epochs: 1
-
-### Training results
-
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 3.2966        | 0.0   | 1    | 3.2222          |
-| 0.3736        | 0.05  | 16   | 0.3541          |
-| 0.1777        | 0.1   | 32   | 0.2357          |
-| 0.2366        | 0.16  | 48   | 0.2154          |
-| 0.1917        | 0.21  | 64   | 0.2056          |
-| 0.2213        | 0.26  | 80   | 0.2003          |
-| 0.149         | 0.31  | 96   | 0.1972          |
-| 0.1739        | 0.37  | 112  | 0.1950          |
-| 0.1668        | 0.42  | 128  | 0.1928          |
-| 0.0997        | 0.47  | 144  | 0.1923          |
-
 
 ### Framework versions
 
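For reference, a minimal sketch of how this adapter could be loaded on top of the base model with transformers and peft. The 4-bit quantization arguments mirror a typical QLoRA setup and are an assumption; only the base model and adapter repo names come from the config above.

```python
# Sketch only: load the base model in 4-bit and attach the pushed LoRA adapter.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
adapter_id = "liuylhf/mixtral-lora-less-modules"

# 4-bit NF4 config chosen to mirror the qlora adapter setting (assumed, not recorded in this commit).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb_config, device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_id)  # applies the r=64 attention-only adapter
```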
adapter_config.json
CHANGED
@@ -15,14 +15,14 @@
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r":
+  "r": 64,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
-    "v_proj",
     "o_proj",
-    "q_proj"
+    "q_proj",
+    "v_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_rslora": false
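The adapter settings above can also be written as an equivalent peft LoraConfig. This is a sketch for reproducing the setup outside axolotl; every value is taken from the diffs in this commit.

```python
# Equivalent PEFT configuration for this adapter (values from the diff above).
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
```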
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bd223b7576019fbc18fdb6df07a26b9b662da56d34b65c55bfe245668d413a3f
+size 218138576
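The new adapter size is consistent with r=64 LoRA on the four attention projections. A rough check, assuming the published Mixtral-8x7B shapes (hidden size 4096, 32 layers, grouped-query attention with 1024-wide k/v projections) and fp32 adapter weights:

```python
# Back-of-the-envelope adapter size (assumes Mixtral-8x7B attention shapes).
hidden, kv_dim, layers, r = 4096, 1024, 32, 64

per_layer = (
    2 * r * (hidden + hidden)    # q_proj and o_proj: 4096 -> 4096
    + 2 * r * (hidden + kv_dim)  # k_proj and v_proj: 4096 -> 1024 (GQA)
)
params = per_layer * layers   # ~54.5M trainable parameters
print(params * 4)             # ~218,103,808 bytes in fp32, close to the 218,138,576-byte file
```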
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:007877440649249ac071b2c5c00afa3c113d2399d566bb3f2dab4235d750f45e
+size 5624
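Both binary entries are Git LFS pointers, so the repository itself stores only a hash and a size. A generic sketch (not specific to this repo) for checking a downloaded object against its pointer:

```python
# Verify a downloaded LFS object against the oid/size from its pointer file.
import hashlib
import os

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

# Example using the values from this commit's training_args.bin pointer:
ok = verify_lfs_object(
    "training_args.bin",
    "007877440649249ac071b2c5c00afa3c113d2399d566bb3f2dab4235d750f45e",
    5624,
)
```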