Upload ICM-DPO enhanced Gemma PEFT adapter with comprehensive LoRA and model card
Changed files:
- README.md +6 -6
- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- results.json +13 -13
- training_config.json +8 -8
README.md CHANGED
```diff
@@ -41,7 +41,7 @@ This model demonstrates comprehensive capability enhancement using ICM-generated
 - **Target Modules**: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
 - **Modules to Save**: embed_tokens, lm_head (full layers)
 - **Training Method**: Direct Preference Optimization (DPO)
-- **Beta (KL Penalty)**: 0.
+- **Beta (KL Penalty)**: 0.1
 - **Adapter Size**: ~669MB (includes full embedding/head layers)
 - **Trainable Parameters**: ~56.14% of base model
 
@@ -50,15 +50,15 @@ This model demonstrates comprehensive capability enhancement using ICM-generated
 ### Dataset
 - **Source**: [codelion/gemma-3-270m-icm-dpo](https://huggingface.co/datasets/codelion/gemma-3-270m-icm-dpo)
 - **Method**: ICM (Internal Coherence Maximization) for label-free preference generation
-- **Training Samples**:
+- **Training Samples**: 1060
 - **Evaluation Samples**: 50
 
 ### Training Configuration
-- **Epochs**:
-- **Batch Size**:
+- **Epochs**: 3
+- **Batch Size**: 4 (per device)
 - **Gradient Accumulation**: 8 steps
-- **Effective Batch Size**:
-- **Learning Rate**:
+- **Effective Batch Size**: 32
+- **Learning Rate**: 5e-06
 - **Optimizer**: paged_adamw_8bit
 - **Memory Optimization**: BF16, Gradient Checkpointing
 
```
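The card above describes everything needed to consume the adapter. As a minimal sketch (assuming recent `transformers` and `peft`; `path/to/adapter` is a placeholder for a local checkout or the Hub id of this repo):

```python
# Minimal sketch: load the DPO-trained LoRA adapter on its base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-270m-it", torch_dtype=torch.bfloat16
)
# Placeholder path; loading also restores the saved embed_tokens/lm_head layers.
model = PeftModel.from_pretrained(base, "path/to/adapter")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}], tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
```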
adapter_config.json CHANGED
```diff
@@ -28,13 +28,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "
-    "down_proj",
-    "v_proj",
+    "o_proj",
     "q_proj",
+    "v_proj",
+    "up_proj",
     "k_proj",
-    "
-    "
+    "down_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",
```
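The reordering in this hunk is likely cosmetic: `peft` holds `target_modules` internally as a set, so the serialized order carries no meaning. For reference, a sketch of a `LoraConfig` that yields this adapter layout (`r` and `lora_alpha` are illustrative assumptions; this hunk does not show them):

```python
from peft import LoraConfig

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,            # assumed; not visible in this hunk
    lora_alpha=32,   # assumed; not visible in this hunk
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    # Trained and saved as full layers, per the model card.
    modules_to_save=["embed_tokens", "lm_head"],
)
```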
adapter_model.safetensors CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6ea8b9b9fe8720ed96c0cc938abb9da0ec35f9fbf0f18d92681aee01e2ec47e1
 size 701497992
```
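The LFS pointer pins the adapter weights (701,497,992 bytes, i.e. the ~669 MiB quoted in the card) by SHA-256. A standard-library integrity check after download (`sha256_of` is just an illustrative helper):

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading ~700 MB into memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

expected = "6ea8b9b9fe8720ed96c0cc938abb9da0ec35f9fbf0f18d92681aee01e2ec47e1"
assert sha256_of("adapter_model.safetensors") == expected
```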
results.json CHANGED
```diff
@@ -1,11 +1,11 @@
 {
   "training_metrics": {
-    "train_runtime":
-    "train_samples_per_second":
-    "train_steps_per_second": 0.
+    "train_runtime": 230.3572,
+    "train_samples_per_second": 13.805,
+    "train_steps_per_second": 0.443,
     "total_flos": 0.0,
-    "train_loss": 0.
-    "epoch":
+    "train_loss": 0.6297222118751675,
+    "epoch": 3.0
   },
   "config": {
     "model_name": "google/gemma-3-270m-it",
@@ -28,23 +28,23 @@
       "lm_head"
     ],
     "max_train_samples": null,
-    "beta": 0.
+    "beta": 0.1,
     "max_length": 1024,
     "max_prompt_length": 512,
-    "batch_size":
+    "batch_size": 4,
     "gradient_accumulation_steps": 8,
-    "learning_rate":
-    "num_train_epochs":
+    "learning_rate": 5e-06,
+    "num_train_epochs": 3,
     "warmup_ratio": 0.1,
-    "weight_decay": 0.
-    "max_grad_norm": 0
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
     "gradient_checkpointing": true,
     "fp16": false,
     "bf16": true,
     "optim": "paged_adamw_8bit",
     "remove_unused_columns": false,
-    "eval_steps":
-    "save_steps":
+    "eval_steps": 50,
+    "save_steps": 100,
     "logging_steps": 10,
     "eval_strategy": "steps",
     "save_strategy": "steps",
```
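As a sanity check, the new `training_metrics` are internally consistent with the `config` block; a short calculation (assuming a single training device):

```python
# Numbers from results.json and the model card.
samples, epochs = 1060, 3
per_device_bs, grad_accum = 4, 8
effective_bs = per_device_bs * grad_accum        # 32, as stated in the card
runtime = 230.3572                               # train_runtime, seconds

total_samples = samples * epochs                 # 3180 samples seen
steps_per_epoch = -(-samples // effective_bs)    # ceil(1060 / 32) = 34
total_steps = steps_per_epoch * epochs           # 102 optimizer steps

print(total_samples / runtime)   # ~13.80, matches train_samples_per_second = 13.805
print(total_steps / runtime)     # ~0.443, matches train_steps_per_second = 0.443
```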
training_config.json CHANGED
```diff
@@ -19,23 +19,23 @@
     "lm_head"
   ],
   "max_train_samples": null,
-  "beta": 0.
+  "beta": 0.1,
   "max_length": 1024,
   "max_prompt_length": 512,
-  "batch_size":
+  "batch_size": 4,
   "gradient_accumulation_steps": 8,
-  "learning_rate":
-  "num_train_epochs":
+  "learning_rate": 5e-06,
+  "num_train_epochs": 3,
   "warmup_ratio": 0.1,
-  "weight_decay": 0.
-  "max_grad_norm": 0
+  "weight_decay": 0.01,
+  "max_grad_norm": 1.0,
   "gradient_checkpointing": true,
   "fp16": false,
   "bf16": true,
   "optim": "paged_adamw_8bit",
   "remove_unused_columns": false,
-  "eval_steps":
-  "save_steps":
+  "eval_steps": 50,
+  "save_steps": 100,
   "logging_steps": 10,
   "eval_strategy": "steps",
   "save_strategy": "steps",
```
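Finally, a sketch of the DPO run these configs describe, using recent `trl`. The model/dataset wiring and the split name are assumptions, and the actual run additionally wrapped the model with the PEFT adapter shown earlier (via `DPOTrainer`'s `peft_config` argument, omitted here):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
dataset = load_dataset("codelion/gemma-3-270m-icm-dpo")  # ICM preference pairs

args = DPOConfig(
    output_dir="gemma-3-270m-icm-dpo-out",  # placeholder
    beta=0.1,
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,          # effective batch size 32
    learning_rate=5e-6,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    gradient_checkpointing=True,
    bf16=True,
    optim="paged_adamw_8bit",
    remove_unused_columns=False,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
)
trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],  # split name assumed
    processing_class=tokenizer,
)
trainer.train()
```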