Upload ICM-DPO enhanced Gemma PEFT adapter with comprehensive LoRA and model card
Changed files:
- README.md +6 -6
- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- results.json +13 -13
- training_config.json +8 -8
README.md CHANGED
```diff
@@ -41,7 +41,7 @@ This model demonstrates comprehensive capability enhancement using ICM-generated
 - **Target Modules**: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
 - **Modules to Save**: embed_tokens, lm_head (full layers)
 - **Training Method**: Direct Preference Optimization (DPO)
-- **Beta (KL Penalty)**: 0.
+- **Beta (KL Penalty)**: 0.1
 - **Adapter Size**: ~669MB (includes full embedding/head layers)
 - **Trainable Parameters**: ~56.14% of base model
 
@@ -50,15 +50,15 @@ This model demonstrates comprehensive capability enhancement using ICM-generated
 ### Dataset
 - **Source**: [codelion/gemma-3-270m-icm-dpo](https://huggingface.co/datasets/codelion/gemma-3-270m-icm-dpo)
 - **Method**: ICM (Internal Coherence Maximization) for label-free preference generation
-- **Training Samples**:
+- **Training Samples**: 1060
 - **Evaluation Samples**: 50
 
 ### Training Configuration
-- **Epochs**:
-- **Batch Size**:
+- **Epochs**: 3
+- **Batch Size**: 4 (per device)
 - **Gradient Accumulation**: 8 steps
-- **Effective Batch Size**:
-- **Learning Rate**:
+- **Effective Batch Size**: 32
+- **Learning Rate**: 5e-06
 - **Optimizer**: paged_adamw_8bit
 - **Memory Optimization**: BF16, Gradient Checkpointing
 
```
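The card above describes everything needed to consume the adapter. As a minimal sketch (assuming recent `transformers` and `peft`; `path/to/adapter` is a placeholder for a local checkout or the Hub id of this repo):

```python
# Minimal sketch: load the DPO-trained LoRA adapter on its base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-270m-it", torch_dtype=torch.bfloat16
)
# Placeholder path; loading also restores the saved embed_tokens/lm_head layers.
model = PeftModel.from_pretrained(base, "path/to/adapter")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}], tokenize=False, add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
```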
adapter_config.json CHANGED
```diff
@@ -28,13 +28,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "
-    "down_proj",
-    "v_proj",
+    "o_proj",
     "q_proj",
+    "v_proj",
+    "up_proj",
     "k_proj",
-    "
-    "
+    "down_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",
```
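The reordering in this hunk is likely cosmetic: `peft` holds `target_modules` internally as a set, so the serialized order carries no meaning. For reference, a sketch of a `LoraConfig` that yields this adapter layout (`r` and `lora_alpha` are illustrative assumptions; this hunk does not show them):

```python
from peft import LoraConfig

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,            # assumed; not visible in this hunk
    lora_alpha=32,   # assumed; not visible in this hunk
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    # Trained and saved as full layers, per the model card.
    modules_to_save=["embed_tokens", "lm_head"],
)
```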
adapter_model.safetensors CHANGED
```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6ea8b9b9fe8720ed96c0cc938abb9da0ec35f9fbf0f18d92681aee01e2ec47e1
 size 701497992
```
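The LFS pointer pins the adapter weights (701,497,992 bytes, i.e. the ~669 MiB quoted in the card) by SHA-256. A standard-library integrity check after download (`sha256_of` is just an illustrative helper):

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    # Stream the file in 1 MiB chunks to avoid loading ~700 MB into memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

expected = "6ea8b9b9fe8720ed96c0cc938abb9da0ec35f9fbf0f18d92681aee01e2ec47e1"
assert sha256_of("adapter_model.safetensors") == expected
```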
results.json CHANGED
```diff
@@ -1,11 +1,11 @@
 {
   "training_metrics": {
-    "train_runtime":
-    "train_samples_per_second":
-    "train_steps_per_second": 0.
+    "train_runtime": 230.3572,
+    "train_samples_per_second": 13.805,
+    "train_steps_per_second": 0.443,
     "total_flos": 0.0,
-    "train_loss": 0.
-    "epoch":
+    "train_loss": 0.6297222118751675,
+    "epoch": 3.0
   },
   "config": {
     "model_name": "google/gemma-3-270m-it",
@@ -28,23 +28,23 @@
       "lm_head"
     ],
     "max_train_samples": null,
-    "beta": 0.
+    "beta": 0.1,
     "max_length": 1024,
     "max_prompt_length": 512,
-    "batch_size":
+    "batch_size": 4,
     "gradient_accumulation_steps": 8,
-    "learning_rate":
-    "num_train_epochs":
+    "learning_rate": 5e-06,
+    "num_train_epochs": 3,
     "warmup_ratio": 0.1,
-    "weight_decay": 0.
-    "max_grad_norm": 0
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
     "gradient_checkpointing": true,
     "fp16": false,
     "bf16": true,
     "optim": "paged_adamw_8bit",
     "remove_unused_columns": false,
-    "eval_steps":
-    "save_steps":
+    "eval_steps": 50,
+    "save_steps": 100,
     "logging_steps": 10,
     "eval_strategy": "steps",
     "save_strategy": "steps",
```
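As a sanity check, the new `training_metrics` are internally consistent with the `config` block; a short calculation (assuming a single training device):

```python
# Numbers from results.json and the model card.
samples, epochs = 1060, 3
per_device_bs, grad_accum = 4, 8
effective_bs = per_device_bs * grad_accum        # 32, as stated in the card
runtime = 230.3572                               # train_runtime, seconds

total_samples = samples * epochs                 # 3180 samples seen
steps_per_epoch = -(-samples // effective_bs)    # ceil(1060 / 32) = 34
total_steps = steps_per_epoch * epochs           # 102 optimizer steps

print(total_samples / runtime)   # ~13.80, matches train_samples_per_second = 13.805
print(total_steps / runtime)     # ~0.443, matches train_steps_per_second = 0.443
```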
training_config.json CHANGED
```diff
@@ -19,23 +19,23 @@
     "lm_head"
   ],
   "max_train_samples": null,
-  "beta": 0.
+  "beta": 0.1,
   "max_length": 1024,
   "max_prompt_length": 512,
-  "batch_size":
+  "batch_size": 4,
   "gradient_accumulation_steps": 8,
-  "learning_rate":
-  "num_train_epochs":
+  "learning_rate": 5e-06,
+  "num_train_epochs": 3,
   "warmup_ratio": 0.1,
-  "weight_decay": 0.
-  "max_grad_norm": 0
+  "weight_decay": 0.01,
+  "max_grad_norm": 1.0,
   "gradient_checkpointing": true,
   "fp16": false,
   "bf16": true,
   "optim": "paged_adamw_8bit",
   "remove_unused_columns": false,
-  "eval_steps":
-  "save_steps":
+  "eval_steps": 50,
+  "save_steps": 100,
   "logging_steps": 10,
   "eval_strategy": "steps",
   "save_strategy": "steps",
```
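Finally, a sketch of the DPO run these configs describe, using recent `trl`. The model/dataset wiring and the split name are assumptions, and the actual run additionally wrapped the model with the PEFT adapter shown earlier (via `DPOTrainer`'s `peft_config` argument, omitted here):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
dataset = load_dataset("codelion/gemma-3-270m-icm-dpo")  # ICM preference pairs

args = DPOConfig(
    output_dir="gemma-3-270m-icm-dpo-out",  # placeholder
    beta=0.1,
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,          # effective batch size 32
    learning_rate=5e-6,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    gradient_checkpointing=True,
    bf16=True,
    optim="paged_adamw_8bit",
    remove_unused_columns=False,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    logging_steps=10,
)
trainer = DPOTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],  # split name assumed
    processing_class=tokenizer,
)
trainer.train()
```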