VERSIL91 commited on
Commit
1ad08e5
·
verified ·
1 Parent(s): e3b7d51

End of training

Browse files
README.md CHANGED
@@ -5,7 +5,7 @@ tags:
5
  - axolotl
6
  - generated_from_trainer
7
  model-index:
8
- - name: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
9
  results: []
10
  ---
11
 
@@ -17,16 +17,11 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  axolotl version: `0.4.1`
19
  ```yaml
20
- accelerate_config:
21
- dynamo_backend: inductor
22
- mixed_precision: bf16
23
- num_machines: 1
24
- num_processes: auto
25
- use_cpu: false
26
  adapter: lora
27
  base_model: peft-internal-testing/tiny-dummy-qwen2
28
- bf16: auto
29
  chat_template: llama3
 
30
  dataset_prepared_path: null
31
  datasets:
32
  - data_files:
@@ -44,63 +39,67 @@ datasets:
44
  debug: null
45
  deepspeed: null
46
  device_map: auto
47
- early_stopping_patience: null
 
 
48
  eval_max_new_tokens: 128
 
49
  eval_table_size: null
50
- evals_per_epoch: 4
51
- flash_attention: false
52
- fp16: null
53
  fsdp: null
54
  fsdp_config: null
55
- gradient_accumulation_steps: 16
56
  gradient_checkpointing: true
57
- group_by_length: false
58
  hub_model_id: null
59
  hub_repo: null
60
  hub_strategy: checkpoint
61
  hub_token: null
62
  learning_rate: 0.0001
 
 
63
  local_rank: null
64
- logging_steps: 1
65
- lora_alpha: 16
66
  lora_dropout: 0.05
67
  lora_fan_in_fan_out: null
68
  lora_model_dir: null
69
- lora_r: 8
70
  lora_target_linear: true
71
- lora_target_modules:
72
- - q_proj
73
- - v_proj
74
  lr_scheduler: cosine
 
75
  max_memory:
76
- 0: 70GiB
77
- max_steps: 100
78
- micro_batch_size: 2
79
  mlflow_experiment_name: /tmp/c5efe3191618858d_train_data.json
80
  model_type: AutoModelForCausalLM
81
- num_epochs: 1
 
 
 
 
82
  optimizer: adamw_bnb_8bit
83
  output_dir: miner_id_24
84
  pad_to_sequence_len: true
85
- quantization_config:
86
- llm_int8_enable_fp32_cpu_offload: true
87
- load_in_8bit: true
88
  resume_from_checkpoint: null
89
  s2_attention: null
90
  sample_packing: false
91
- saves_per_epoch: 4
92
- sequence_len: 512
 
93
  strict: false
94
- tf32: false
95
  tokenizer_type: AutoTokenizer
96
- torch_compile: true
97
  train_on_inputs: false
98
  trust_remote_code: true
99
  val_set_size: 0.05
100
- wandb_entity: null
101
  wandb_mode: online
102
  wandb_name: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
103
- wandb_project: Gradients-On-Demand
104
  wandb_run: your_name
105
  wandb_runid: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
106
  warmup_steps: 10
@@ -111,11 +110,11 @@ xformers_attention: null
111
 
112
  </details><br>
113
 
114
- # a9e7a6db-0def-40a4-a8a4-0bf2c657692a
115
 
116
  This model is a fine-tuned version of [peft-internal-testing/tiny-dummy-qwen2](https://huggingface.co/peft-internal-testing/tiny-dummy-qwen2) on the None dataset.
117
  It achieves the following results on the evaluation set:
118
- - Loss: 11.9285
119
 
120
  ## Model description
121
 
@@ -135,25 +134,24 @@ More information needed
135
 
136
  The following hyperparameters were used during training:
137
  - learning_rate: 0.0001
138
- - train_batch_size: 2
139
- - eval_batch_size: 2
140
  - seed: 42
141
- - gradient_accumulation_steps: 16
142
  - total_train_batch_size: 32
143
- - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
144
  - lr_scheduler_type: cosine
145
  - lr_scheduler_warmup_steps: 10
146
- - training_steps: 56
147
 
148
  ### Training results
149
 
150
  | Training Loss | Epoch | Step | Validation Loss |
151
  |:-------------:|:------:|:----:|:---------------:|
152
- | 11.9287 | 0.0179 | 1 | 11.9288 |
153
- | 11.9286 | 0.2503 | 14 | 11.9287 |
154
- | 11.9304 | 0.5006 | 28 | 11.9286 |
155
- | 11.9279 | 0.7508 | 42 | 11.9285 |
156
- | 13.9592 | 1.0034 | 56 | 11.9285 |
157
 
158
 
159
  ### Framework versions
 
5
  - axolotl
6
  - generated_from_trainer
7
  model-index:
8
+ - name: 20aee596-a3f3-4852-b720-d6bfbd74ec2b
9
  results: []
10
  ---
11
 
 
17
 
18
  axolotl version: `0.4.1`
19
  ```yaml
 
 
 
 
 
 
20
  adapter: lora
21
  base_model: peft-internal-testing/tiny-dummy-qwen2
22
+ bf16: true
23
  chat_template: llama3
24
+ data_processes: 16
25
  dataset_prepared_path: null
26
  datasets:
27
  - data_files:
 
39
  debug: null
40
  deepspeed: null
41
  device_map: auto
42
+ do_eval: true
43
+ early_stopping_patience: 5
44
+ eval_batch_size: 4
45
  eval_max_new_tokens: 128
46
+ eval_steps: 50
47
  eval_table_size: null
48
+ evals_per_epoch: null
49
+ flash_attention: true
50
+ fp16: false
51
  fsdp: null
52
  fsdp_config: null
53
+ gradient_accumulation_steps: 4
54
  gradient_checkpointing: true
55
+ group_by_length: true
56
  hub_model_id: null
57
  hub_repo: null
58
  hub_strategy: checkpoint
59
  hub_token: null
60
  learning_rate: 0.0001
61
+ load_in_4bit: false
62
+ load_in_8bit: false
63
  local_rank: null
64
+ logging_steps: 3
65
+ lora_alpha: 128
66
  lora_dropout: 0.05
67
  lora_fan_in_fan_out: null
68
  lora_model_dir: null
69
+ lora_r: 64
70
  lora_target_linear: true
 
 
 
71
  lr_scheduler: cosine
72
+ max_grad_norm: 1.0
73
  max_memory:
74
+ 0: 75GB
75
+ max_steps: 200
76
+ micro_batch_size: 8
77
  mlflow_experiment_name: /tmp/c5efe3191618858d_train_data.json
78
  model_type: AutoModelForCausalLM
79
+ num_epochs: 3
80
+ optim_args:
81
+ adam_beta1: 0.9
82
+ adam_beta2: 0.95
83
+ adam_epsilon: 1e-5
84
  optimizer: adamw_bnb_8bit
85
  output_dir: miner_id_24
86
  pad_to_sequence_len: true
 
 
 
87
  resume_from_checkpoint: null
88
  s2_attention: null
89
  sample_packing: false
90
+ save_steps: 50
91
+ saves_per_epoch: null
92
+ sequence_len: 1024
93
  strict: false
94
+ tf32: true
95
  tokenizer_type: AutoTokenizer
 
96
  train_on_inputs: false
97
  trust_remote_code: true
98
  val_set_size: 0.05
99
+ wandb_entity: techspear-hub
100
  wandb_mode: online
101
  wandb_name: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
102
+ wandb_project: Gradients-On-Three
103
  wandb_run: your_name
104
  wandb_runid: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
105
  warmup_steps: 10
 
110
 
111
  </details><br>
112
 
113
+ # 20aee596-a3f3-4852-b720-d6bfbd74ec2b
114
 
115
  This model is a fine-tuned version of [peft-internal-testing/tiny-dummy-qwen2](https://huggingface.co/peft-internal-testing/tiny-dummy-qwen2) on the None dataset.
116
  It achieves the following results on the evaluation set:
117
+ - Loss: 11.9103
118
 
119
  ## Model description
120
 
 
134
 
135
  The following hyperparameters were used during training:
136
  - learning_rate: 0.0001
137
+ - train_batch_size: 8
138
+ - eval_batch_size: 4
139
  - seed: 42
140
+ - gradient_accumulation_steps: 4
141
  - total_train_batch_size: 32
142
+ - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=adam_beta1=0.9,adam_beta2=0.95,adam_epsilon=1e-5
143
  - lr_scheduler_type: cosine
144
  - lr_scheduler_warmup_steps: 10
145
+ - training_steps: 168
146
 
147
  ### Training results
148
 
149
  | Training Loss | Epoch | Step | Validation Loss |
150
  |:-------------:|:------:|:----:|:---------------:|
151
+ | No log | 0.0179 | 1 | 11.9290 |
152
+ | 11.9211 | 0.8929 | 50 | 11.9188 |
153
+ | 11.9154 | 1.7857 | 100 | 11.9130 |
154
+ | 11.9095 | 2.6786 | 150 | 11.9103 |
 
155
 
156
 
157
  ### Framework versions
adapter_config.json CHANGED
@@ -10,23 +10,23 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 16,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "v_proj",
24
- "down_proj",
25
  "k_proj",
26
  "q_proj",
27
- "gate_proj",
28
- "o_proj",
29
- "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 128,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "o_proj",
24
  "v_proj",
25
+ "gate_proj",
26
  "k_proj",
27
  "q_proj",
28
+ "up_proj",
29
+ "down_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a5a6287d8823fe0879ab294b3da41caa5a430114dba79e42f0253a5f6dba7af
3
- size 21378
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23be886be29d08119ba7e77cd77202bd2500cfc5ec768b28e1017011d87d3242
3
+ size 100226
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e019a9f57ecb8b85aec8bb6eee672a122e591e36bd2565bc6c4c332b190d0fc5
3
- size 14696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70358ddbab6095a18b2d8241d01d9a1150eb787eb05c75962fee72ff2f1282b1
3
+ size 93608
last-checkpoint/adapter_config.json CHANGED
@@ -10,23 +10,23 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 16,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "v_proj",
24
- "down_proj",
25
  "k_proj",
26
  "q_proj",
27
- "gate_proj",
28
- "o_proj",
29
- "up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 128,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "o_proj",
24
  "v_proj",
25
+ "gate_proj",
26
  "k_proj",
27
  "q_proj",
28
+ "up_proj",
29
+ "down_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e019a9f57ecb8b85aec8bb6eee672a122e591e36bd2565bc6c4c332b190d0fc5
3
- size 14696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d65eca5df530a9cf1f21b7ea60a51be0bb3c1def5e92182931659838d1a3ae1
3
+ size 93608
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bdac6c649202ef2291ed5061c16538bc0530983fc22e3831ec5e6375e3ece8d5
3
- size 39398
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d385166cc29cae40ac00f90a7df30af287e10af88ef6ea0770cf9b6e721e42
3
+ size 197158
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f3d68d08c427bfcb26826cb256501e784dd1de9309d24ccf5d718c10dd6b5c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332d7236d1b47460701a6a346289bf62d000f5ae00d25856256e9d7e62fe1ee4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e41d236d9989ea458dc3f7994dccaa194aab3668a60eebd5db6cd4583a97af79
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f09a84ebe5bd3a3aeb21e49457549d7dd6ed93b0deb63e91ac3af1cc132c6c3
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,452 +1,453 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.0033519553072625,
5
- "eval_steps": 14,
6
- "global_step": 56,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.017877094972067038,
13
- "grad_norm": 0.008051837794482708,
14
- "learning_rate": 1e-05,
15
- "loss": 11.9287,
 
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.017877094972067038,
20
- "eval_loss": 11.928812026977539,
21
- "eval_runtime": 17.7636,
22
- "eval_samples_per_second": 5.348,
23
- "eval_steps_per_second": 2.702,
24
- "step": 1
25
  },
26
  {
27
- "epoch": 0.035754189944134075,
28
- "grad_norm": 0.008926309645175934,
29
- "learning_rate": 2e-05,
30
- "loss": 11.9296,
31
- "step": 2
32
  },
33
  {
34
- "epoch": 0.053631284916201116,
35
- "grad_norm": 0.00831968616694212,
36
- "learning_rate": 3e-05,
37
- "loss": 11.9287,
38
- "step": 3
39
  },
40
  {
41
- "epoch": 0.07150837988826815,
42
- "grad_norm": 0.007635825779289007,
43
- "learning_rate": 4e-05,
44
- "loss": 11.9297,
45
- "step": 4
46
  },
47
  {
48
- "epoch": 0.0893854748603352,
49
- "grad_norm": 0.006136827636510134,
50
- "learning_rate": 5e-05,
51
- "loss": 11.9308,
52
- "step": 5
53
  },
54
  {
55
- "epoch": 0.10726256983240223,
56
- "grad_norm": 0.008952487260103226,
57
- "learning_rate": 6e-05,
58
- "loss": 11.9298,
59
- "step": 6
60
  },
61
  {
62
- "epoch": 0.12513966480446928,
63
- "grad_norm": 0.008229999803006649,
64
- "learning_rate": 7e-05,
65
- "loss": 11.9316,
66
- "step": 7
67
  },
68
  {
69
- "epoch": 0.1430167597765363,
70
- "grad_norm": 0.008337481878697872,
71
- "learning_rate": 8e-05,
72
- "loss": 11.9323,
73
- "step": 8
74
  },
75
  {
76
- "epoch": 0.16089385474860335,
77
- "grad_norm": 0.0077857039868831635,
78
- "learning_rate": 9e-05,
79
- "loss": 11.9287,
80
- "step": 9
81
  },
82
  {
83
- "epoch": 0.1787709497206704,
84
- "grad_norm": 0.007779798936098814,
85
- "learning_rate": 0.0001,
86
- "loss": 11.9293,
87
- "step": 10
88
  },
89
  {
90
- "epoch": 0.19664804469273742,
91
- "grad_norm": 0.010903590358793736,
92
- "learning_rate": 9.988343845952697e-05,
93
- "loss": 11.9297,
94
- "step": 11
95
  },
96
  {
97
- "epoch": 0.21452513966480447,
98
- "grad_norm": 0.008899732492864132,
99
- "learning_rate": 9.953429730181653e-05,
100
- "loss": 11.929,
101
- "step": 12
102
  },
103
  {
104
- "epoch": 0.2324022346368715,
105
- "grad_norm": 0.008686481043696404,
106
- "learning_rate": 9.895420438411616e-05,
107
- "loss": 11.9327,
108
- "step": 13
109
  },
110
  {
111
- "epoch": 0.25027932960893856,
112
- "grad_norm": 0.008174674585461617,
113
- "learning_rate": 9.814586436738998e-05,
114
- "loss": 11.9286,
115
- "step": 14
116
  },
117
  {
118
- "epoch": 0.25027932960893856,
119
- "eval_loss": 11.928715705871582,
120
- "eval_runtime": 0.3873,
121
- "eval_samples_per_second": 245.313,
122
- "eval_steps_per_second": 123.947,
123
- "step": 14
124
  },
125
  {
126
- "epoch": 0.2681564245810056,
127
- "grad_norm": 0.008636604063212872,
128
- "learning_rate": 9.711304610594104e-05,
129
- "loss": 11.9307,
130
- "step": 15
131
  },
132
  {
133
- "epoch": 0.2860335195530726,
134
- "grad_norm": 0.009689634665846825,
135
- "learning_rate": 9.586056507527266e-05,
136
- "loss": 11.9291,
137
- "step": 16
 
138
  },
139
  {
140
- "epoch": 0.3039106145251397,
141
- "grad_norm": 0.007181845605373383,
142
- "learning_rate": 9.439426092011875e-05,
143
- "loss": 11.9316,
144
- "step": 17
145
  },
146
  {
147
- "epoch": 0.3217877094972067,
148
- "grad_norm": 0.00760689377784729,
149
- "learning_rate": 9.272097022732443e-05,
150
- "loss": 11.9297,
151
- "step": 18
152
  },
153
  {
154
- "epoch": 0.3396648044692737,
155
- "grad_norm": 0.007029213942587376,
156
- "learning_rate": 9.08484946505221e-05,
157
- "loss": 11.9308,
158
- "step": 19
159
  },
160
  {
161
- "epoch": 0.3575418994413408,
162
- "grad_norm": 0.009038039483129978,
163
- "learning_rate": 8.8785564535221e-05,
164
- "loss": 11.931,
165
- "step": 20
166
  },
167
  {
168
- "epoch": 0.3754189944134078,
169
- "grad_norm": 0.0075072660110890865,
170
- "learning_rate": 8.654179821390621e-05,
171
- "loss": 11.9313,
172
- "step": 21
173
  },
174
  {
175
- "epoch": 0.39329608938547483,
176
- "grad_norm": 0.00974891148507595,
177
- "learning_rate": 8.412765716093272e-05,
178
- "loss": 11.9313,
179
- "step": 22
180
  },
181
  {
182
- "epoch": 0.4111731843575419,
183
- "grad_norm": 0.0073928870260715485,
184
- "learning_rate": 8.155439721630264e-05,
185
- "loss": 11.9301,
186
- "step": 23
187
  },
188
  {
189
- "epoch": 0.42905027932960893,
190
- "grad_norm": 0.009019100107252598,
191
- "learning_rate": 7.883401610574336e-05,
192
- "loss": 11.9291,
193
- "step": 24
194
  },
195
  {
196
- "epoch": 0.44692737430167595,
197
- "grad_norm": 0.007918241433799267,
198
- "learning_rate": 7.597919750177168e-05,
199
- "loss": 11.9313,
200
- "step": 25
201
  },
202
  {
203
- "epoch": 0.464804469273743,
204
- "grad_norm": 0.009383410215377808,
205
- "learning_rate": 7.300325188655761e-05,
206
- "loss": 11.9298,
207
- "step": 26
208
  },
209
  {
210
- "epoch": 0.48268156424581005,
211
- "grad_norm": 0.008459771983325481,
212
- "learning_rate": 6.992005449231208e-05,
213
- "loss": 11.9309,
214
- "step": 27
215
  },
216
  {
217
- "epoch": 0.5005586592178771,
218
- "grad_norm": 0.008441867306828499,
219
- "learning_rate": 6.674398060854931e-05,
220
- "loss": 11.9304,
221
- "step": 28
222
  },
223
  {
224
- "epoch": 0.5005586592178771,
225
- "eval_loss": 11.928580284118652,
226
- "eval_runtime": 0.3886,
227
- "eval_samples_per_second": 244.488,
228
- "eval_steps_per_second": 123.531,
229
- "step": 28
230
  },
231
  {
232
- "epoch": 0.5184357541899441,
233
- "grad_norm": 0.010095364414155483,
234
- "learning_rate": 6.348983855785121e-05,
235
- "loss": 11.9277,
236
- "step": 29
237
  },
238
  {
239
- "epoch": 0.5363128491620112,
240
- "grad_norm": 0.007604570593684912,
241
- "learning_rate": 6.01728006526317e-05,
242
- "loss": 11.9298,
243
- "step": 30
244
  },
245
  {
246
- "epoch": 0.5541899441340782,
247
- "grad_norm": 0.0103254783898592,
248
- "learning_rate": 5.680833245481234e-05,
249
- "loss": 11.9283,
250
- "step": 31
251
  },
252
  {
253
- "epoch": 0.5720670391061452,
254
- "grad_norm": 0.00810755044221878,
255
- "learning_rate": 5.341212066823355e-05,
256
- "loss": 11.931,
257
- "step": 32
258
  },
259
  {
260
- "epoch": 0.5899441340782123,
261
- "grad_norm": 0.007562727201730013,
262
- "learning_rate": 5e-05,
263
- "loss": 11.9287,
264
- "step": 33
 
265
  },
266
  {
267
- "epoch": 0.6078212290502794,
268
- "grad_norm": 0.008853144943714142,
269
- "learning_rate": 4.658787933176646e-05,
270
- "loss": 11.9304,
271
- "step": 34
272
  },
273
  {
274
- "epoch": 0.6256983240223464,
275
- "grad_norm": 0.009539203718304634,
276
- "learning_rate": 4.319166754518768e-05,
277
- "loss": 11.93,
278
- "step": 35
279
  },
280
  {
281
- "epoch": 0.6435754189944134,
282
- "grad_norm": 0.009173383004963398,
283
- "learning_rate": 3.982719934736832e-05,
284
- "loss": 11.9296,
285
- "step": 36
286
  },
287
  {
288
- "epoch": 0.6614525139664804,
289
- "grad_norm": 0.008169720880687237,
290
- "learning_rate": 3.651016144214878e-05,
291
- "loss": 11.9302,
292
- "step": 37
293
  },
294
  {
295
- "epoch": 0.6793296089385474,
296
- "grad_norm": 0.008827430196106434,
297
- "learning_rate": 3.325601939145069e-05,
298
- "loss": 11.9295,
299
- "step": 38
300
  },
301
  {
302
- "epoch": 0.6972067039106146,
303
- "grad_norm": 0.010021938011050224,
304
- "learning_rate": 3.007994550768793e-05,
305
- "loss": 11.9299,
306
- "step": 39
307
  },
308
  {
309
- "epoch": 0.7150837988826816,
310
- "grad_norm": 0.010521038435399532,
311
- "learning_rate": 2.6996748113442394e-05,
312
- "loss": 11.9308,
313
- "step": 40
314
  },
315
  {
316
- "epoch": 0.7329608938547486,
317
- "grad_norm": 0.009070714004337788,
318
- "learning_rate": 2.4020802498228335e-05,
319
- "loss": 11.93,
320
- "step": 41
321
  },
322
  {
323
- "epoch": 0.7508379888268156,
324
- "grad_norm": 0.008820487186312675,
325
- "learning_rate": 2.1165983894256647e-05,
326
- "loss": 11.9279,
327
- "step": 42
328
  },
329
  {
330
- "epoch": 0.7508379888268156,
331
- "eval_loss": 11.928503036499023,
332
- "eval_runtime": 0.3804,
333
- "eval_samples_per_second": 249.752,
334
- "eval_steps_per_second": 126.19,
335
- "step": 42
336
  },
337
  {
338
- "epoch": 0.7687150837988826,
339
- "grad_norm": 0.010876229964196682,
340
- "learning_rate": 1.8445602783697374e-05,
341
- "loss": 11.9305,
342
- "step": 43
343
  },
344
  {
345
- "epoch": 0.7865921787709497,
346
- "grad_norm": 0.008084769360721111,
347
- "learning_rate": 1.5872342839067306e-05,
348
- "loss": 11.9299,
349
- "step": 44
350
  },
351
  {
352
- "epoch": 0.8044692737430168,
353
- "grad_norm": 0.009019813500344753,
354
- "learning_rate": 1.3458201786093794e-05,
355
- "loss": 11.9283,
356
- "step": 45
357
  },
358
  {
359
- "epoch": 0.8223463687150838,
360
- "grad_norm": 0.008098295889794827,
361
- "learning_rate": 1.1214435464779006e-05,
362
- "loss": 11.9292,
363
- "step": 46
364
  },
365
  {
366
- "epoch": 0.8402234636871508,
367
- "grad_norm": 0.008133570663630962,
368
- "learning_rate": 9.151505349477902e-06,
369
- "loss": 11.9289,
370
- "step": 47
371
  },
372
  {
373
- "epoch": 0.8581005586592179,
374
- "grad_norm": 0.012650455348193645,
375
- "learning_rate": 7.2790297726755716e-06,
376
- "loss": 11.9302,
377
- "step": 48
378
  },
379
  {
380
- "epoch": 0.8759776536312849,
381
- "grad_norm": 0.009691119194030762,
382
- "learning_rate": 5.605739079881239e-06,
383
- "loss": 11.9307,
384
- "step": 49
385
  },
386
  {
387
- "epoch": 0.8938547486033519,
388
- "grad_norm": 0.009178046137094498,
389
- "learning_rate": 4.139434924727359e-06,
390
- "loss": 11.9297,
391
- "step": 50
 
392
  },
393
  {
394
- "epoch": 0.911731843575419,
395
- "grad_norm": 0.008271483704447746,
396
- "learning_rate": 2.88695389405898e-06,
397
- "loss": 11.9291,
398
- "step": 51
399
  },
400
  {
401
- "epoch": 0.929608938547486,
402
- "grad_norm": 0.00795311015099287,
403
- "learning_rate": 1.8541356326100433e-06,
404
- "loss": 11.9286,
405
- "step": 52
406
  },
407
  {
408
- "epoch": 0.9474860335195531,
409
- "grad_norm": 0.008445663377642632,
410
- "learning_rate": 1.0457956158838544e-06,
411
- "loss": 11.932,
412
- "step": 53
413
  },
414
  {
415
- "epoch": 0.9653631284916201,
416
- "grad_norm": 0.009014743380248547,
417
- "learning_rate": 4.6570269818346224e-07,
418
- "loss": 11.9283,
419
- "step": 54
420
  },
421
  {
422
- "epoch": 0.9832402234636871,
423
- "grad_norm": 0.009628918021917343,
424
- "learning_rate": 1.1656154047303691e-07,
425
- "loss": 11.9317,
426
- "step": 55
427
  },
428
  {
429
- "epoch": 1.0033519553072625,
430
- "grad_norm": 0.00985956471413374,
431
  "learning_rate": 0.0,
432
- "loss": 13.9592,
433
- "step": 56
434
- },
435
- {
436
- "epoch": 1.0033519553072625,
437
- "eval_loss": 11.928487777709961,
438
- "eval_runtime": 0.3738,
439
- "eval_samples_per_second": 254.136,
440
- "eval_steps_per_second": 128.405,
441
- "step": 56
442
  }
443
  ],
444
- "logging_steps": 1,
445
- "max_steps": 56,
446
  "num_input_tokens_seen": 0,
447
- "num_train_epochs": 2,
448
- "save_steps": 14,
449
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
450
  "TrainerControl": {
451
  "args": {
452
  "should_epoch_stop": false,
@@ -458,8 +459,8 @@
458
  "attributes": {}
459
  }
460
  },
461
- "total_flos": 26468155392.0,
462
- "train_batch_size": 2,
463
  "trial_name": null,
464
  "trial_params": null
465
  }
 
1
  {
2
+ "best_metric": 11.9102783203125,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-150",
4
+ "epoch": 3.0,
5
+ "eval_steps": 50,
6
+ "global_step": 168,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.017857142857142856,
13
+ "eval_loss": 11.9290132522583,
14
+ "eval_runtime": 0.5421,
15
+ "eval_samples_per_second": 175.25,
16
+ "eval_steps_per_second": 44.274,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.05357142857142857,
21
+ "grad_norm": 0.019959961995482445,
22
+ "learning_rate": 3e-05,
23
+ "loss": 11.9303,
24
+ "step": 3
 
25
  },
26
  {
27
+ "epoch": 0.10714285714285714,
28
+ "grad_norm": 0.022794105112552643,
29
+ "learning_rate": 6e-05,
30
+ "loss": 11.9305,
31
+ "step": 6
32
  },
33
  {
34
+ "epoch": 0.16071428571428573,
35
+ "grad_norm": 0.0353395938873291,
36
+ "learning_rate": 9e-05,
37
+ "loss": 11.9295,
38
+ "step": 9
39
  },
40
  {
41
+ "epoch": 0.21428571428571427,
42
+ "grad_norm": 0.03792329132556915,
43
+ "learning_rate": 9.996046986136509e-05,
44
+ "loss": 11.93,
45
+ "step": 12
46
  },
47
  {
48
+ "epoch": 0.26785714285714285,
49
+ "grad_norm": 0.03241180628538132,
50
+ "learning_rate": 9.975310752612137e-05,
51
+ "loss": 11.9276,
52
+ "step": 15
53
  },
54
  {
55
+ "epoch": 0.32142857142857145,
56
+ "grad_norm": 0.05070869252085686,
57
+ "learning_rate": 9.936876709681668e-05,
58
+ "loss": 11.9291,
59
+ "step": 18
60
  },
61
  {
62
+ "epoch": 0.375,
63
+ "grad_norm": 0.05852275714278221,
64
+ "learning_rate": 9.880881572095256e-05,
65
+ "loss": 11.9291,
66
+ "step": 21
67
  },
68
  {
69
+ "epoch": 0.42857142857142855,
70
+ "grad_norm": 0.08124187588691711,
71
+ "learning_rate": 9.807524521637102e-05,
72
+ "loss": 11.9278,
73
+ "step": 24
74
  },
75
  {
76
+ "epoch": 0.48214285714285715,
77
+ "grad_norm": 0.05361940711736679,
78
+ "learning_rate": 9.717066498610673e-05,
79
+ "loss": 11.9256,
80
+ "step": 27
81
  },
82
  {
83
+ "epoch": 0.5357142857142857,
84
+ "grad_norm": 0.0750463530421257,
85
+ "learning_rate": 9.609829273641034e-05,
86
+ "loss": 11.9261,
87
+ "step": 30
88
  },
89
  {
90
+ "epoch": 0.5892857142857143,
91
+ "grad_norm": 0.08543704450130463,
92
+ "learning_rate": 9.486194303096062e-05,
93
+ "loss": 11.9248,
94
+ "step": 33
95
  },
96
  {
97
+ "epoch": 0.6428571428571429,
98
+ "grad_norm": 0.09016852080821991,
99
+ "learning_rate": 9.346601372197914e-05,
100
+ "loss": 11.9243,
101
+ "step": 36
102
  },
103
  {
104
+ "epoch": 0.6964285714285714,
105
+ "grad_norm": 0.10736904293298721,
106
+ "learning_rate": 9.191547030651383e-05,
107
+ "loss": 11.9206,
108
+ "step": 39
109
  },
110
  {
111
+ "epoch": 0.75,
112
+ "grad_norm": 0.0793285071849823,
113
+ "learning_rate": 9.021582826353824e-05,
114
+ "loss": 11.9232,
115
+ "step": 42
116
  },
117
  {
118
+ "epoch": 0.8035714285714286,
119
+ "grad_norm": 0.05057210102677345,
120
+ "learning_rate": 8.83731334346954e-05,
121
+ "loss": 11.9215,
122
+ "step": 45
 
123
  },
124
  {
125
+ "epoch": 0.8571428571428571,
126
+ "grad_norm": 0.05617088824510574,
127
+ "learning_rate": 8.639394051847472e-05,
128
+ "loss": 11.9211,
129
+ "step": 48
130
  },
131
  {
132
+ "epoch": 0.8928571428571429,
133
+ "eval_loss": 11.918818473815918,
134
+ "eval_runtime": 0.5369,
135
+ "eval_samples_per_second": 176.942,
136
+ "eval_steps_per_second": 44.701,
137
+ "step": 50
138
  },
139
  {
140
+ "epoch": 0.9107142857142857,
141
+ "grad_norm": 0.06780627369880676,
142
+ "learning_rate": 8.428528975432066e-05,
143
+ "loss": 11.9193,
144
+ "step": 51
145
  },
146
  {
147
+ "epoch": 0.9642857142857143,
148
+ "grad_norm": 0.03634953871369362,
149
+ "learning_rate": 8.2054681879611e-05,
150
+ "loss": 11.9178,
151
+ "step": 54
152
  },
153
  {
154
+ "epoch": 1.0178571428571428,
155
+ "grad_norm": 0.04605906456708908,
156
+ "learning_rate": 7.971005144858553e-05,
157
+ "loss": 11.9182,
158
+ "step": 57
159
  },
160
  {
161
+ "epoch": 1.0714285714285714,
162
+ "grad_norm": 0.025876272469758987,
163
+ "learning_rate": 7.725973860813338e-05,
164
+ "loss": 11.9199,
165
+ "step": 60
166
  },
167
  {
168
+ "epoch": 1.125,
169
+ "grad_norm": 0.04855341464281082,
170
+ "learning_rate": 7.471245943083615e-05,
171
+ "loss": 11.9185,
172
+ "step": 63
173
  },
174
  {
175
+ "epoch": 1.1785714285714286,
176
+ "grad_norm": 0.03910359740257263,
177
+ "learning_rate": 7.20772749107956e-05,
178
+ "loss": 11.9184,
179
+ "step": 66
180
  },
181
  {
182
+ "epoch": 1.2321428571428572,
183
+ "grad_norm": 0.08992303162813187,
184
+ "learning_rate": 6.936355873253206e-05,
185
+ "loss": 11.9154,
186
+ "step": 69
187
  },
188
  {
189
+ "epoch": 1.2857142857142856,
190
+ "grad_norm": 0.04321262612938881,
191
+ "learning_rate": 6.65809639276034e-05,
192
+ "loss": 11.919,
193
+ "step": 72
194
  },
195
  {
196
+ "epoch": 1.3392857142857144,
197
+ "grad_norm": 0.049111876636743546,
198
+ "learning_rate": 6.373938853755126e-05,
199
+ "loss": 11.9185,
200
+ "step": 75
201
  },
202
  {
203
+ "epoch": 1.3928571428571428,
204
+ "grad_norm": 0.0660889744758606,
205
+ "learning_rate": 6.08489404053159e-05,
206
+ "loss": 11.9156,
207
+ "step": 78
208
  },
209
  {
210
+ "epoch": 1.4464285714285714,
211
+ "grad_norm": 0.0920424684882164,
212
+ "learning_rate": 5.791990122036075e-05,
213
+ "loss": 11.9151,
214
+ "step": 81
215
  },
216
  {
217
+ "epoch": 1.5,
218
+ "grad_norm": 0.04646582156419754,
219
+ "learning_rate": 5.496268994540309e-05,
220
+ "loss": 11.9162,
221
+ "step": 84
222
  },
223
  {
224
+ "epoch": 1.5535714285714286,
225
+ "grad_norm": 0.053827133029699326,
226
+ "learning_rate": 5.19878257548463e-05,
227
+ "loss": 11.9172,
228
+ "step": 87
 
229
  },
230
  {
231
+ "epoch": 1.6071428571428572,
232
+ "grad_norm": 0.04865885153412819,
233
+ "learning_rate": 4.900589061674649e-05,
234
+ "loss": 11.9165,
235
+ "step": 90
236
  },
237
  {
238
+ "epoch": 1.6607142857142856,
239
+ "grad_norm": 0.0875491127371788,
240
+ "learning_rate": 4.602749165141428e-05,
241
+ "loss": 11.9132,
242
+ "step": 93
243
  },
244
  {
245
+ "epoch": 1.7142857142857144,
246
+ "grad_norm": 0.04339161515235901,
247
+ "learning_rate": 4.3063223400546594e-05,
248
+ "loss": 11.9123,
249
+ "step": 96
250
  },
251
  {
252
+ "epoch": 1.7678571428571428,
253
+ "grad_norm": 0.04742836579680443,
254
+ "learning_rate": 4.012363014110237e-05,
255
+ "loss": 11.9154,
256
+ "step": 99
257
  },
258
  {
259
+ "epoch": 1.7857142857142856,
260
+ "eval_loss": 11.913043975830078,
261
+ "eval_runtime": 0.5398,
262
+ "eval_samples_per_second": 175.99,
263
+ "eval_steps_per_second": 44.461,
264
+ "step": 100
265
  },
266
  {
267
+ "epoch": 1.8214285714285714,
268
+ "grad_norm": 0.04398871585726738,
269
+ "learning_rate": 3.721916837797627e-05,
270
+ "loss": 11.9137,
271
+ "step": 102
272
  },
273
  {
274
+ "epoch": 1.875,
275
+ "grad_norm": 0.05781185254454613,
276
+ "learning_rate": 3.436016964888865e-05,
277
+ "loss": 11.9125,
278
+ "step": 105
279
  },
280
  {
281
+ "epoch": 1.9285714285714286,
282
+ "grad_norm": 0.08808522671461105,
283
+ "learning_rate": 3.1556803773799614e-05,
284
+ "loss": 11.9077,
285
+ "step": 108
286
  },
287
  {
288
+ "epoch": 1.9821428571428572,
289
+ "grad_norm": 0.06021308898925781,
290
+ "learning_rate": 2.8819042679573617e-05,
291
+ "loss": 11.9141,
292
+ "step": 111
293
  },
294
  {
295
+ "epoch": 2.0357142857142856,
296
+ "grad_norm": 0.048566147685050964,
297
+ "learning_rate": 2.6156624928574707e-05,
298
+ "loss": 11.9136,
299
+ "step": 114
300
  },
301
  {
302
+ "epoch": 2.0892857142857144,
303
+ "grad_norm": 0.04223039001226425,
304
+ "learning_rate": 2.3579021077369046e-05,
305
+ "loss": 11.9142,
306
+ "step": 117
307
  },
308
  {
309
+ "epoch": 2.142857142857143,
310
+ "grad_norm": 0.04601627215743065,
311
+ "learning_rate": 2.1095399988757574e-05,
312
+ "loss": 11.9133,
313
+ "step": 120
314
  },
315
  {
316
+ "epoch": 2.1964285714285716,
317
+ "grad_norm": 0.05840318650007248,
318
+ "learning_rate": 1.8714596216972007e-05,
319
+ "loss": 11.9112,
320
+ "step": 123
321
  },
322
  {
323
+ "epoch": 2.25,
324
+ "grad_norm": 0.035777896642684937,
325
+ "learning_rate": 1.6445078582048155e-05,
326
+ "loss": 11.9091,
327
+ "step": 126
328
  },
329
  {
330
+ "epoch": 2.3035714285714284,
331
+ "grad_norm": 0.037539299577474594,
332
+ "learning_rate": 1.4294920045162513e-05,
333
+ "loss": 11.9146,
334
+ "step": 129
 
335
  },
336
  {
337
+ "epoch": 2.357142857142857,
338
+ "grad_norm": 0.042784880846738815,
339
+ "learning_rate": 1.2271768992088489e-05,
340
+ "loss": 11.9117,
341
+ "step": 132
342
  },
343
  {
344
+ "epoch": 2.4107142857142856,
345
+ "grad_norm": 0.05495860055088997,
346
+ "learning_rate": 1.038282202692129e-05,
347
+ "loss": 11.9109,
348
+ "step": 135
349
  },
350
  {
351
+ "epoch": 2.4642857142857144,
352
+ "grad_norm": 0.07855169475078583,
353
+ "learning_rate": 8.634798372847148e-06,
354
+ "loss": 11.9069,
355
+ "step": 138
356
  },
357
  {
358
+ "epoch": 2.517857142857143,
359
+ "grad_norm": 0.03630689159035683,
360
+ "learning_rate": 7.033915971016952e-06,
361
+ "loss": 11.9139,
362
+ "step": 141
363
  },
364
  {
365
+ "epoch": 2.571428571428571,
366
+ "grad_norm": 0.048760075122117996,
367
+ "learning_rate": 5.585869362543416e-06,
368
+ "loss": 11.9124,
369
+ "step": 144
370
  },
371
  {
372
+ "epoch": 2.625,
373
+ "grad_norm": 0.04377627745270729,
374
+ "learning_rate": 4.29580943229827e-06,
375
+ "loss": 11.9118,
376
+ "step": 147
377
  },
378
  {
379
+ "epoch": 2.678571428571429,
380
+ "grad_norm": 0.05847623199224472,
381
+ "learning_rate": 3.1683250865636114e-06,
382
+ "loss": 11.9095,
383
+ "step": 150
384
  },
385
  {
386
+ "epoch": 2.678571428571429,
387
+ "eval_loss": 11.9102783203125,
388
+ "eval_runtime": 0.5392,
389
+ "eval_samples_per_second": 176.177,
390
+ "eval_steps_per_second": 44.508,
391
+ "step": 150
392
  },
393
  {
394
+ "epoch": 2.732142857142857,
395
+ "grad_norm": 0.034065768122673035,
396
+ "learning_rate": 2.2074269297119587e-06,
397
+ "loss": 11.9088,
398
+ "step": 153
399
  },
400
  {
401
+ "epoch": 2.7857142857142856,
402
+ "grad_norm": 0.04465312138199806,
403
+ "learning_rate": 1.4165329979794973e-06,
404
+ "loss": 11.9133,
405
+ "step": 156
406
  },
407
  {
408
+ "epoch": 2.8392857142857144,
409
+ "grad_norm": 0.043062131851911545,
410
+ "learning_rate": 7.984566010789674e-07,
411
+ "loss": 11.9122,
412
+ "step": 159
413
  },
414
  {
415
+ "epoch": 2.892857142857143,
416
+ "grad_norm": 0.05254960432648659,
417
+ "learning_rate": 3.553963149013295e-07,
418
+ "loss": 11.9094,
419
+ "step": 162
420
  },
421
  {
422
+ "epoch": 2.946428571428571,
423
+ "grad_norm": 0.032099399715662,
424
+ "learning_rate": 8.892816090335099e-08,
425
+ "loss": 11.9082,
426
+ "step": 165
427
  },
428
  {
429
+ "epoch": 3.0,
430
+ "grad_norm": 0.09737348556518555,
431
  "learning_rate": 0.0,
432
+ "loss": 11.908,
433
+ "step": 168
 
 
 
 
 
 
 
 
434
  }
435
  ],
436
+ "logging_steps": 3,
437
+ "max_steps": 168,
438
  "num_input_tokens_seen": 0,
439
+ "num_train_epochs": 3,
440
+ "save_steps": 50,
441
  "stateful_callbacks": {
442
+ "EarlyStoppingCallback": {
443
+ "args": {
444
+ "early_stopping_patience": 5,
445
+ "early_stopping_threshold": 0.0
446
+ },
447
+ "attributes": {
448
+ "early_stopping_patience_counter": 0
449
+ }
450
+ },
451
  "TrainerControl": {
452
  "args": {
453
  "should_epoch_stop": false,
 
459
  "attributes": {}
460
  }
461
  },
462
+ "total_flos": 808543272960.0,
463
+ "train_batch_size": 8,
464
  "trial_name": null,
465
  "trial_params": null
466
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b493acda86013cdad96ddc1c8b15ecd048f77916680ceff7b36913b4abf1f138
3
- size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e57d43ac703b2028bdad9e5663a68ee010037b928221f2ef946ca8bb5c20a4
3
+ size 6840
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b493acda86013cdad96ddc1c8b15ecd048f77916680ceff7b36913b4abf1f138
3
- size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e57d43ac703b2028bdad9e5663a68ee010037b928221f2ef946ca8bb5c20a4
3
+ size 6840