End of training
Browse files- README.md +43 -45
- adapter_config.json +6 -6
- adapter_model.bin +2 -2
- adapter_model.safetensors +2 -2
- last-checkpoint/adapter_config.json +6 -6
- last-checkpoint/adapter_model.safetensors +2 -2
- last-checkpoint/optimizer.pt +2 -2
- last-checkpoint/rng_state.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +322 -321
- last-checkpoint/training_args.bin +2 -2
- training_args.bin +2 -2
README.md
CHANGED
@@ -5,7 +5,7 @@ tags:
|
|
5 |
- axolotl
|
6 |
- generated_from_trainer
|
7 |
model-index:
|
8 |
-
- name:
|
9 |
results: []
|
10 |
---
|
11 |
|
@@ -17,16 +17,11 @@ should probably proofread and complete it, then remove this comment. -->
|
|
17 |
|
18 |
axolotl version: `0.4.1`
|
19 |
```yaml
|
20 |
-
accelerate_config:
|
21 |
-
dynamo_backend: inductor
|
22 |
-
mixed_precision: bf16
|
23 |
-
num_machines: 1
|
24 |
-
num_processes: auto
|
25 |
-
use_cpu: false
|
26 |
adapter: lora
|
27 |
base_model: peft-internal-testing/tiny-dummy-qwen2
|
28 |
-
bf16:
|
29 |
chat_template: llama3
|
|
|
30 |
dataset_prepared_path: null
|
31 |
datasets:
|
32 |
- data_files:
|
@@ -44,63 +39,67 @@ datasets:
|
|
44 |
debug: null
|
45 |
deepspeed: null
|
46 |
device_map: auto
|
47 |
-
|
|
|
|
|
48 |
eval_max_new_tokens: 128
|
|
|
49 |
eval_table_size: null
|
50 |
-
evals_per_epoch:
|
51 |
-
flash_attention:
|
52 |
-
fp16:
|
53 |
fsdp: null
|
54 |
fsdp_config: null
|
55 |
-
gradient_accumulation_steps:
|
56 |
gradient_checkpointing: true
|
57 |
-
group_by_length:
|
58 |
hub_model_id: null
|
59 |
hub_repo: null
|
60 |
hub_strategy: checkpoint
|
61 |
hub_token: null
|
62 |
learning_rate: 0.0001
|
|
|
|
|
63 |
local_rank: null
|
64 |
-
logging_steps:
|
65 |
-
lora_alpha:
|
66 |
lora_dropout: 0.05
|
67 |
lora_fan_in_fan_out: null
|
68 |
lora_model_dir: null
|
69 |
-
lora_r:
|
70 |
lora_target_linear: true
|
71 |
-
lora_target_modules:
|
72 |
-
- q_proj
|
73 |
-
- v_proj
|
74 |
lr_scheduler: cosine
|
|
|
75 |
max_memory:
|
76 |
-
0:
|
77 |
-
max_steps:
|
78 |
-
micro_batch_size:
|
79 |
mlflow_experiment_name: /tmp/c5efe3191618858d_train_data.json
|
80 |
model_type: AutoModelForCausalLM
|
81 |
-
num_epochs:
|
|
|
|
|
|
|
|
|
82 |
optimizer: adamw_bnb_8bit
|
83 |
output_dir: miner_id_24
|
84 |
pad_to_sequence_len: true
|
85 |
-
quantization_config:
|
86 |
-
llm_int8_enable_fp32_cpu_offload: true
|
87 |
-
load_in_8bit: true
|
88 |
resume_from_checkpoint: null
|
89 |
s2_attention: null
|
90 |
sample_packing: false
|
91 |
-
|
92 |
-
|
|
|
93 |
strict: false
|
94 |
-
tf32:
|
95 |
tokenizer_type: AutoTokenizer
|
96 |
-
torch_compile: true
|
97 |
train_on_inputs: false
|
98 |
trust_remote_code: true
|
99 |
val_set_size: 0.05
|
100 |
-
wandb_entity:
|
101 |
wandb_mode: online
|
102 |
wandb_name: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
|
103 |
-
wandb_project: Gradients-On-
|
104 |
wandb_run: your_name
|
105 |
wandb_runid: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
|
106 |
warmup_steps: 10
|
@@ -111,11 +110,11 @@ xformers_attention: null
|
|
111 |
|
112 |
</details><br>
|
113 |
|
114 |
-
#
|
115 |
|
116 |
This model is a fine-tuned version of [peft-internal-testing/tiny-dummy-qwen2](https://huggingface.co/peft-internal-testing/tiny-dummy-qwen2) on the None dataset.
|
117 |
It achieves the following results on the evaluation set:
|
118 |
-
- Loss: 11.
|
119 |
|
120 |
## Model description
|
121 |
|
@@ -135,25 +134,24 @@ More information needed
|
|
135 |
|
136 |
The following hyperparameters were used during training:
|
137 |
- learning_rate: 0.0001
|
138 |
-
- train_batch_size:
|
139 |
-
- eval_batch_size:
|
140 |
- seed: 42
|
141 |
-
- gradient_accumulation_steps:
|
142 |
- total_train_batch_size: 32
|
143 |
-
- optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=
|
144 |
- lr_scheduler_type: cosine
|
145 |
- lr_scheduler_warmup_steps: 10
|
146 |
-
- training_steps:
|
147 |
|
148 |
### Training results
|
149 |
|
150 |
| Training Loss | Epoch | Step | Validation Loss |
|
151 |
|:-------------:|:------:|:----:|:---------------:|
|
152 |
-
|
|
153 |
-
| 11.
|
154 |
-
| 11.
|
155 |
-
| 11.
|
156 |
-
| 13.9592 | 1.0034 | 56 | 11.9285 |
|
157 |
|
158 |
|
159 |
### Framework versions
|
|
|
5 |
- axolotl
|
6 |
- generated_from_trainer
|
7 |
model-index:
|
8 |
+
- name: 20aee596-a3f3-4852-b720-d6bfbd74ec2b
|
9 |
results: []
|
10 |
---
|
11 |
|
|
|
17 |
|
18 |
axolotl version: `0.4.1`
|
19 |
```yaml
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
adapter: lora
|
21 |
base_model: peft-internal-testing/tiny-dummy-qwen2
|
22 |
+
bf16: true
|
23 |
chat_template: llama3
|
24 |
+
data_processes: 16
|
25 |
dataset_prepared_path: null
|
26 |
datasets:
|
27 |
- data_files:
|
|
|
39 |
debug: null
|
40 |
deepspeed: null
|
41 |
device_map: auto
|
42 |
+
do_eval: true
|
43 |
+
early_stopping_patience: 5
|
44 |
+
eval_batch_size: 4
|
45 |
eval_max_new_tokens: 128
|
46 |
+
eval_steps: 50
|
47 |
eval_table_size: null
|
48 |
+
evals_per_epoch: null
|
49 |
+
flash_attention: true
|
50 |
+
fp16: false
|
51 |
fsdp: null
|
52 |
fsdp_config: null
|
53 |
+
gradient_accumulation_steps: 4
|
54 |
gradient_checkpointing: true
|
55 |
+
group_by_length: true
|
56 |
hub_model_id: null
|
57 |
hub_repo: null
|
58 |
hub_strategy: checkpoint
|
59 |
hub_token: null
|
60 |
learning_rate: 0.0001
|
61 |
+
load_in_4bit: false
|
62 |
+
load_in_8bit: false
|
63 |
local_rank: null
|
64 |
+
logging_steps: 3
|
65 |
+
lora_alpha: 128
|
66 |
lora_dropout: 0.05
|
67 |
lora_fan_in_fan_out: null
|
68 |
lora_model_dir: null
|
69 |
+
lora_r: 64
|
70 |
lora_target_linear: true
|
|
|
|
|
|
|
71 |
lr_scheduler: cosine
|
72 |
+
max_grad_norm: 1.0
|
73 |
max_memory:
|
74 |
+
0: 75GB
|
75 |
+
max_steps: 200
|
76 |
+
micro_batch_size: 8
|
77 |
mlflow_experiment_name: /tmp/c5efe3191618858d_train_data.json
|
78 |
model_type: AutoModelForCausalLM
|
79 |
+
num_epochs: 3
|
80 |
+
optim_args:
|
81 |
+
adam_beta1: 0.9
|
82 |
+
adam_beta2: 0.95
|
83 |
+
adam_epsilon: 1e-5
|
84 |
optimizer: adamw_bnb_8bit
|
85 |
output_dir: miner_id_24
|
86 |
pad_to_sequence_len: true
|
|
|
|
|
|
|
87 |
resume_from_checkpoint: null
|
88 |
s2_attention: null
|
89 |
sample_packing: false
|
90 |
+
save_steps: 50
|
91 |
+
saves_per_epoch: null
|
92 |
+
sequence_len: 1024
|
93 |
strict: false
|
94 |
+
tf32: true
|
95 |
tokenizer_type: AutoTokenizer
|
|
|
96 |
train_on_inputs: false
|
97 |
trust_remote_code: true
|
98 |
val_set_size: 0.05
|
99 |
+
wandb_entity: techspear-hub
|
100 |
wandb_mode: online
|
101 |
wandb_name: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
|
102 |
+
wandb_project: Gradients-On-Three
|
103 |
wandb_run: your_name
|
104 |
wandb_runid: a9e7a6db-0def-40a4-a8a4-0bf2c657692a
|
105 |
warmup_steps: 10
|
|
|
110 |
|
111 |
</details><br>
|
112 |
|
113 |
+
# 20aee596-a3f3-4852-b720-d6bfbd74ec2b
|
114 |
|
115 |
This model is a fine-tuned version of [peft-internal-testing/tiny-dummy-qwen2](https://huggingface.co/peft-internal-testing/tiny-dummy-qwen2) on the None dataset.
|
116 |
It achieves the following results on the evaluation set:
|
117 |
+
- Loss: 11.9103
|
118 |
|
119 |
## Model description
|
120 |
|
|
|
134 |
|
135 |
The following hyperparameters were used during training:
|
136 |
- learning_rate: 0.0001
|
137 |
+
- train_batch_size: 8
|
138 |
+
- eval_batch_size: 4
|
139 |
- seed: 42
|
140 |
+
- gradient_accumulation_steps: 4
|
141 |
- total_train_batch_size: 32
|
142 |
+
- optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=adam_beta1=0.9,adam_beta2=0.95,adam_epsilon=1e-5
|
143 |
- lr_scheduler_type: cosine
|
144 |
- lr_scheduler_warmup_steps: 10
|
145 |
+
- training_steps: 168
|
146 |
|
147 |
### Training results
|
148 |
|
149 |
| Training Loss | Epoch | Step | Validation Loss |
|
150 |
|:-------------:|:------:|:----:|:---------------:|
|
151 |
+
| No log | 0.0179 | 1 | 11.9290 |
|
152 |
+
| 11.9211 | 0.8929 | 50 | 11.9188 |
|
153 |
+
| 11.9154 | 1.7857 | 100 | 11.9130 |
|
154 |
+
| 11.9095 | 2.6786 | 150 | 11.9103 |
|
|
|
155 |
|
156 |
|
157 |
### Framework versions
|
adapter_config.json
CHANGED
@@ -10,23 +10,23 @@
|
|
10 |
"layers_pattern": null,
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
-
"lora_alpha":
|
14 |
"lora_dropout": 0.05,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
18 |
"peft_type": "LORA",
|
19 |
-
"r":
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"v_proj",
|
24 |
-
"
|
25 |
"k_proj",
|
26 |
"q_proj",
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"up_proj"
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
|
|
10 |
"layers_pattern": null,
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
+
"lora_alpha": 128,
|
14 |
"lora_dropout": 0.05,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
18 |
"peft_type": "LORA",
|
19 |
+
"r": 64,
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
"v_proj",
|
25 |
+
"gate_proj",
|
26 |
"k_proj",
|
27 |
"q_proj",
|
28 |
+
"up_proj",
|
29 |
+
"down_proj"
|
|
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
adapter_model.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:23be886be29d08119ba7e77cd77202bd2500cfc5ec768b28e1017011d87d3242
|
3 |
+
size 100226
|
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70358ddbab6095a18b2d8241d01d9a1150eb787eb05c75962fee72ff2f1282b1
|
3 |
+
size 93608
|
last-checkpoint/adapter_config.json
CHANGED
@@ -10,23 +10,23 @@
|
|
10 |
"layers_pattern": null,
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
-
"lora_alpha":
|
14 |
"lora_dropout": 0.05,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
18 |
"peft_type": "LORA",
|
19 |
-
"r":
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
|
|
23 |
"v_proj",
|
24 |
-
"
|
25 |
"k_proj",
|
26 |
"q_proj",
|
27 |
-
"
|
28 |
-
"
|
29 |
-
"up_proj"
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
|
|
10 |
"layers_pattern": null,
|
11 |
"layers_to_transform": null,
|
12 |
"loftq_config": {},
|
13 |
+
"lora_alpha": 128,
|
14 |
"lora_dropout": 0.05,
|
15 |
"megatron_config": null,
|
16 |
"megatron_core": "megatron.core",
|
17 |
"modules_to_save": null,
|
18 |
"peft_type": "LORA",
|
19 |
+
"r": 64,
|
20 |
"rank_pattern": {},
|
21 |
"revision": null,
|
22 |
"target_modules": [
|
23 |
+
"o_proj",
|
24 |
"v_proj",
|
25 |
+
"gate_proj",
|
26 |
"k_proj",
|
27 |
"q_proj",
|
28 |
+
"up_proj",
|
29 |
+
"down_proj"
|
|
|
30 |
],
|
31 |
"task_type": "CAUSAL_LM",
|
32 |
"use_dora": false,
|
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d65eca5df530a9cf1f21b7ea60a51be0bb3c1def5e92182931659838d1a3ae1
|
3 |
+
size 93608
|
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:92d385166cc29cae40ac00f90a7df30af287e10af88ef6ea0770cf9b6e721e42
|
3 |
+
size 197158
|
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 14244
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:332d7236d1b47460701a6a346289bf62d000f5ae00d25856256e9d7e62fe1ee4
|
3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f09a84ebe5bd3a3aeb21e49457549d7dd6ed93b0deb63e91ac3af1cc132c6c3
|
3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
@@ -1,452 +1,453 @@
|
|
1 |
{
|
2 |
-
"best_metric":
|
3 |
-
"best_model_checkpoint":
|
4 |
-
"epoch":
|
5 |
-
"eval_steps":
|
6 |
-
"global_step":
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
-
"epoch": 0.
|
13 |
-
"
|
14 |
-
"
|
15 |
-
"
|
|
|
16 |
"step": 1
|
17 |
},
|
18 |
{
|
19 |
-
"epoch": 0.
|
20 |
-
"
|
21 |
-
"
|
22 |
-
"
|
23 |
-
"
|
24 |
-
"step": 1
|
25 |
},
|
26 |
{
|
27 |
-
"epoch": 0.
|
28 |
-
"grad_norm": 0.
|
29 |
-
"learning_rate":
|
30 |
-
"loss": 11.
|
31 |
-
"step":
|
32 |
},
|
33 |
{
|
34 |
-
"epoch": 0.
|
35 |
-
"grad_norm": 0.
|
36 |
-
"learning_rate":
|
37 |
-
"loss": 11.
|
38 |
-
"step":
|
39 |
},
|
40 |
{
|
41 |
-
"epoch": 0.
|
42 |
-
"grad_norm": 0.
|
43 |
-
"learning_rate":
|
44 |
-
"loss": 11.
|
45 |
-
"step":
|
46 |
},
|
47 |
{
|
48 |
-
"epoch": 0.
|
49 |
-
"grad_norm": 0.
|
50 |
-
"learning_rate":
|
51 |
-
"loss": 11.
|
52 |
-
"step":
|
53 |
},
|
54 |
{
|
55 |
-
"epoch": 0.
|
56 |
-
"grad_norm": 0.
|
57 |
-
"learning_rate":
|
58 |
-
"loss": 11.
|
59 |
-
"step":
|
60 |
},
|
61 |
{
|
62 |
-
"epoch": 0.
|
63 |
-
"grad_norm": 0.
|
64 |
-
"learning_rate":
|
65 |
-
"loss": 11.
|
66 |
-
"step":
|
67 |
},
|
68 |
{
|
69 |
-
"epoch": 0.
|
70 |
-
"grad_norm": 0.
|
71 |
-
"learning_rate":
|
72 |
-
"loss": 11.
|
73 |
-
"step":
|
74 |
},
|
75 |
{
|
76 |
-
"epoch": 0.
|
77 |
-
"grad_norm": 0.
|
78 |
-
"learning_rate":
|
79 |
-
"loss": 11.
|
80 |
-
"step":
|
81 |
},
|
82 |
{
|
83 |
-
"epoch": 0.
|
84 |
-
"grad_norm": 0.
|
85 |
-
"learning_rate":
|
86 |
-
"loss": 11.
|
87 |
-
"step":
|
88 |
},
|
89 |
{
|
90 |
-
"epoch": 0.
|
91 |
-
"grad_norm": 0.
|
92 |
-
"learning_rate": 9.
|
93 |
-
"loss": 11.
|
94 |
-
"step":
|
95 |
},
|
96 |
{
|
97 |
-
"epoch": 0.
|
98 |
-
"grad_norm": 0.
|
99 |
-
"learning_rate": 9.
|
100 |
-
"loss": 11.
|
101 |
-
"step":
|
102 |
},
|
103 |
{
|
104 |
-
"epoch": 0.
|
105 |
-
"grad_norm": 0.
|
106 |
-
"learning_rate": 9.
|
107 |
-
"loss": 11.
|
108 |
-
"step":
|
109 |
},
|
110 |
{
|
111 |
-
"epoch": 0.
|
112 |
-
"grad_norm": 0.
|
113 |
-
"learning_rate": 9.
|
114 |
-
"loss": 11.
|
115 |
-
"step":
|
116 |
},
|
117 |
{
|
118 |
-
"epoch": 0.
|
119 |
-
"
|
120 |
-
"
|
121 |
-
"
|
122 |
-
"
|
123 |
-
"step": 14
|
124 |
},
|
125 |
{
|
126 |
-
"epoch": 0.
|
127 |
-
"grad_norm": 0.
|
128 |
-
"learning_rate":
|
129 |
-
"loss": 11.
|
130 |
-
"step":
|
131 |
},
|
132 |
{
|
133 |
-
"epoch": 0.
|
134 |
-
"
|
135 |
-
"
|
136 |
-
"
|
137 |
-
"
|
|
|
138 |
},
|
139 |
{
|
140 |
-
"epoch": 0.
|
141 |
-
"grad_norm": 0.
|
142 |
-
"learning_rate":
|
143 |
-
"loss": 11.
|
144 |
-
"step":
|
145 |
},
|
146 |
{
|
147 |
-
"epoch": 0.
|
148 |
-
"grad_norm": 0.
|
149 |
-
"learning_rate":
|
150 |
-
"loss": 11.
|
151 |
-
"step":
|
152 |
},
|
153 |
{
|
154 |
-
"epoch":
|
155 |
-
"grad_norm": 0.
|
156 |
-
"learning_rate":
|
157 |
-
"loss": 11.
|
158 |
-
"step":
|
159 |
},
|
160 |
{
|
161 |
-
"epoch":
|
162 |
-
"grad_norm": 0.
|
163 |
-
"learning_rate":
|
164 |
-
"loss": 11.
|
165 |
-
"step":
|
166 |
},
|
167 |
{
|
168 |
-
"epoch":
|
169 |
-
"grad_norm": 0.
|
170 |
-
"learning_rate":
|
171 |
-
"loss": 11.
|
172 |
-
"step":
|
173 |
},
|
174 |
{
|
175 |
-
"epoch":
|
176 |
-
"grad_norm": 0.
|
177 |
-
"learning_rate":
|
178 |
-
"loss": 11.
|
179 |
-
"step":
|
180 |
},
|
181 |
{
|
182 |
-
"epoch":
|
183 |
-
"grad_norm": 0.
|
184 |
-
"learning_rate":
|
185 |
-
"loss": 11.
|
186 |
-
"step":
|
187 |
},
|
188 |
{
|
189 |
-
"epoch":
|
190 |
-
"grad_norm": 0.
|
191 |
-
"learning_rate":
|
192 |
-
"loss": 11.
|
193 |
-
"step":
|
194 |
},
|
195 |
{
|
196 |
-
"epoch":
|
197 |
-
"grad_norm": 0.
|
198 |
-
"learning_rate":
|
199 |
-
"loss": 11.
|
200 |
-
"step":
|
201 |
},
|
202 |
{
|
203 |
-
"epoch":
|
204 |
-
"grad_norm": 0.
|
205 |
-
"learning_rate":
|
206 |
-
"loss": 11.
|
207 |
-
"step":
|
208 |
},
|
209 |
{
|
210 |
-
"epoch":
|
211 |
-
"grad_norm": 0.
|
212 |
-
"learning_rate":
|
213 |
-
"loss": 11.
|
214 |
-
"step":
|
215 |
},
|
216 |
{
|
217 |
-
"epoch":
|
218 |
-
"grad_norm": 0.
|
219 |
-
"learning_rate":
|
220 |
-
"loss": 11.
|
221 |
-
"step":
|
222 |
},
|
223 |
{
|
224 |
-
"epoch":
|
225 |
-
"
|
226 |
-
"
|
227 |
-
"
|
228 |
-
"
|
229 |
-
"step": 28
|
230 |
},
|
231 |
{
|
232 |
-
"epoch":
|
233 |
-
"grad_norm": 0.
|
234 |
-
"learning_rate":
|
235 |
-
"loss": 11.
|
236 |
-
"step":
|
237 |
},
|
238 |
{
|
239 |
-
"epoch":
|
240 |
-
"grad_norm": 0.
|
241 |
-
"learning_rate":
|
242 |
-
"loss": 11.
|
243 |
-
"step":
|
244 |
},
|
245 |
{
|
246 |
-
"epoch":
|
247 |
-
"grad_norm": 0.
|
248 |
-
"learning_rate":
|
249 |
-
"loss": 11.
|
250 |
-
"step":
|
251 |
},
|
252 |
{
|
253 |
-
"epoch":
|
254 |
-
"grad_norm": 0.
|
255 |
-
"learning_rate":
|
256 |
-
"loss": 11.
|
257 |
-
"step":
|
258 |
},
|
259 |
{
|
260 |
-
"epoch":
|
261 |
-
"
|
262 |
-
"
|
263 |
-
"
|
264 |
-
"
|
|
|
265 |
},
|
266 |
{
|
267 |
-
"epoch":
|
268 |
-
"grad_norm": 0.
|
269 |
-
"learning_rate":
|
270 |
-
"loss": 11.
|
271 |
-
"step":
|
272 |
},
|
273 |
{
|
274 |
-
"epoch":
|
275 |
-
"grad_norm": 0.
|
276 |
-
"learning_rate":
|
277 |
-
"loss": 11.
|
278 |
-
"step":
|
279 |
},
|
280 |
{
|
281 |
-
"epoch":
|
282 |
-
"grad_norm": 0.
|
283 |
-
"learning_rate": 3.
|
284 |
-
"loss": 11.
|
285 |
-
"step":
|
286 |
},
|
287 |
{
|
288 |
-
"epoch":
|
289 |
-
"grad_norm": 0.
|
290 |
-
"learning_rate":
|
291 |
-
"loss": 11.
|
292 |
-
"step":
|
293 |
},
|
294 |
{
|
295 |
-
"epoch":
|
296 |
-
"grad_norm": 0.
|
297 |
-
"learning_rate":
|
298 |
-
"loss": 11.
|
299 |
-
"step":
|
300 |
},
|
301 |
{
|
302 |
-
"epoch":
|
303 |
-
"grad_norm": 0.
|
304 |
-
"learning_rate":
|
305 |
-
"loss": 11.
|
306 |
-
"step":
|
307 |
},
|
308 |
{
|
309 |
-
"epoch":
|
310 |
-
"grad_norm": 0.
|
311 |
-
"learning_rate": 2.
|
312 |
-
"loss": 11.
|
313 |
-
"step":
|
314 |
},
|
315 |
{
|
316 |
-
"epoch":
|
317 |
-
"grad_norm": 0.
|
318 |
-
"learning_rate":
|
319 |
-
"loss": 11.
|
320 |
-
"step":
|
321 |
},
|
322 |
{
|
323 |
-
"epoch":
|
324 |
-
"grad_norm": 0.
|
325 |
-
"learning_rate":
|
326 |
-
"loss": 11.
|
327 |
-
"step":
|
328 |
},
|
329 |
{
|
330 |
-
"epoch":
|
331 |
-
"
|
332 |
-
"
|
333 |
-
"
|
334 |
-
"
|
335 |
-
"step": 42
|
336 |
},
|
337 |
{
|
338 |
-
"epoch":
|
339 |
-
"grad_norm": 0.
|
340 |
-
"learning_rate": 1.
|
341 |
-
"loss": 11.
|
342 |
-
"step":
|
343 |
},
|
344 |
{
|
345 |
-
"epoch":
|
346 |
-
"grad_norm": 0.
|
347 |
-
"learning_rate": 1.
|
348 |
-
"loss": 11.
|
349 |
-
"step":
|
350 |
},
|
351 |
{
|
352 |
-
"epoch":
|
353 |
-
"grad_norm": 0.
|
354 |
-
"learning_rate":
|
355 |
-
"loss": 11.
|
356 |
-
"step":
|
357 |
},
|
358 |
{
|
359 |
-
"epoch":
|
360 |
-
"grad_norm": 0.
|
361 |
-
"learning_rate":
|
362 |
-
"loss": 11.
|
363 |
-
"step":
|
364 |
},
|
365 |
{
|
366 |
-
"epoch":
|
367 |
-
"grad_norm": 0.
|
368 |
-
"learning_rate":
|
369 |
-
"loss": 11.
|
370 |
-
"step":
|
371 |
},
|
372 |
{
|
373 |
-
"epoch":
|
374 |
-
"grad_norm": 0.
|
375 |
-
"learning_rate":
|
376 |
-
"loss": 11.
|
377 |
-
"step":
|
378 |
},
|
379 |
{
|
380 |
-
"epoch":
|
381 |
-
"grad_norm": 0.
|
382 |
-
"learning_rate":
|
383 |
-
"loss": 11.
|
384 |
-
"step":
|
385 |
},
|
386 |
{
|
387 |
-
"epoch":
|
388 |
-
"
|
389 |
-
"
|
390 |
-
"
|
391 |
-
"
|
|
|
392 |
},
|
393 |
{
|
394 |
-
"epoch":
|
395 |
-
"grad_norm": 0.
|
396 |
-
"learning_rate": 2.
|
397 |
-
"loss": 11.
|
398 |
-
"step":
|
399 |
},
|
400 |
{
|
401 |
-
"epoch":
|
402 |
-
"grad_norm": 0.
|
403 |
-
"learning_rate": 1.
|
404 |
-
"loss": 11.
|
405 |
-
"step":
|
406 |
},
|
407 |
{
|
408 |
-
"epoch":
|
409 |
-
"grad_norm": 0.
|
410 |
-
"learning_rate":
|
411 |
-
"loss": 11.
|
412 |
-
"step":
|
413 |
},
|
414 |
{
|
415 |
-
"epoch":
|
416 |
-
"grad_norm": 0.
|
417 |
-
"learning_rate":
|
418 |
-
"loss": 11.
|
419 |
-
"step":
|
420 |
},
|
421 |
{
|
422 |
-
"epoch":
|
423 |
-
"grad_norm": 0.
|
424 |
-
"learning_rate":
|
425 |
-
"loss": 11.
|
426 |
-
"step":
|
427 |
},
|
428 |
{
|
429 |
-
"epoch":
|
430 |
-
"grad_norm": 0.
|
431 |
"learning_rate": 0.0,
|
432 |
-
"loss":
|
433 |
-
"step":
|
434 |
-
},
|
435 |
-
{
|
436 |
-
"epoch": 1.0033519553072625,
|
437 |
-
"eval_loss": 11.928487777709961,
|
438 |
-
"eval_runtime": 0.3738,
|
439 |
-
"eval_samples_per_second": 254.136,
|
440 |
-
"eval_steps_per_second": 128.405,
|
441 |
-
"step": 56
|
442 |
}
|
443 |
],
|
444 |
-
"logging_steps":
|
445 |
-
"max_steps":
|
446 |
"num_input_tokens_seen": 0,
|
447 |
-
"num_train_epochs":
|
448 |
-
"save_steps":
|
449 |
"stateful_callbacks": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
"TrainerControl": {
|
451 |
"args": {
|
452 |
"should_epoch_stop": false,
|
@@ -458,8 +459,8 @@
|
|
458 |
"attributes": {}
|
459 |
}
|
460 |
},
|
461 |
-
"total_flos":
|
462 |
-
"train_batch_size":
|
463 |
"trial_name": null,
|
464 |
"trial_params": null
|
465 |
}
|
|
|
1 |
{
|
2 |
+
"best_metric": 11.9102783203125,
|
3 |
+
"best_model_checkpoint": "miner_id_24/checkpoint-150",
|
4 |
+
"epoch": 3.0,
|
5 |
+
"eval_steps": 50,
|
6 |
+
"global_step": 168,
|
7 |
"is_hyper_param_search": false,
|
8 |
"is_local_process_zero": true,
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
+
"epoch": 0.017857142857142856,
|
13 |
+
"eval_loss": 11.9290132522583,
|
14 |
+
"eval_runtime": 0.5421,
|
15 |
+
"eval_samples_per_second": 175.25,
|
16 |
+
"eval_steps_per_second": 44.274,
|
17 |
"step": 1
|
18 |
},
|
19 |
{
|
20 |
+
"epoch": 0.05357142857142857,
|
21 |
+
"grad_norm": 0.019959961995482445,
|
22 |
+
"learning_rate": 3e-05,
|
23 |
+
"loss": 11.9303,
|
24 |
+
"step": 3
|
|
|
25 |
},
|
26 |
{
|
27 |
+
"epoch": 0.10714285714285714,
|
28 |
+
"grad_norm": 0.022794105112552643,
|
29 |
+
"learning_rate": 6e-05,
|
30 |
+
"loss": 11.9305,
|
31 |
+
"step": 6
|
32 |
},
|
33 |
{
|
34 |
+
"epoch": 0.16071428571428573,
|
35 |
+
"grad_norm": 0.0353395938873291,
|
36 |
+
"learning_rate": 9e-05,
|
37 |
+
"loss": 11.9295,
|
38 |
+
"step": 9
|
39 |
},
|
40 |
{
|
41 |
+
"epoch": 0.21428571428571427,
|
42 |
+
"grad_norm": 0.03792329132556915,
|
43 |
+
"learning_rate": 9.996046986136509e-05,
|
44 |
+
"loss": 11.93,
|
45 |
+
"step": 12
|
46 |
},
|
47 |
{
|
48 |
+
"epoch": 0.26785714285714285,
|
49 |
+
"grad_norm": 0.03241180628538132,
|
50 |
+
"learning_rate": 9.975310752612137e-05,
|
51 |
+
"loss": 11.9276,
|
52 |
+
"step": 15
|
53 |
},
|
54 |
{
|
55 |
+
"epoch": 0.32142857142857145,
|
56 |
+
"grad_norm": 0.05070869252085686,
|
57 |
+
"learning_rate": 9.936876709681668e-05,
|
58 |
+
"loss": 11.9291,
|
59 |
+
"step": 18
|
60 |
},
|
61 |
{
|
62 |
+
"epoch": 0.375,
|
63 |
+
"grad_norm": 0.05852275714278221,
|
64 |
+
"learning_rate": 9.880881572095256e-05,
|
65 |
+
"loss": 11.9291,
|
66 |
+
"step": 21
|
67 |
},
|
68 |
{
|
69 |
+
"epoch": 0.42857142857142855,
|
70 |
+
"grad_norm": 0.08124187588691711,
|
71 |
+
"learning_rate": 9.807524521637102e-05,
|
72 |
+
"loss": 11.9278,
|
73 |
+
"step": 24
|
74 |
},
|
75 |
{
|
76 |
+
"epoch": 0.48214285714285715,
|
77 |
+
"grad_norm": 0.05361940711736679,
|
78 |
+
"learning_rate": 9.717066498610673e-05,
|
79 |
+
"loss": 11.9256,
|
80 |
+
"step": 27
|
81 |
},
|
82 |
{
|
83 |
+
"epoch": 0.5357142857142857,
|
84 |
+
"grad_norm": 0.0750463530421257,
|
85 |
+
"learning_rate": 9.609829273641034e-05,
|
86 |
+
"loss": 11.9261,
|
87 |
+
"step": 30
|
88 |
},
|
89 |
{
|
90 |
+
"epoch": 0.5892857142857143,
|
91 |
+
"grad_norm": 0.08543704450130463,
|
92 |
+
"learning_rate": 9.486194303096062e-05,
|
93 |
+
"loss": 11.9248,
|
94 |
+
"step": 33
|
95 |
},
|
96 |
{
|
97 |
+
"epoch": 0.6428571428571429,
|
98 |
+
"grad_norm": 0.09016852080821991,
|
99 |
+
"learning_rate": 9.346601372197914e-05,
|
100 |
+
"loss": 11.9243,
|
101 |
+
"step": 36
|
102 |
},
|
103 |
{
|
104 |
+
"epoch": 0.6964285714285714,
|
105 |
+
"grad_norm": 0.10736904293298721,
|
106 |
+
"learning_rate": 9.191547030651383e-05,
|
107 |
+
"loss": 11.9206,
|
108 |
+
"step": 39
|
109 |
},
|
110 |
{
|
111 |
+
"epoch": 0.75,
|
112 |
+
"grad_norm": 0.0793285071849823,
|
113 |
+
"learning_rate": 9.021582826353824e-05,
|
114 |
+
"loss": 11.9232,
|
115 |
+
"step": 42
|
116 |
},
|
117 |
{
|
118 |
+
"epoch": 0.8035714285714286,
|
119 |
+
"grad_norm": 0.05057210102677345,
|
120 |
+
"learning_rate": 8.83731334346954e-05,
|
121 |
+
"loss": 11.9215,
|
122 |
+
"step": 45
|
|
|
123 |
},
|
124 |
{
|
125 |
+
"epoch": 0.8571428571428571,
|
126 |
+
"grad_norm": 0.05617088824510574,
|
127 |
+
"learning_rate": 8.639394051847472e-05,
|
128 |
+
"loss": 11.9211,
|
129 |
+
"step": 48
|
130 |
},
|
131 |
{
|
132 |
+
"epoch": 0.8928571428571429,
|
133 |
+
"eval_loss": 11.918818473815918,
|
134 |
+
"eval_runtime": 0.5369,
|
135 |
+
"eval_samples_per_second": 176.942,
|
136 |
+
"eval_steps_per_second": 44.701,
|
137 |
+
"step": 50
|
138 |
},
|
139 |
{
|
140 |
+
"epoch": 0.9107142857142857,
|
141 |
+
"grad_norm": 0.06780627369880676,
|
142 |
+
"learning_rate": 8.428528975432066e-05,
|
143 |
+
"loss": 11.9193,
|
144 |
+
"step": 51
|
145 |
},
|
146 |
{
|
147 |
+
"epoch": 0.9642857142857143,
|
148 |
+
"grad_norm": 0.03634953871369362,
|
149 |
+
"learning_rate": 8.2054681879611e-05,
|
150 |
+
"loss": 11.9178,
|
151 |
+
"step": 54
|
152 |
},
|
153 |
{
|
154 |
+
"epoch": 1.0178571428571428,
|
155 |
+
"grad_norm": 0.04605906456708908,
|
156 |
+
"learning_rate": 7.971005144858553e-05,
|
157 |
+
"loss": 11.9182,
|
158 |
+
"step": 57
|
159 |
},
|
160 |
{
|
161 |
+
"epoch": 1.0714285714285714,
|
162 |
+
"grad_norm": 0.025876272469758987,
|
163 |
+
"learning_rate": 7.725973860813338e-05,
|
164 |
+
"loss": 11.9199,
|
165 |
+
"step": 60
|
166 |
},
|
167 |
{
|
168 |
+
"epoch": 1.125,
|
169 |
+
"grad_norm": 0.04855341464281082,
|
170 |
+
"learning_rate": 7.471245943083615e-05,
|
171 |
+
"loss": 11.9185,
|
172 |
+
"step": 63
|
173 |
},
|
174 |
{
|
175 |
+
"epoch": 1.1785714285714286,
|
176 |
+
"grad_norm": 0.03910359740257263,
|
177 |
+
"learning_rate": 7.20772749107956e-05,
|
178 |
+
"loss": 11.9184,
|
179 |
+
"step": 66
|
180 |
},
|
181 |
{
|
182 |
+
"epoch": 1.2321428571428572,
|
183 |
+
"grad_norm": 0.08992303162813187,
|
184 |
+
"learning_rate": 6.936355873253206e-05,
|
185 |
+
"loss": 11.9154,
|
186 |
+
"step": 69
|
187 |
},
|
188 |
{
|
189 |
+
"epoch": 1.2857142857142856,
|
190 |
+
"grad_norm": 0.04321262612938881,
|
191 |
+
"learning_rate": 6.65809639276034e-05,
|
192 |
+
"loss": 11.919,
|
193 |
+
"step": 72
|
194 |
},
|
195 |
{
|
196 |
+
"epoch": 1.3392857142857144,
|
197 |
+
"grad_norm": 0.049111876636743546,
|
198 |
+
"learning_rate": 6.373938853755126e-05,
|
199 |
+
"loss": 11.9185,
|
200 |
+
"step": 75
|
201 |
},
|
202 |
{
|
203 |
+
"epoch": 1.3928571428571428,
|
204 |
+
"grad_norm": 0.0660889744758606,
|
205 |
+
"learning_rate": 6.08489404053159e-05,
|
206 |
+
"loss": 11.9156,
|
207 |
+
"step": 78
|
208 |
},
|
209 |
{
|
210 |
+
"epoch": 1.4464285714285714,
|
211 |
+
"grad_norm": 0.0920424684882164,
|
212 |
+
"learning_rate": 5.791990122036075e-05,
|
213 |
+
"loss": 11.9151,
|
214 |
+
"step": 81
|
215 |
},
|
216 |
{
|
217 |
+
"epoch": 1.5,
|
218 |
+
"grad_norm": 0.04646582156419754,
|
219 |
+
"learning_rate": 5.496268994540309e-05,
|
220 |
+
"loss": 11.9162,
|
221 |
+
"step": 84
|
222 |
},
|
223 |
{
|
224 |
+
"epoch": 1.5535714285714286,
|
225 |
+
"grad_norm": 0.053827133029699326,
|
226 |
+
"learning_rate": 5.19878257548463e-05,
|
227 |
+
"loss": 11.9172,
|
228 |
+
"step": 87
|
|
|
229 |
},
|
230 |
{
|
231 |
+
"epoch": 1.6071428571428572,
|
232 |
+
"grad_norm": 0.04865885153412819,
|
233 |
+
"learning_rate": 4.900589061674649e-05,
|
234 |
+
"loss": 11.9165,
|
235 |
+
"step": 90
|
236 |
},
|
237 |
{
|
238 |
+
"epoch": 1.6607142857142856,
|
239 |
+
"grad_norm": 0.0875491127371788,
|
240 |
+
"learning_rate": 4.602749165141428e-05,
|
241 |
+
"loss": 11.9132,
|
242 |
+
"step": 93
|
243 |
},
|
244 |
{
|
245 |
+
"epoch": 1.7142857142857144,
|
246 |
+
"grad_norm": 0.04339161515235901,
|
247 |
+
"learning_rate": 4.3063223400546594e-05,
|
248 |
+
"loss": 11.9123,
|
249 |
+
"step": 96
|
250 |
},
|
251 |
{
|
252 |
+
"epoch": 1.7678571428571428,
|
253 |
+
"grad_norm": 0.04742836579680443,
|
254 |
+
"learning_rate": 4.012363014110237e-05,
|
255 |
+
"loss": 11.9154,
|
256 |
+
"step": 99
|
257 |
},
|
258 |
{
|
259 |
+
"epoch": 1.7857142857142856,
|
260 |
+
"eval_loss": 11.913043975830078,
|
261 |
+
"eval_runtime": 0.5398,
|
262 |
+
"eval_samples_per_second": 175.99,
|
263 |
+
"eval_steps_per_second": 44.461,
|
264 |
+
"step": 100
|
265 |
},
|
266 |
{
|
267 |
+
"epoch": 1.8214285714285714,
|
268 |
+
"grad_norm": 0.04398871585726738,
|
269 |
+
"learning_rate": 3.721916837797627e-05,
|
270 |
+
"loss": 11.9137,
|
271 |
+
"step": 102
|
272 |
},
|
273 |
{
|
274 |
+
"epoch": 1.875,
|
275 |
+
"grad_norm": 0.05781185254454613,
|
276 |
+
"learning_rate": 3.436016964888865e-05,
|
277 |
+
"loss": 11.9125,
|
278 |
+
"step": 105
|
279 |
},
|
280 |
{
|
281 |
+
"epoch": 1.9285714285714286,
|
282 |
+
"grad_norm": 0.08808522671461105,
|
283 |
+
"learning_rate": 3.1556803773799614e-05,
|
284 |
+
"loss": 11.9077,
|
285 |
+
"step": 108
|
286 |
},
|
287 |
{
|
288 |
+
"epoch": 1.9821428571428572,
|
289 |
+
"grad_norm": 0.06021308898925781,
|
290 |
+
"learning_rate": 2.8819042679573617e-05,
|
291 |
+
"loss": 11.9141,
|
292 |
+
"step": 111
|
293 |
},
|
294 |
{
|
295 |
+
"epoch": 2.0357142857142856,
|
296 |
+
"grad_norm": 0.048566147685050964,
|
297 |
+
"learning_rate": 2.6156624928574707e-05,
|
298 |
+
"loss": 11.9136,
|
299 |
+
"step": 114
|
300 |
},
|
301 |
{
|
302 |
+
"epoch": 2.0892857142857144,
|
303 |
+
"grad_norm": 0.04223039001226425,
|
304 |
+
"learning_rate": 2.3579021077369046e-05,
|
305 |
+
"loss": 11.9142,
|
306 |
+
"step": 117
|
307 |
},
|
308 |
{
|
309 |
+
"epoch": 2.142857142857143,
|
310 |
+
"grad_norm": 0.04601627215743065,
|
311 |
+
"learning_rate": 2.1095399988757574e-05,
|
312 |
+
"loss": 11.9133,
|
313 |
+
"step": 120
|
314 |
},
|
315 |
{
|
316 |
+
"epoch": 2.1964285714285716,
|
317 |
+
"grad_norm": 0.05840318650007248,
|
318 |
+
"learning_rate": 1.8714596216972007e-05,
|
319 |
+
"loss": 11.9112,
|
320 |
+
"step": 123
|
321 |
},
|
322 |
{
|
323 |
+
"epoch": 2.25,
|
324 |
+
"grad_norm": 0.035777896642684937,
|
325 |
+
"learning_rate": 1.6445078582048155e-05,
|
326 |
+
"loss": 11.9091,
|
327 |
+
"step": 126
|
328 |
},
|
329 |
{
|
330 |
+
"epoch": 2.3035714285714284,
|
331 |
+
"grad_norm": 0.037539299577474594,
|
332 |
+
"learning_rate": 1.4294920045162513e-05,
|
333 |
+
"loss": 11.9146,
|
334 |
+
"step": 129
|
|
|
335 |
},
|
336 |
{
|
337 |
+
"epoch": 2.357142857142857,
|
338 |
+
"grad_norm": 0.042784880846738815,
|
339 |
+
"learning_rate": 1.2271768992088489e-05,
|
340 |
+
"loss": 11.9117,
|
341 |
+
"step": 132
|
342 |
},
|
343 |
{
|
344 |
+
"epoch": 2.4107142857142856,
|
345 |
+
"grad_norm": 0.05495860055088997,
|
346 |
+
"learning_rate": 1.038282202692129e-05,
|
347 |
+
"loss": 11.9109,
|
348 |
+
"step": 135
|
349 |
},
|
350 |
{
|
351 |
+
"epoch": 2.4642857142857144,
|
352 |
+
"grad_norm": 0.07855169475078583,
|
353 |
+
"learning_rate": 8.634798372847148e-06,
|
354 |
+
"loss": 11.9069,
|
355 |
+
"step": 138
|
356 |
},
|
357 |
{
|
358 |
+
"epoch": 2.517857142857143,
|
359 |
+
"grad_norm": 0.03630689159035683,
|
360 |
+
"learning_rate": 7.033915971016952e-06,
|
361 |
+
"loss": 11.9139,
|
362 |
+
"step": 141
|
363 |
},
|
364 |
{
|
365 |
+
"epoch": 2.571428571428571,
|
366 |
+
"grad_norm": 0.048760075122117996,
|
367 |
+
"learning_rate": 5.585869362543416e-06,
|
368 |
+
"loss": 11.9124,
|
369 |
+
"step": 144
|
370 |
},
|
371 |
{
|
372 |
+
"epoch": 2.625,
|
373 |
+
"grad_norm": 0.04377627745270729,
|
374 |
+
"learning_rate": 4.29580943229827e-06,
|
375 |
+
"loss": 11.9118,
|
376 |
+
"step": 147
|
377 |
},
|
378 |
{
|
379 |
+
"epoch": 2.678571428571429,
|
380 |
+
"grad_norm": 0.05847623199224472,
|
381 |
+
"learning_rate": 3.1683250865636114e-06,
|
382 |
+
"loss": 11.9095,
|
383 |
+
"step": 150
|
384 |
},
|
385 |
{
|
386 |
+
"epoch": 2.678571428571429,
|
387 |
+
"eval_loss": 11.9102783203125,
|
388 |
+
"eval_runtime": 0.5392,
|
389 |
+
"eval_samples_per_second": 176.177,
|
390 |
+
"eval_steps_per_second": 44.508,
|
391 |
+
"step": 150
|
392 |
},
|
393 |
{
|
394 |
+
"epoch": 2.732142857142857,
|
395 |
+
"grad_norm": 0.034065768122673035,
|
396 |
+
"learning_rate": 2.2074269297119587e-06,
|
397 |
+
"loss": 11.9088,
|
398 |
+
"step": 153
|
399 |
},
|
400 |
{
|
401 |
+
"epoch": 2.7857142857142856,
|
402 |
+
"grad_norm": 0.04465312138199806,
|
403 |
+
"learning_rate": 1.4165329979794973e-06,
|
404 |
+
"loss": 11.9133,
|
405 |
+
"step": 156
|
406 |
},
|
407 |
{
|
408 |
+
"epoch": 2.8392857142857144,
|
409 |
+
"grad_norm": 0.043062131851911545,
|
410 |
+
"learning_rate": 7.984566010789674e-07,
|
411 |
+
"loss": 11.9122,
|
412 |
+
"step": 159
|
413 |
},
|
414 |
{
|
415 |
+
"epoch": 2.892857142857143,
|
416 |
+
"grad_norm": 0.05254960432648659,
|
417 |
+
"learning_rate": 3.553963149013295e-07,
|
418 |
+
"loss": 11.9094,
|
419 |
+
"step": 162
|
420 |
},
|
421 |
{
|
422 |
+
"epoch": 2.946428571428571,
|
423 |
+
"grad_norm": 0.032099399715662,
|
424 |
+
"learning_rate": 8.892816090335099e-08,
|
425 |
+
"loss": 11.9082,
|
426 |
+
"step": 165
|
427 |
},
|
428 |
{
|
429 |
+
"epoch": 3.0,
|
430 |
+
"grad_norm": 0.09737348556518555,
|
431 |
"learning_rate": 0.0,
|
432 |
+
"loss": 11.908,
|
433 |
+
"step": 168
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
}
|
435 |
],
|
436 |
+
"logging_steps": 3,
|
437 |
+
"max_steps": 168,
|
438 |
"num_input_tokens_seen": 0,
|
439 |
+
"num_train_epochs": 3,
|
440 |
+
"save_steps": 50,
|
441 |
"stateful_callbacks": {
|
442 |
+
"EarlyStoppingCallback": {
|
443 |
+
"args": {
|
444 |
+
"early_stopping_patience": 5,
|
445 |
+
"early_stopping_threshold": 0.0
|
446 |
+
},
|
447 |
+
"attributes": {
|
448 |
+
"early_stopping_patience_counter": 0
|
449 |
+
}
|
450 |
+
},
|
451 |
"TrainerControl": {
|
452 |
"args": {
|
453 |
"should_epoch_stop": false,
|
|
|
459 |
"attributes": {}
|
460 |
}
|
461 |
},
|
462 |
+
"total_flos": 808543272960.0,
|
463 |
+
"train_batch_size": 8,
|
464 |
"trial_name": null,
|
465 |
"trial_params": null
|
466 |
}
|
last-checkpoint/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d0e57d43ac703b2028bdad9e5663a68ee010037b928221f2ef946ca8bb5c20a4
|
3 |
+
size 6840
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d0e57d43ac703b2028bdad9e5663a68ee010037b928221f2ef946ca8bb5c20a4
|
3 |
+
size 6840
|