Lansechen committed on
Commit
306ba75
·
verified ·
1 Parent(s): df6291e

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +66 -169
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenran1995-the-chinese-university-of-hong-kong/huggingface/runs/x17gr4yx)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/chenran1995-the-chinese-university-of-hong-kong/huggingface/runs/hhvcopv9)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 179979530797056.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 1.8275,
5
  "train_samples": 6554,
6
- "train_samples_per_second": 4051.49,
7
- "train_steps_per_second": 31.191
8
  }
 
1
  {
2
+ "total_flos": 84573274767360.0,
3
+ "train_loss": 0.7089759466940897,
4
+ "train_runtime": 1518.5199,
5
  "train_samples": 6554,
6
+ "train_samples_per_second": 4.876,
7
+ "train_steps_per_second": 0.038
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 179979530797056.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 1.8275,
5
  "train_samples": 6554,
6
- "train_samples_per_second": 4051.49,
7
- "train_steps_per_second": 31.191
8
  }
 
1
  {
2
+ "total_flos": 84573274767360.0,
3
+ "train_loss": 0.7089759466940897,
4
+ "train_runtime": 1518.5199,
5
  "train_samples": 6554,
6
+ "train_samples_per_second": 4.876,
7
+ "train_steps_per_second": 0.038
8
  }
trainer_state.json CHANGED
@@ -1,213 +1,110 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.9440993788819876,
5
  "eval_steps": 500,
6
- "global_step": 120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.12422360248447205,
13
- "grad_norm": 2.720003128051758,
14
- "learning_rate": 4.166666666666667e-05,
15
- "loss": 1.0081,
16
- "mean_token_accuracy": 0.7258908212184906,
17
  "step": 5
18
  },
19
  {
20
- "epoch": 0.2484472049689441,
21
- "grad_norm": 1.022207260131836,
22
- "learning_rate": 4.986344023965386e-05,
23
- "loss": 0.861,
24
- "mean_token_accuracy": 0.7483338862657547,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.37267080745341613,
29
- "grad_norm": 0.5985455513000488,
30
- "learning_rate": 4.931150598363494e-05,
31
- "loss": 0.7839,
32
- "mean_token_accuracy": 0.7629545524716377,
33
  "step": 15
34
  },
35
  {
36
- "epoch": 0.4968944099378882,
37
- "grad_norm": 0.4236472249031067,
38
- "learning_rate": 4.834611651233304e-05,
39
- "loss": 0.7354,
40
- "mean_token_accuracy": 0.7745954841375351,
41
  "step": 20
42
  },
43
  {
44
- "epoch": 0.6211180124223602,
45
- "grad_norm": 0.35875728726387024,
46
- "learning_rate": 4.6985571585149876e-05,
47
- "loss": 0.7146,
48
- "mean_token_accuracy": 0.77941093146801,
49
  "step": 25
50
  },
51
  {
52
- "epoch": 0.7453416149068323,
53
- "grad_norm": 0.349810928106308,
54
- "learning_rate": 4.5255661461418854e-05,
55
- "loss": 0.6951,
56
- "mean_token_accuracy": 0.7839933410286903,
57
  "step": 30
58
  },
59
  {
60
- "epoch": 0.8695652173913043,
61
- "grad_norm": 0.2977786362171173,
62
- "learning_rate": 4.3189178024614896e-05,
63
- "loss": 0.6871,
64
- "mean_token_accuracy": 0.7858522430062294,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.9937888198757764,
69
- "grad_norm": 0.33435600996017456,
70
- "learning_rate": 4.0825293184962056e-05,
71
- "loss": 0.6795,
72
- "mean_token_accuracy": 0.7874569892883301,
73
  "step": 40
74
  },
75
  {
76
- "epoch": 1.0993788819875776,
77
- "grad_norm": 0.5033183097839355,
78
- "learning_rate": 3.8208816343334156e-05,
79
- "loss": 0.6564,
80
- "mean_token_accuracy": 0.7936392934883342,
81
  "step": 45
82
  },
83
  {
84
- "epoch": 1.2236024844720497,
85
- "grad_norm": 0.35527950525283813,
86
- "learning_rate": 3.5389344991836974e-05,
87
- "loss": 0.6223,
88
- "mean_token_accuracy": 0.801690150797367,
89
  "step": 50
90
  },
91
  {
92
- "epoch": 1.3478260869565217,
93
- "grad_norm": 0.3436889946460724,
94
- "learning_rate": 3.242032455214346e-05,
95
- "loss": 0.6219,
96
- "mean_token_accuracy": 0.8015273302793503,
97
  "step": 55
98
  },
99
  {
100
- "epoch": 1.4720496894409938,
101
- "grad_norm": 0.31214451789855957,
102
- "learning_rate": 2.9358035273127483e-05,
103
- "loss": 0.6241,
104
- "mean_token_accuracy": 0.8007384449243545,
105
- "step": 60
106
- },
107
- {
108
- "epoch": 1.5962732919254659,
109
- "grad_norm": 0.2834468185901642,
110
- "learning_rate": 2.6260525391993023e-05,
111
- "loss": 0.6212,
112
- "mean_token_accuracy": 0.8008193418383598,
113
- "step": 65
114
- },
115
- {
116
- "epoch": 1.720496894409938,
117
- "grad_norm": 0.2718545198440552,
118
- "learning_rate": 2.3186510781715892e-05,
119
- "loss": 0.6224,
120
- "mean_token_accuracy": 0.8004700869321824,
121
- "step": 70
122
- },
123
- {
124
- "epoch": 1.84472049689441,
125
- "grad_norm": 0.2998403012752533,
126
- "learning_rate": 2.0194261942894628e-05,
127
- "loss": 0.6077,
128
- "mean_token_accuracy": 0.8049791321158409,
129
- "step": 75
130
- },
131
- {
132
- "epoch": 1.968944099378882,
133
- "grad_norm": 0.26175156235694885,
134
- "learning_rate": 1.7340499438004994e-05,
135
- "loss": 0.6065,
136
- "mean_token_accuracy": 0.8056960567831993,
137
- "step": 80
138
- },
139
- {
140
- "epoch": 2.0745341614906834,
141
- "grad_norm": 0.30868765711784363,
142
- "learning_rate": 1.4679318706019013e-05,
143
- "loss": 0.5928,
144
- "mean_token_accuracy": 0.8117802195689258,
145
- "step": 85
146
- },
147
- {
148
- "epoch": 2.198757763975155,
149
- "grad_norm": 0.33503425121307373,
150
- "learning_rate": 1.2261164638420832e-05,
151
- "loss": 0.5657,
152
- "mean_token_accuracy": 0.816280497610569,
153
- "step": 90
154
- },
155
- {
156
- "epoch": 2.3229813664596275,
157
- "grad_norm": 0.27925997972488403,
158
- "learning_rate": 1.013187535438278e-05,
159
- "loss": 0.5749,
160
- "mean_token_accuracy": 0.813144038617611,
161
- "step": 95
162
- },
163
- {
164
- "epoch": 2.4472049689440993,
165
- "grad_norm": 0.24198994040489197,
166
- "learning_rate": 8.331813301137644e-06,
167
- "loss": 0.5744,
168
- "mean_token_accuracy": 0.8130593597888947,
169
- "step": 100
170
- },
171
- {
172
- "epoch": 2.571428571428571,
173
- "grad_norm": 0.23274952173233032,
174
- "learning_rate": 6.8951001502612065e-06,
175
- "loss": 0.5723,
176
- "mean_token_accuracy": 0.8137446627020836,
177
- "step": 105
178
- },
179
- {
180
- "epoch": 2.6956521739130435,
181
- "grad_norm": 0.2240184247493744,
182
- "learning_rate": 5.8489699930418664e-06,
183
- "loss": 0.5645,
184
- "mean_token_accuracy": 0.8161669239401818,
185
- "step": 110
186
- },
187
- {
188
- "epoch": 2.8198757763975157,
189
- "grad_norm": 0.22074371576309204,
190
- "learning_rate": 5.213253095656177e-06,
191
- "loss": 0.5812,
192
- "mean_token_accuracy": 0.8107848510146141,
193
- "step": 115
194
- },
195
- {
196
- "epoch": 2.9440993788819876,
197
- "grad_norm": 0.22450964152812958,
198
- "learning_rate": 5e-06,
199
- "loss": 0.5667,
200
- "mean_token_accuracy": 0.8154601871967315,
201
- "step": 120
202
- },
203
- {
204
- "epoch": 2.9440993788819876,
205
- "step": 120,
206
- "total_flos": 179979530797056.0,
207
- "train_loss": 0.0,
208
- "train_runtime": 1.8275,
209
- "train_samples_per_second": 4051.49,
210
- "train_steps_per_second": 31.191
211
  }
212
  ],
213
  "logging_steps": 5,
@@ -227,7 +124,7 @@
227
  "attributes": {}
228
  }
229
  },
230
- "total_flos": 179979530797056.0,
231
  "train_batch_size": 4,
232
  "trial_name": null,
233
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.8774193548387097,
5
  "eval_steps": 500,
6
+ "global_step": 57,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.25806451612903225,
13
+ "grad_norm": 1.7787522077560425,
14
+ "learning_rate": 4.984786304919372e-05,
15
+ "loss": 0.9092,
16
+ "mean_token_accuracy": 0.7503577440977096,
17
  "step": 5
18
  },
19
  {
20
+ "epoch": 0.5161290322580645,
21
+ "grad_norm": 0.9605578184127808,
22
+ "learning_rate": 4.815986240480617e-05,
23
+ "loss": 0.7348,
24
+ "mean_token_accuracy": 0.7808357656002045,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 0.7741935483870968,
29
+ "grad_norm": 0.601085364818573,
30
+ "learning_rate": 4.473599997017701e-05,
31
+ "loss": 0.6564,
32
+ "mean_token_accuracy": 0.7980892196297645,
33
  "step": 15
34
  },
35
  {
36
+ "epoch": 1.0,
37
+ "grad_norm": 512.1144409179688,
38
+ "learning_rate": 3.9863952006593134e-05,
39
+ "loss": 1.6224,
40
+ "mean_token_accuracy": 0.6946261065346854,
41
  "step": 20
42
  },
43
  {
44
+ "epoch": 1.2580645161290323,
45
+ "grad_norm": 0.6288492679595947,
46
+ "learning_rate": 3.3953072735999534e-05,
47
+ "loss": 0.6195,
48
+ "mean_token_accuracy": 0.8052686437964439,
49
  "step": 25
50
  },
51
  {
52
+ "epoch": 1.5161290322580645,
53
+ "grad_norm": 0.4004594385623932,
54
+ "learning_rate": 2.7500000000000004e-05,
55
+ "loss": 0.5815,
56
+ "mean_token_accuracy": 0.8156079143285752,
57
  "step": 30
58
  },
59
  {
60
+ "epoch": 1.7741935483870968,
61
+ "grad_norm": 0.3189852833747864,
62
+ "learning_rate": 2.1046927264000475e-05,
63
+ "loss": 0.5735,
64
+ "mean_token_accuracy": 0.8173265308141708,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 2.0,
69
+ "grad_norm": 0.3097386360168457,
70
+ "learning_rate": 1.5136047993406865e-05,
71
+ "loss": 0.5678,
72
+ "mean_token_accuracy": 0.819041873727526,
73
  "step": 40
74
  },
75
  {
76
+ "epoch": 2.258064516129032,
77
+ "grad_norm": 0.26866406202316284,
78
+ "learning_rate": 1.0264000029822999e-05,
79
+ "loss": 0.5328,
80
+ "mean_token_accuracy": 0.8284321025013923,
81
  "step": 45
82
  },
83
  {
84
+ "epoch": 2.5161290322580645,
85
+ "grad_norm": 0.24466799199581146,
86
+ "learning_rate": 6.840137595193838e-06,
87
+ "loss": 0.5318,
88
+ "mean_token_accuracy": 0.8282432556152344,
89
  "step": 50
90
  },
91
  {
92
+ "epoch": 2.774193548387097,
93
+ "grad_norm": 0.22442595660686493,
94
+ "learning_rate": 5.152136950806283e-06,
95
+ "loss": 0.5416,
96
+ "mean_token_accuracy": 0.8250808849930763,
97
  "step": 55
98
  },
99
  {
100
+ "epoch": 2.8774193548387097,
101
+ "mean_token_accuracy": 0.8287803158164024,
102
+ "step": 57,
103
+ "total_flos": 84573274767360.0,
104
+ "train_loss": 0.7089759466940897,
105
+ "train_runtime": 1518.5199,
106
+ "train_samples_per_second": 4.876,
107
+ "train_steps_per_second": 0.038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  }
109
  ],
110
  "logging_steps": 5,
 
124
  "attributes": {}
125
  }
126
  },
127
+ "total_flos": 84573274767360.0,
128
  "train_batch_size": 4,
129
  "trial_name": null,
130
  "trial_params": null