pefanis27 commited on
Commit
1b5478c
·
verified ·
1 Parent(s): 82fc613

phi-3.5-new

Browse files
adapter_config.json CHANGED
@@ -12,9 +12,9 @@
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
- "lora_alpha": 32,
16
  "lora_bias": false,
17
- "lora_dropout": 0.05,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
@@ -23,10 +23,10 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "qkv_proj",
27
- "gate_up_proj",
28
  "o_proj",
29
- "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
12
  "layers_pattern": null,
13
  "layers_to_transform": null,
14
  "loftq_config": {},
15
+ "lora_alpha": 64,
16
  "lora_bias": false,
17
+ "lora_dropout": 0.1,
18
  "megatron_config": null,
19
  "megatron_core": "megatron.core",
20
  "modules_to_save": null,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
27
  "qkv_proj",
 
28
  "o_proj",
29
+ "gate_up_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20178576bc8573e56eb14493ece1b59f3ae774a58a2e0780f176f7c0d3a70c29
3
  size 100697728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b60cffda93a140952b4e8e313fd8a8519ca094dd39b7008a23a9942e80caa3
3
  size 100697728
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 0.28774118423461914,
4
- "eval_runtime": 16.9372,
5
- "eval_samples_per_second": 2.48,
6
- "eval_steps_per_second": 1.24,
7
- "total_flos": 8.68297895581778e+16,
8
- "train_loss": 0.616850325694451,
9
- "train_runtime": 5601.927,
10
- "train_samples_per_second": 0.743,
11
- "train_steps_per_second": 0.371
12
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_loss": 1.0098934173583984,
4
+ "eval_runtime": 7.0732,
5
+ "eval_samples_per_second": 2.403,
6
+ "eval_steps_per_second": 1.272,
7
+ "total_flos": 1.2128346681348096e+16,
8
+ "train_loss": 0.8996787428044949,
9
+ "train_runtime": 846.9703,
10
+ "train_samples_per_second": 0.98,
11
+ "train_steps_per_second": 0.496
12
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_loss": 0.28774118423461914,
4
- "eval_runtime": 16.9372,
5
- "eval_samples_per_second": 2.48,
6
- "eval_steps_per_second": 1.24
7
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "eval_loss": 1.0098934173583984,
4
+ "eval_runtime": 7.0732,
5
+ "eval_samples_per_second": 2.403,
6
+ "eval_steps_per_second": 1.272
7
  }
runs/Jan12_18-16-10_dmlab/events.out.tfevents.1736698571.dmlab.4621.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d54c2a3766204553f23b25a8cf1a7a3efa310798eff0e961e6d46a1667e7ba00
3
+ size 8519
runs/Jan12_18-39-23_dmlab/events.out.tfevents.1736699964.dmlab.4828.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd10c8db7c1dedcc2d3774aeeb9feec748f2e7d0719a944e669bfb18eb98ea66
3
+ size 8310
runs/Jan12_18-40-28_dmlab/events.out.tfevents.1736700029.dmlab.5135.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba41872b0a2f5a89c3b52534b7ef19d53cfc00f43a4083cab9dc00cbd774f61d
3
+ size 8313
runs/Jan12_18-48-53_dmlab/events.out.tfevents.1736700534.dmlab.6375.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3497c533857fddcc00a15e1c8cde06464b3e72e93550f9cf03f1d0553bdd6111
3
+ size 8312
runs/Jan12_18-49-27_dmlab/events.out.tfevents.1736700567.dmlab.6681.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceb695ef6ca804bf37ca664d0a33b0e9b6ec1ad7bc5372f866735be62248c45a
3
+ size 8312
runs/Jan12_18-49-56_dmlab/events.out.tfevents.1736700596.dmlab.6967.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8bc81128a4ec26f0f4aaeb977cbbd7e8ef5f8b3eb0cff993670330e67af50c2
3
+ size 12024
runs/Jan12_18-49-56_dmlab/events.out.tfevents.1736701450.dmlab.6967.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61be2c6a36537aba58fb650dbf5d51eb1275adbe01d9e83dc4c273c62e5972d9
3
+ size 359
tokenizer_config.json CHANGED
@@ -121,7 +121,7 @@
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "legacy": false,
124
- "model_max_length": 2048,
125
  "pad_token": "<unk>",
126
  "padding_side": "left",
127
  "sp_model_kwargs": {},
 
121
  "clean_up_tokenization_spaces": false,
122
  "eos_token": "<|endoftext|>",
123
  "legacy": false,
124
+ "model_max_length": 4096,
125
  "pad_token": "<unk>",
126
  "padding_side": "left",
127
  "sp_model_kwargs": {},
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 8.68297895581778e+16,
4
- "train_loss": 0.616850325694451,
5
- "train_runtime": 5601.927,
6
- "train_samples_per_second": 0.743,
7
- "train_steps_per_second": 0.371
8
  }
 
1
  {
2
+ "epoch": 7.0,
3
+ "total_flos": 1.2128346681348096e+16,
4
+ "train_loss": 0.8996787428044949,
5
+ "train_runtime": 846.9703,
6
+ "train_samples_per_second": 0.98,
7
+ "train_steps_per_second": 0.496
8
  }
trainer_state.json CHANGED
@@ -1,169 +1,143 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 2080,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "grad_norm": 0.6938191056251526,
14
- "learning_rate": 5e-05,
15
- "loss": 1.1644,
16
- "step": 104
17
- },
18
- {
19
- "epoch": 2.0,
20
- "grad_norm": 0.5342716574668884,
21
  "learning_rate": 0.0001,
22
- "loss": 0.9083,
23
- "step": 208
24
  },
25
  {
26
- "epoch": 3.0,
27
- "grad_norm": 0.47485536336898804,
28
- "learning_rate": 0.00015000000000000001,
29
- "loss": 0.8697,
30
- "step": 312
 
31
  },
32
  {
33
- "epoch": 4.0,
34
- "grad_norm": 0.5520597100257874,
35
  "learning_rate": 0.0002,
36
- "loss": 0.8504,
37
- "step": 416
38
  },
39
  {
40
- "epoch": 5.0,
41
- "grad_norm": 0.4108010530471802,
42
- "learning_rate": 0.00019807852804032305,
43
- "loss": 0.8314,
44
- "step": 520
 
45
  },
46
  {
47
- "epoch": 6.0,
48
- "grad_norm": 0.5089908838272095,
49
  "learning_rate": 0.0001923879532511287,
50
- "loss": 0.809,
51
- "step": 624
52
  },
53
  {
54
- "epoch": 7.0,
55
- "grad_norm": 0.5841424465179443,
56
- "learning_rate": 0.00018314696123025454,
57
- "loss": 0.788,
58
- "step": 728
 
59
  },
60
  {
61
- "epoch": 8.0,
62
- "grad_norm": 0.6074467301368713,
63
  "learning_rate": 0.00017071067811865476,
64
- "loss": 0.7642,
65
- "step": 832
66
  },
67
  {
68
- "epoch": 9.0,
69
- "grad_norm": 0.6272623538970947,
70
- "learning_rate": 0.00015555702330196023,
71
- "loss": 0.7376,
72
- "step": 936
 
73
  },
74
  {
75
- "epoch": 10.0,
76
- "grad_norm": 0.6662757992744446,
77
  "learning_rate": 0.000138268343236509,
78
- "loss": 0.7044,
79
- "step": 1040
80
  },
81
  {
82
- "epoch": 11.0,
83
- "grad_norm": 0.9177679419517517,
84
- "learning_rate": 0.00011950903220161285,
85
- "loss": 0.6638,
86
- "step": 1144
 
87
  },
88
  {
89
- "epoch": 12.0,
90
- "grad_norm": 1.0447787046432495,
91
  "learning_rate": 0.0001,
92
- "loss": 0.6131,
93
- "step": 1248
94
  },
95
  {
96
- "epoch": 13.0,
97
- "grad_norm": 1.1840310096740723,
98
- "learning_rate": 8.049096779838719e-05,
99
- "loss": 0.5513,
100
- "step": 1352
 
101
  },
102
  {
103
- "epoch": 14.0,
104
- "grad_norm": 1.5581327676773071,
105
  "learning_rate": 6.173165676349103e-05,
106
- "loss": 0.4789,
107
- "step": 1456
108
  },
109
  {
110
- "epoch": 15.0,
111
- "grad_norm": 1.5830806493759155,
112
- "learning_rate": 4.444297669803981e-05,
113
- "loss": 0.4021,
114
- "step": 1560
115
- },
116
- {
117
- "epoch": 16.0,
118
- "grad_norm": 1.7810040712356567,
119
- "learning_rate": 2.9289321881345254e-05,
120
- "loss": 0.3294,
121
- "step": 1664
122
- },
123
- {
124
- "epoch": 17.0,
125
- "grad_norm": 1.415872573852539,
126
- "learning_rate": 1.6853038769745467e-05,
127
- "loss": 0.2672,
128
- "step": 1768
129
- },
130
- {
131
- "epoch": 18.0,
132
- "grad_norm": 1.7194544076919556,
133
- "learning_rate": 7.612046748871327e-06,
134
- "loss": 0.2226,
135
- "step": 1872
136
- },
137
- {
138
- "epoch": 19.0,
139
- "grad_norm": 1.4402439594268799,
140
- "learning_rate": 1.921471959676957e-06,
141
- "loss": 0.1961,
142
- "step": 1976
143
- },
144
- {
145
- "epoch": 20.0,
146
- "grad_norm": 1.3084591627120972,
147
- "learning_rate": 0.0,
148
- "loss": 0.1851,
149
- "step": 2080
150
  },
151
  {
152
- "epoch": 20.0,
153
- "step": 2080,
154
- "total_flos": 8.68297895581778e+16,
155
- "train_loss": 0.616850325694451,
156
- "train_runtime": 5601.927,
157
- "train_samples_per_second": 0.743,
158
- "train_steps_per_second": 0.371
159
  }
160
  ],
161
  "logging_steps": 500,
162
- "max_steps": 2080,
163
  "num_input_tokens_seen": 0,
164
- "num_train_epochs": 20,
165
  "save_steps": 500,
166
  "stateful_callbacks": {
 
 
 
 
 
 
 
 
 
167
  "TrainerControl": {
168
  "args": {
169
  "should_epoch_stop": false,
@@ -175,7 +149,7 @@
175
  "attributes": {}
176
  }
177
  },
178
- "total_flos": 8.68297895581778e+16,
179
  "train_batch_size": 2,
180
  "trial_name": null,
181
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.0029878616333008,
3
+ "best_model_checkpoint": "/home/labuser/Documents/phi-3/phi-3.5-new/checkpoint-168",
4
+ "epoch": 7.0,
5
  "eval_steps": 500,
6
+ "global_step": 294,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "grad_norm": 1.0487970113754272,
 
 
 
 
 
 
 
14
  "learning_rate": 0.0001,
15
+ "loss": 1.1905,
16
+ "step": 42
17
  },
18
  {
19
+ "epoch": 1.0,
20
+ "eval_loss": 1.0386497974395752,
21
+ "eval_runtime": 7.0656,
22
+ "eval_samples_per_second": 2.406,
23
+ "eval_steps_per_second": 1.274,
24
+ "step": 42
25
  },
26
  {
27
+ "epoch": 2.0,
28
+ "grad_norm": 1.0445537567138672,
29
  "learning_rate": 0.0002,
30
+ "loss": 0.9483,
31
+ "step": 84
32
  },
33
  {
34
+ "epoch": 2.0,
35
+ "eval_loss": 1.0180531740188599,
36
+ "eval_runtime": 7.0935,
37
+ "eval_samples_per_second": 2.397,
38
+ "eval_steps_per_second": 1.269,
39
+ "step": 84
40
  },
41
  {
42
+ "epoch": 3.0,
43
+ "grad_norm": 0.8360362648963928,
44
  "learning_rate": 0.0001923879532511287,
45
+ "loss": 0.9012,
46
+ "step": 126
47
  },
48
  {
49
+ "epoch": 3.0,
50
+ "eval_loss": 1.0081464052200317,
51
+ "eval_runtime": 7.1072,
52
+ "eval_samples_per_second": 2.392,
53
+ "eval_steps_per_second": 1.266,
54
+ "step": 126
55
  },
56
  {
57
+ "epoch": 4.0,
58
+ "grad_norm": 0.9580802917480469,
59
  "learning_rate": 0.00017071067811865476,
60
+ "loss": 0.8626,
61
+ "step": 168
62
  },
63
  {
64
+ "epoch": 4.0,
65
+ "eval_loss": 1.0029878616333008,
66
+ "eval_runtime": 7.081,
67
+ "eval_samples_per_second": 2.401,
68
+ "eval_steps_per_second": 1.271,
69
+ "step": 168
70
  },
71
  {
72
+ "epoch": 5.0,
73
+ "grad_norm": 0.8999230861663818,
74
  "learning_rate": 0.000138268343236509,
75
+ "loss": 0.8324,
76
+ "step": 210
77
  },
78
  {
79
+ "epoch": 5.0,
80
+ "eval_loss": 1.0067561864852905,
81
+ "eval_runtime": 7.0886,
82
+ "eval_samples_per_second": 2.398,
83
+ "eval_steps_per_second": 1.27,
84
+ "step": 210
85
  },
86
  {
87
+ "epoch": 6.0,
88
+ "grad_norm": 0.8453378081321716,
89
  "learning_rate": 0.0001,
90
+ "loss": 0.8004,
91
+ "step": 252
92
  },
93
  {
94
+ "epoch": 6.0,
95
+ "eval_loss": 1.0179213285446167,
96
+ "eval_runtime": 7.0766,
97
+ "eval_samples_per_second": 2.402,
98
+ "eval_steps_per_second": 1.272,
99
+ "step": 252
100
  },
101
  {
102
+ "epoch": 7.0,
103
+ "grad_norm": 1.0025222301483154,
104
  "learning_rate": 6.173165676349103e-05,
105
+ "loss": 0.7622,
106
+ "step": 294
107
  },
108
  {
109
+ "epoch": 7.0,
110
+ "eval_loss": 1.043684959411621,
111
+ "eval_runtime": 7.0812,
112
+ "eval_samples_per_second": 2.401,
113
+ "eval_steps_per_second": 1.271,
114
+ "step": 294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  },
116
  {
117
+ "epoch": 7.0,
118
+ "step": 294,
119
+ "total_flos": 1.2128346681348096e+16,
120
+ "train_loss": 0.8996787428044949,
121
+ "train_runtime": 846.9703,
122
+ "train_samples_per_second": 0.98,
123
+ "train_steps_per_second": 0.496
124
  }
125
  ],
126
  "logging_steps": 500,
127
+ "max_steps": 420,
128
  "num_input_tokens_seen": 0,
129
+ "num_train_epochs": 10,
130
  "save_steps": 500,
131
  "stateful_callbacks": {
132
+ "EarlyStoppingCallback": {
133
+ "args": {
134
+ "early_stopping_patience": 3,
135
+ "early_stopping_threshold": 0.0
136
+ },
137
+ "attributes": {
138
+ "early_stopping_patience_counter": 3
139
+ }
140
+ },
141
  "TrainerControl": {
142
  "args": {
143
  "should_epoch_stop": false,
 
149
  "attributes": {}
150
  }
151
  },
152
+ "total_flos": 1.2128346681348096e+16,
153
  "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:633d6603fb5d519d493770c00d2039db41cce516bb4399ad40e6d01187a3b828
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a0dba0afd6c07504c9951fbbc2017ec670cad8a3b2ca6951fe367b50a131172
3
  size 5624