omarViga commited on
Commit
79c0a9c
·
verified ·
1 Parent(s): dda1623

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +8 -0
  2. added_tokens.json +4 -0
  3. checkpoint-1000/added_tokens.json +4 -0
  4. checkpoint-1000/config.json +91 -0
  5. checkpoint-1000/generation_config.json +9 -0
  6. checkpoint-1000/model.safetensors +3 -0
  7. checkpoint-1000/optimizer.pt +3 -0
  8. checkpoint-1000/rng_state.pth +3 -0
  9. checkpoint-1000/scaler.pt +3 -0
  10. checkpoint-1000/scheduler.pt +3 -0
  11. checkpoint-1000/special_tokens_map.json +13 -0
  12. checkpoint-1000/spm_char.model +3 -0
  13. checkpoint-1000/tokenizer_config.json +64 -0
  14. checkpoint-1000/trainer_state.json +322 -0
  15. checkpoint-1000/training_args.bin +3 -0
  16. checkpoint-2000/added_tokens.json +4 -0
  17. checkpoint-2000/config.json +91 -0
  18. checkpoint-2000/generation_config.json +9 -0
  19. checkpoint-2000/model.safetensors +3 -0
  20. checkpoint-2000/optimizer.pt +3 -0
  21. checkpoint-2000/rng_state.pth +3 -0
  22. checkpoint-2000/scaler.pt +3 -0
  23. checkpoint-2000/scheduler.pt +3 -0
  24. checkpoint-2000/special_tokens_map.json +13 -0
  25. checkpoint-2000/spm_char.model +3 -0
  26. checkpoint-2000/tokenizer_config.json +64 -0
  27. checkpoint-2000/trainer_state.json +610 -0
  28. checkpoint-2000/training_args.bin +3 -0
  29. checkpoint-3000/added_tokens.json +4 -0
  30. checkpoint-3000/config.json +91 -0
  31. checkpoint-3000/generation_config.json +9 -0
  32. checkpoint-3000/model.safetensors +3 -0
  33. checkpoint-3000/optimizer.pt +3 -0
  34. checkpoint-3000/rng_state.pth +3 -0
  35. checkpoint-3000/scaler.pt +3 -0
  36. checkpoint-3000/scheduler.pt +3 -0
  37. checkpoint-3000/special_tokens_map.json +13 -0
  38. checkpoint-3000/spm_char.model +3 -0
  39. checkpoint-3000/tokenizer_config.json +64 -0
  40. checkpoint-3000/trainer_state.json +898 -0
  41. checkpoint-3000/training_args.bin +3 -0
  42. checkpoint-4000/added_tokens.json +4 -0
  43. checkpoint-4000/config.json +91 -0
  44. checkpoint-4000/generation_config.json +9 -0
  45. checkpoint-4000/model.safetensors +3 -0
  46. checkpoint-4000/optimizer.pt +3 -0
  47. checkpoint-4000/rng_state.pth +3 -0
  48. checkpoint-4000/scaler.pt +3 -0
  49. checkpoint-4000/scheduler.pt +3 -0
  50. checkpoint-4000/special_tokens_map.json +13 -0
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - text-to-speech
4
+ - speecht5
5
+ - mabama # Make sure no empty tags exist
6
+ library_name: transformers
7
+ license: mit
8
+ ---
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-1000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.52.0.dev0",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.52.0.dev0"
9
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87514d400a13c6eefdeb2f89abd1795e621c2344f96e159ce2aeba3d0ce85944
3
+ size 577789320
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2699a889db8dbb0ae670281bc558951bffc765ae88e6bb0bb5222ac12288814b
3
+ size 1155772233
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82d177efef76b1d9db7c817f74c58d37c483a0042c96999443934a8052be41aa
3
+ size 14244
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27eb6d31126283f601b217f22a8971040a00a73abf0a2e26bfcb5064cd0afa48
3
+ size 988
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5101d8c1f86d6f48167e50b1164b9ba363ab76694ff2d5c1e326e3d5f94ecaef
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-1000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.5154594779014587,
4
+ "best_model_checkpoint": "./speecht5_tts_mabama/checkpoint-1000",
5
+ "epoch": 125.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 3.1333333333333333,
14
+ "grad_norm": 13.092179298400879,
15
+ "learning_rate": 4.2000000000000006e-07,
16
+ "loss": 1.0978,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 6.266666666666667,
21
+ "grad_norm": 13.538804054260254,
22
+ "learning_rate": 9.200000000000001e-07,
23
+ "loss": 1.0057,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 9.4,
28
+ "grad_norm": 3.8923733234405518,
29
+ "learning_rate": 1.42e-06,
30
+ "loss": 0.8155,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 12.533333333333333,
35
+ "grad_norm": 2.569746732711792,
36
+ "learning_rate": 1.9200000000000003e-06,
37
+ "loss": 0.7921,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 15.666666666666666,
42
+ "grad_norm": 2.390493631362915,
43
+ "learning_rate": 2.42e-06,
44
+ "loss": 0.7531,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 18.8,
49
+ "grad_norm": 2.7168779373168945,
50
+ "learning_rate": 2.92e-06,
51
+ "loss": 0.7393,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 21.933333333333334,
56
+ "grad_norm": 10.27633285522461,
57
+ "learning_rate": 3.4200000000000007e-06,
58
+ "loss": 0.7292,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 25.0,
63
+ "grad_norm": 5.6921000480651855,
64
+ "learning_rate": 3.920000000000001e-06,
65
+ "loss": 0.6642,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 28.133333333333333,
70
+ "grad_norm": 2.6206777095794678,
71
+ "learning_rate": 4.42e-06,
72
+ "loss": 0.6555,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 31.266666666666666,
77
+ "grad_norm": 1.9396028518676758,
78
+ "learning_rate": 4.92e-06,
79
+ "loss": 0.6484,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 34.4,
84
+ "grad_norm": 3.44437575340271,
85
+ "learning_rate": 5.420000000000001e-06,
86
+ "loss": 0.6414,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 37.53333333333333,
91
+ "grad_norm": 2.729497194290161,
92
+ "learning_rate": 5.92e-06,
93
+ "loss": 0.6323,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 40.666666666666664,
98
+ "grad_norm": 2.3852877616882324,
99
+ "learning_rate": 6.42e-06,
100
+ "loss": 0.6073,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 43.8,
105
+ "grad_norm": 4.4287109375,
106
+ "learning_rate": 6.92e-06,
107
+ "loss": 0.6034,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 46.93333333333333,
112
+ "grad_norm": 2.1653966903686523,
113
+ "learning_rate": 7.420000000000001e-06,
114
+ "loss": 0.5865,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 50.0,
119
+ "grad_norm": 2.8120265007019043,
120
+ "learning_rate": 7.92e-06,
121
+ "loss": 0.5556,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 53.13333333333333,
126
+ "grad_norm": 2.0973806381225586,
127
+ "learning_rate": 8.42e-06,
128
+ "loss": 0.5416,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 56.266666666666666,
133
+ "grad_norm": 2.6723616123199463,
134
+ "learning_rate": 8.920000000000001e-06,
135
+ "loss": 0.5407,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 59.4,
140
+ "grad_norm": 2.1810383796691895,
141
+ "learning_rate": 9.42e-06,
142
+ "loss": 0.5174,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 62.53333333333333,
147
+ "grad_norm": 3.464071750640869,
148
+ "learning_rate": 9.920000000000002e-06,
149
+ "loss": 0.5327,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 65.66666666666667,
154
+ "grad_norm": 3.6148977279663086,
155
+ "learning_rate": 9.940000000000001e-06,
156
+ "loss": 0.5141,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 68.8,
161
+ "grad_norm": 2.5631027221679688,
162
+ "learning_rate": 9.86857142857143e-06,
163
+ "loss": 0.5246,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 71.93333333333334,
168
+ "grad_norm": 2.058468818664551,
169
+ "learning_rate": 9.797142857142858e-06,
170
+ "loss": 0.5065,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 75.0,
175
+ "grad_norm": 1.7559466361999512,
176
+ "learning_rate": 9.725714285714287e-06,
177
+ "loss": 0.4871,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 78.13333333333334,
182
+ "grad_norm": 2.653345823287964,
183
+ "learning_rate": 9.654285714285716e-06,
184
+ "loss": 0.4941,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 81.26666666666667,
189
+ "grad_norm": 2.612226724624634,
190
+ "learning_rate": 9.582857142857143e-06,
191
+ "loss": 0.4796,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 84.4,
196
+ "grad_norm": 1.7446099519729614,
197
+ "learning_rate": 9.511428571428572e-06,
198
+ "loss": 0.487,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 87.53333333333333,
203
+ "grad_norm": 2.627315044403076,
204
+ "learning_rate": 9.440000000000001e-06,
205
+ "loss": 0.4731,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 90.66666666666667,
210
+ "grad_norm": 2.4315383434295654,
211
+ "learning_rate": 9.368571428571428e-06,
212
+ "loss": 0.4812,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 93.8,
217
+ "grad_norm": 2.4056336879730225,
218
+ "learning_rate": 9.297142857142857e-06,
219
+ "loss": 0.468,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 96.93333333333334,
224
+ "grad_norm": 2.153116464614868,
225
+ "learning_rate": 9.225714285714286e-06,
226
+ "loss": 0.4829,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 100.0,
231
+ "grad_norm": 2.9421756267547607,
232
+ "learning_rate": 9.154285714285715e-06,
233
+ "loss": 0.4555,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 103.13333333333334,
238
+ "grad_norm": 1.6771883964538574,
239
+ "learning_rate": 9.082857142857143e-06,
240
+ "loss": 0.462,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 106.26666666666667,
245
+ "grad_norm": 2.9711899757385254,
246
+ "learning_rate": 9.011428571428572e-06,
247
+ "loss": 0.471,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 109.4,
252
+ "grad_norm": 1.922980546951294,
253
+ "learning_rate": 8.94e-06,
254
+ "loss": 0.4673,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 112.53333333333333,
259
+ "grad_norm": 2.49945068359375,
260
+ "learning_rate": 8.86857142857143e-06,
261
+ "loss": 0.4611,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 115.66666666666667,
266
+ "grad_norm": 2.646510362625122,
267
+ "learning_rate": 8.797142857142857e-06,
268
+ "loss": 0.4574,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 118.8,
273
+ "grad_norm": 1.7943354845046997,
274
+ "learning_rate": 8.725714285714286e-06,
275
+ "loss": 0.4658,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 121.93333333333334,
280
+ "grad_norm": 2.171827793121338,
281
+ "learning_rate": 8.654285714285715e-06,
282
+ "loss": 0.4561,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 125.0,
287
+ "grad_norm": 7.516489505767822,
288
+ "learning_rate": 8.582857142857144e-06,
289
+ "loss": 0.4472,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 125.0,
294
+ "eval_loss": 0.5154594779014587,
295
+ "eval_runtime": 0.7837,
296
+ "eval_samples_per_second": 33.175,
297
+ "eval_steps_per_second": 5.104,
298
+ "step": 1000
299
+ }
300
+ ],
301
+ "logging_steps": 25,
302
+ "max_steps": 4000,
303
+ "num_input_tokens_seen": 0,
304
+ "num_train_epochs": 572,
305
+ "save_steps": 1000,
306
+ "stateful_callbacks": {
307
+ "TrainerControl": {
308
+ "args": {
309
+ "should_epoch_stop": false,
310
+ "should_evaluate": false,
311
+ "should_log": false,
312
+ "should_save": true,
313
+ "should_training_stop": false
314
+ },
315
+ "attributes": {}
316
+ }
317
+ },
318
+ "total_flos": 2568713479659360.0,
319
+ "train_batch_size": 16,
320
+ "trial_name": null,
321
+ "trial_params": null
322
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e653cd3b83482a0939ee4c8a207df9a996f44ce9dd82197c4ab6cde60cf2bb
3
+ size 5432
checkpoint-2000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-2000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.52.0.dev0",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.52.0.dev0"
9
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abac69d52746eff2a8ac4fea48c076a031effd3d774eaa79c34c25289b78a9ad
3
+ size 577789320
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df818e2b4cf58851158e10b3d57754c198be51c0b852b9cd4b587b629a205640
3
+ size 1155772233
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b766bf7909addbb49e8f135f2f8aa3b6e99cb053e36395d8560f93e71c2776e7
3
+ size 14244
checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49bc19d5712fad43d5cef95c2e01c73bd75bdb71e4c16fa8781d626d978f5452
3
+ size 988
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b80a609c64a12b4db2f38941ea479b9a30f9351b7aac74f4956e8686dc338317
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-2000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 0.4833647906780243,
4
+ "best_model_checkpoint": "./speecht5_tts_mabama/checkpoint-2000",
5
+ "epoch": 250.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 3.1333333333333333,
14
+ "grad_norm": 13.092179298400879,
15
+ "learning_rate": 4.2000000000000006e-07,
16
+ "loss": 1.0978,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 6.266666666666667,
21
+ "grad_norm": 13.538804054260254,
22
+ "learning_rate": 9.200000000000001e-07,
23
+ "loss": 1.0057,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 9.4,
28
+ "grad_norm": 3.8923733234405518,
29
+ "learning_rate": 1.42e-06,
30
+ "loss": 0.8155,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 12.533333333333333,
35
+ "grad_norm": 2.569746732711792,
36
+ "learning_rate": 1.9200000000000003e-06,
37
+ "loss": 0.7921,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 15.666666666666666,
42
+ "grad_norm": 2.390493631362915,
43
+ "learning_rate": 2.42e-06,
44
+ "loss": 0.7531,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 18.8,
49
+ "grad_norm": 2.7168779373168945,
50
+ "learning_rate": 2.92e-06,
51
+ "loss": 0.7393,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 21.933333333333334,
56
+ "grad_norm": 10.27633285522461,
57
+ "learning_rate": 3.4200000000000007e-06,
58
+ "loss": 0.7292,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 25.0,
63
+ "grad_norm": 5.6921000480651855,
64
+ "learning_rate": 3.920000000000001e-06,
65
+ "loss": 0.6642,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 28.133333333333333,
70
+ "grad_norm": 2.6206777095794678,
71
+ "learning_rate": 4.42e-06,
72
+ "loss": 0.6555,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 31.266666666666666,
77
+ "grad_norm": 1.9396028518676758,
78
+ "learning_rate": 4.92e-06,
79
+ "loss": 0.6484,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 34.4,
84
+ "grad_norm": 3.44437575340271,
85
+ "learning_rate": 5.420000000000001e-06,
86
+ "loss": 0.6414,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 37.53333333333333,
91
+ "grad_norm": 2.729497194290161,
92
+ "learning_rate": 5.92e-06,
93
+ "loss": 0.6323,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 40.666666666666664,
98
+ "grad_norm": 2.3852877616882324,
99
+ "learning_rate": 6.42e-06,
100
+ "loss": 0.6073,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 43.8,
105
+ "grad_norm": 4.4287109375,
106
+ "learning_rate": 6.92e-06,
107
+ "loss": 0.6034,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 46.93333333333333,
112
+ "grad_norm": 2.1653966903686523,
113
+ "learning_rate": 7.420000000000001e-06,
114
+ "loss": 0.5865,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 50.0,
119
+ "grad_norm": 2.8120265007019043,
120
+ "learning_rate": 7.92e-06,
121
+ "loss": 0.5556,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 53.13333333333333,
126
+ "grad_norm": 2.0973806381225586,
127
+ "learning_rate": 8.42e-06,
128
+ "loss": 0.5416,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 56.266666666666666,
133
+ "grad_norm": 2.6723616123199463,
134
+ "learning_rate": 8.920000000000001e-06,
135
+ "loss": 0.5407,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 59.4,
140
+ "grad_norm": 2.1810383796691895,
141
+ "learning_rate": 9.42e-06,
142
+ "loss": 0.5174,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 62.53333333333333,
147
+ "grad_norm": 3.464071750640869,
148
+ "learning_rate": 9.920000000000002e-06,
149
+ "loss": 0.5327,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 65.66666666666667,
154
+ "grad_norm": 3.6148977279663086,
155
+ "learning_rate": 9.940000000000001e-06,
156
+ "loss": 0.5141,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 68.8,
161
+ "grad_norm": 2.5631027221679688,
162
+ "learning_rate": 9.86857142857143e-06,
163
+ "loss": 0.5246,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 71.93333333333334,
168
+ "grad_norm": 2.058468818664551,
169
+ "learning_rate": 9.797142857142858e-06,
170
+ "loss": 0.5065,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 75.0,
175
+ "grad_norm": 1.7559466361999512,
176
+ "learning_rate": 9.725714285714287e-06,
177
+ "loss": 0.4871,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 78.13333333333334,
182
+ "grad_norm": 2.653345823287964,
183
+ "learning_rate": 9.654285714285716e-06,
184
+ "loss": 0.4941,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 81.26666666666667,
189
+ "grad_norm": 2.612226724624634,
190
+ "learning_rate": 9.582857142857143e-06,
191
+ "loss": 0.4796,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 84.4,
196
+ "grad_norm": 1.7446099519729614,
197
+ "learning_rate": 9.511428571428572e-06,
198
+ "loss": 0.487,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 87.53333333333333,
203
+ "grad_norm": 2.627315044403076,
204
+ "learning_rate": 9.440000000000001e-06,
205
+ "loss": 0.4731,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 90.66666666666667,
210
+ "grad_norm": 2.4315383434295654,
211
+ "learning_rate": 9.368571428571428e-06,
212
+ "loss": 0.4812,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 93.8,
217
+ "grad_norm": 2.4056336879730225,
218
+ "learning_rate": 9.297142857142857e-06,
219
+ "loss": 0.468,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 96.93333333333334,
224
+ "grad_norm": 2.153116464614868,
225
+ "learning_rate": 9.225714285714286e-06,
226
+ "loss": 0.4829,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 100.0,
231
+ "grad_norm": 2.9421756267547607,
232
+ "learning_rate": 9.154285714285715e-06,
233
+ "loss": 0.4555,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 103.13333333333334,
238
+ "grad_norm": 1.6771883964538574,
239
+ "learning_rate": 9.082857142857143e-06,
240
+ "loss": 0.462,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 106.26666666666667,
245
+ "grad_norm": 2.9711899757385254,
246
+ "learning_rate": 9.011428571428572e-06,
247
+ "loss": 0.471,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 109.4,
252
+ "grad_norm": 1.922980546951294,
253
+ "learning_rate": 8.94e-06,
254
+ "loss": 0.4673,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 112.53333333333333,
259
+ "grad_norm": 2.49945068359375,
260
+ "learning_rate": 8.86857142857143e-06,
261
+ "loss": 0.4611,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 115.66666666666667,
266
+ "grad_norm": 2.646510362625122,
267
+ "learning_rate": 8.797142857142857e-06,
268
+ "loss": 0.4574,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 118.8,
273
+ "grad_norm": 1.7943354845046997,
274
+ "learning_rate": 8.725714285714286e-06,
275
+ "loss": 0.4658,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 121.93333333333334,
280
+ "grad_norm": 2.171827793121338,
281
+ "learning_rate": 8.654285714285715e-06,
282
+ "loss": 0.4561,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 125.0,
287
+ "grad_norm": 7.516489505767822,
288
+ "learning_rate": 8.582857142857144e-06,
289
+ "loss": 0.4472,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 125.0,
294
+ "eval_loss": 0.5154594779014587,
295
+ "eval_runtime": 0.7837,
296
+ "eval_samples_per_second": 33.175,
297
+ "eval_steps_per_second": 5.104,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 128.13333333333333,
302
+ "grad_norm": 2.5167343616485596,
303
+ "learning_rate": 8.511428571428571e-06,
304
+ "loss": 0.457,
305
+ "step": 1025
306
+ },
307
+ {
308
+ "epoch": 131.26666666666668,
309
+ "grad_norm": 3.3089983463287354,
310
+ "learning_rate": 8.44e-06,
311
+ "loss": 0.4456,
312
+ "step": 1050
313
+ },
314
+ {
315
+ "epoch": 134.4,
316
+ "grad_norm": 2.778348445892334,
317
+ "learning_rate": 8.36857142857143e-06,
318
+ "loss": 0.4612,
319
+ "step": 1075
320
+ },
321
+ {
322
+ "epoch": 137.53333333333333,
323
+ "grad_norm": 2.529778480529785,
324
+ "learning_rate": 8.297142857142859e-06,
325
+ "loss": 0.4429,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 140.66666666666666,
330
+ "grad_norm": 1.76685631275177,
331
+ "learning_rate": 8.225714285714288e-06,
332
+ "loss": 0.4399,
333
+ "step": 1125
334
+ },
335
+ {
336
+ "epoch": 143.8,
337
+ "grad_norm": 1.8449666500091553,
338
+ "learning_rate": 8.154285714285715e-06,
339
+ "loss": 0.4329,
340
+ "step": 1150
341
+ },
342
+ {
343
+ "epoch": 146.93333333333334,
344
+ "grad_norm": 1.9097468852996826,
345
+ "learning_rate": 8.082857142857144e-06,
346
+ "loss": 0.4527,
347
+ "step": 1175
348
+ },
349
+ {
350
+ "epoch": 150.0,
351
+ "grad_norm": 3.892838716506958,
352
+ "learning_rate": 8.011428571428573e-06,
353
+ "loss": 0.4448,
354
+ "step": 1200
355
+ },
356
+ {
357
+ "epoch": 153.13333333333333,
358
+ "grad_norm": 2.1518826484680176,
359
+ "learning_rate": 7.94e-06,
360
+ "loss": 0.4412,
361
+ "step": 1225
362
+ },
363
+ {
364
+ "epoch": 156.26666666666668,
365
+ "grad_norm": 1.5322662591934204,
366
+ "learning_rate": 7.86857142857143e-06,
367
+ "loss": 0.4388,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 159.4,
372
+ "grad_norm": 1.4961107969284058,
373
+ "learning_rate": 7.797142857142858e-06,
374
+ "loss": 0.4363,
375
+ "step": 1275
376
+ },
377
+ {
378
+ "epoch": 162.53333333333333,
379
+ "grad_norm": 1.8992841243743896,
380
+ "learning_rate": 7.725714285714286e-06,
381
+ "loss": 0.4474,
382
+ "step": 1300
383
+ },
384
+ {
385
+ "epoch": 165.66666666666666,
386
+ "grad_norm": 1.5015554428100586,
387
+ "learning_rate": 7.654285714285715e-06,
388
+ "loss": 0.4327,
389
+ "step": 1325
390
+ },
391
+ {
392
+ "epoch": 168.8,
393
+ "grad_norm": 2.0730693340301514,
394
+ "learning_rate": 7.5828571428571444e-06,
395
+ "loss": 0.4348,
396
+ "step": 1350
397
+ },
398
+ {
399
+ "epoch": 171.93333333333334,
400
+ "grad_norm": 2.0838747024536133,
401
+ "learning_rate": 7.511428571428572e-06,
402
+ "loss": 0.4393,
403
+ "step": 1375
404
+ },
405
+ {
406
+ "epoch": 175.0,
407
+ "grad_norm": 4.3804030418396,
408
+ "learning_rate": 7.440000000000001e-06,
409
+ "loss": 0.4386,
410
+ "step": 1400
411
+ },
412
+ {
413
+ "epoch": 178.13333333333333,
414
+ "grad_norm": 1.8927189111709595,
415
+ "learning_rate": 7.36857142857143e-06,
416
+ "loss": 0.4318,
417
+ "step": 1425
418
+ },
419
+ {
420
+ "epoch": 181.26666666666668,
421
+ "grad_norm": 1.5456620454788208,
422
+ "learning_rate": 7.297142857142858e-06,
423
+ "loss": 0.4336,
424
+ "step": 1450
425
+ },
426
+ {
427
+ "epoch": 184.4,
428
+ "grad_norm": 2.722612142562866,
429
+ "learning_rate": 7.225714285714286e-06,
430
+ "loss": 0.4281,
431
+ "step": 1475
432
+ },
433
+ {
434
+ "epoch": 187.53333333333333,
435
+ "grad_norm": 1.9484314918518066,
436
+ "learning_rate": 7.154285714285715e-06,
437
+ "loss": 0.4312,
438
+ "step": 1500
439
+ },
440
+ {
441
+ "epoch": 190.66666666666666,
442
+ "grad_norm": 2.101043224334717,
443
+ "learning_rate": 7.082857142857143e-06,
444
+ "loss": 0.427,
445
+ "step": 1525
446
+ },
447
+ {
448
+ "epoch": 193.8,
449
+ "grad_norm": 1.9785490036010742,
450
+ "learning_rate": 7.011428571428572e-06,
451
+ "loss": 0.4298,
452
+ "step": 1550
453
+ },
454
+ {
455
+ "epoch": 196.93333333333334,
456
+ "grad_norm": 2.319054126739502,
457
+ "learning_rate": 6.9400000000000005e-06,
458
+ "loss": 0.4376,
459
+ "step": 1575
460
+ },
461
+ {
462
+ "epoch": 200.0,
463
+ "grad_norm": 1.3612741231918335,
464
+ "learning_rate": 6.868571428571429e-06,
465
+ "loss": 0.4217,
466
+ "step": 1600
467
+ },
468
+ {
469
+ "epoch": 203.13333333333333,
470
+ "grad_norm": 2.128363847732544,
471
+ "learning_rate": 6.797142857142858e-06,
472
+ "loss": 0.4217,
473
+ "step": 1625
474
+ },
475
+ {
476
+ "epoch": 206.26666666666668,
477
+ "grad_norm": 1.7985234260559082,
478
+ "learning_rate": 6.725714285714287e-06,
479
+ "loss": 0.4147,
480
+ "step": 1650
481
+ },
482
+ {
483
+ "epoch": 209.4,
484
+ "grad_norm": 1.3478573560714722,
485
+ "learning_rate": 6.654285714285716e-06,
486
+ "loss": 0.4357,
487
+ "step": 1675
488
+ },
489
+ {
490
+ "epoch": 212.53333333333333,
491
+ "grad_norm": 1.5389248132705688,
492
+ "learning_rate": 6.582857142857143e-06,
493
+ "loss": 0.419,
494
+ "step": 1700
495
+ },
496
+ {
497
+ "epoch": 215.66666666666666,
498
+ "grad_norm": 1.9558783769607544,
499
+ "learning_rate": 6.511428571428572e-06,
500
+ "loss": 0.4289,
501
+ "step": 1725
502
+ },
503
+ {
504
+ "epoch": 218.8,
505
+ "grad_norm": 1.756585955619812,
506
+ "learning_rate": 6.440000000000001e-06,
507
+ "loss": 0.4168,
508
+ "step": 1750
509
+ },
510
+ {
511
+ "epoch": 221.93333333333334,
512
+ "grad_norm": 1.8744903802871704,
513
+ "learning_rate": 6.368571428571429e-06,
514
+ "loss": 0.4296,
515
+ "step": 1775
516
+ },
517
+ {
518
+ "epoch": 225.0,
519
+ "grad_norm": 1.133415699005127,
520
+ "learning_rate": 6.297142857142857e-06,
521
+ "loss": 0.4162,
522
+ "step": 1800
523
+ },
524
+ {
525
+ "epoch": 228.13333333333333,
526
+ "grad_norm": 2.819840908050537,
527
+ "learning_rate": 6.225714285714286e-06,
528
+ "loss": 0.4275,
529
+ "step": 1825
530
+ },
531
+ {
532
+ "epoch": 231.26666666666668,
533
+ "grad_norm": 1.5150210857391357,
534
+ "learning_rate": 6.1542857142857145e-06,
535
+ "loss": 0.4244,
536
+ "step": 1850
537
+ },
538
+ {
539
+ "epoch": 234.4,
540
+ "grad_norm": 2.184819459915161,
541
+ "learning_rate": 6.0828571428571435e-06,
542
+ "loss": 0.4282,
543
+ "step": 1875
544
+ },
545
+ {
546
+ "epoch": 237.53333333333333,
547
+ "grad_norm": 3.293454170227051,
548
+ "learning_rate": 6.011428571428572e-06,
549
+ "loss": 0.4215,
550
+ "step": 1900
551
+ },
552
+ {
553
+ "epoch": 240.66666666666666,
554
+ "grad_norm": 1.210433006286621,
555
+ "learning_rate": 5.94e-06,
556
+ "loss": 0.4103,
557
+ "step": 1925
558
+ },
559
+ {
560
+ "epoch": 243.8,
561
+ "grad_norm": 2.5027923583984375,
562
+ "learning_rate": 5.868571428571429e-06,
563
+ "loss": 0.4186,
564
+ "step": 1950
565
+ },
566
+ {
567
+ "epoch": 246.93333333333334,
568
+ "grad_norm": 1.9649789333343506,
569
+ "learning_rate": 5.797142857142858e-06,
570
+ "loss": 0.427,
571
+ "step": 1975
572
+ },
573
+ {
574
+ "epoch": 250.0,
575
+ "grad_norm": 5.899420261383057,
576
+ "learning_rate": 5.725714285714287e-06,
577
+ "loss": 0.4113,
578
+ "step": 2000
579
+ },
580
+ {
581
+ "epoch": 250.0,
582
+ "eval_loss": 0.4833647906780243,
583
+ "eval_runtime": 0.7095,
584
+ "eval_samples_per_second": 36.646,
585
+ "eval_steps_per_second": 5.638,
586
+ "step": 2000
587
+ }
588
+ ],
589
+ "logging_steps": 25,
590
+ "max_steps": 4000,
591
+ "num_input_tokens_seen": 0,
592
+ "num_train_epochs": 572,
593
+ "save_steps": 1000,
594
+ "stateful_callbacks": {
595
+ "TrainerControl": {
596
+ "args": {
597
+ "should_epoch_stop": false,
598
+ "should_evaluate": false,
599
+ "should_log": false,
600
+ "should_save": true,
601
+ "should_training_stop": false
602
+ },
603
+ "attributes": {}
604
+ }
605
+ },
606
+ "total_flos": 5140893067410672.0,
607
+ "train_batch_size": 16,
608
+ "trial_name": null,
609
+ "trial_params": null
610
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e653cd3b83482a0939ee4c8a207df9a996f44ce9dd82197c4ab6cde60cf2bb
3
+ size 5432
checkpoint-3000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-3000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.52.0.dev0",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-3000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.52.0.dev0"
9
+ }
checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d78d191d64b057ef7708236443ccd41bb24d44484f04b36d4fd46df31daa1c6
3
+ size 577789320
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb1e34922901fa49e90b622d421fcc022123b3db879e5c48cdc697dd3a9c2d3
3
+ size 1155772233
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a258daad5ac5df9273072647bd5fccfa416cdadb91b7707278c61cc1145a5964
3
+ size 14244
checkpoint-3000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0c9142a3b98e645e9dc3ffae8c602fb70b74046fea7664e6d081ebb3d0bbb58
3
+ size 988
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff56efc76c16a3b9a712527179ae61c8d6dfccc7e3a53f8c421d6329adacfbb
3
+ size 1064
checkpoint-3000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }
checkpoint-3000/spm_char.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fcc48f3e225f627b1641db410ceb0c8649bd2b0c982e150b03f8be3728ab560
3
+ size 238473
checkpoint-3000/tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "79": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "80": {
44
+ "content": "<ctc_blank>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "eos_token": "</s>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<mask>",
57
+ "model_max_length": 600,
58
+ "normalize": false,
59
+ "pad_token": "<pad>",
60
+ "processor_class": "SpeechT5Processor",
61
+ "sp_model_kwargs": {},
62
+ "tokenizer_class": "SpeechT5Tokenizer",
63
+ "unk_token": "<unk>"
64
+ }
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 3000,
3
+ "best_metric": 0.46799278259277344,
4
+ "best_model_checkpoint": "./speecht5_tts_mabama/checkpoint-3000",
5
+ "epoch": 375.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 3000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 3.1333333333333333,
14
+ "grad_norm": 13.092179298400879,
15
+ "learning_rate": 4.2000000000000006e-07,
16
+ "loss": 1.0978,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 6.266666666666667,
21
+ "grad_norm": 13.538804054260254,
22
+ "learning_rate": 9.200000000000001e-07,
23
+ "loss": 1.0057,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 9.4,
28
+ "grad_norm": 3.8923733234405518,
29
+ "learning_rate": 1.42e-06,
30
+ "loss": 0.8155,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 12.533333333333333,
35
+ "grad_norm": 2.569746732711792,
36
+ "learning_rate": 1.9200000000000003e-06,
37
+ "loss": 0.7921,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 15.666666666666666,
42
+ "grad_norm": 2.390493631362915,
43
+ "learning_rate": 2.42e-06,
44
+ "loss": 0.7531,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 18.8,
49
+ "grad_norm": 2.7168779373168945,
50
+ "learning_rate": 2.92e-06,
51
+ "loss": 0.7393,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 21.933333333333334,
56
+ "grad_norm": 10.27633285522461,
57
+ "learning_rate": 3.4200000000000007e-06,
58
+ "loss": 0.7292,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 25.0,
63
+ "grad_norm": 5.6921000480651855,
64
+ "learning_rate": 3.920000000000001e-06,
65
+ "loss": 0.6642,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 28.133333333333333,
70
+ "grad_norm": 2.6206777095794678,
71
+ "learning_rate": 4.42e-06,
72
+ "loss": 0.6555,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 31.266666666666666,
77
+ "grad_norm": 1.9396028518676758,
78
+ "learning_rate": 4.92e-06,
79
+ "loss": 0.6484,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 34.4,
84
+ "grad_norm": 3.44437575340271,
85
+ "learning_rate": 5.420000000000001e-06,
86
+ "loss": 0.6414,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 37.53333333333333,
91
+ "grad_norm": 2.729497194290161,
92
+ "learning_rate": 5.92e-06,
93
+ "loss": 0.6323,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 40.666666666666664,
98
+ "grad_norm": 2.3852877616882324,
99
+ "learning_rate": 6.42e-06,
100
+ "loss": 0.6073,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 43.8,
105
+ "grad_norm": 4.4287109375,
106
+ "learning_rate": 6.92e-06,
107
+ "loss": 0.6034,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 46.93333333333333,
112
+ "grad_norm": 2.1653966903686523,
113
+ "learning_rate": 7.420000000000001e-06,
114
+ "loss": 0.5865,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 50.0,
119
+ "grad_norm": 2.8120265007019043,
120
+ "learning_rate": 7.92e-06,
121
+ "loss": 0.5556,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 53.13333333333333,
126
+ "grad_norm": 2.0973806381225586,
127
+ "learning_rate": 8.42e-06,
128
+ "loss": 0.5416,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 56.266666666666666,
133
+ "grad_norm": 2.6723616123199463,
134
+ "learning_rate": 8.920000000000001e-06,
135
+ "loss": 0.5407,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 59.4,
140
+ "grad_norm": 2.1810383796691895,
141
+ "learning_rate": 9.42e-06,
142
+ "loss": 0.5174,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 62.53333333333333,
147
+ "grad_norm": 3.464071750640869,
148
+ "learning_rate": 9.920000000000002e-06,
149
+ "loss": 0.5327,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 65.66666666666667,
154
+ "grad_norm": 3.6148977279663086,
155
+ "learning_rate": 9.940000000000001e-06,
156
+ "loss": 0.5141,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 68.8,
161
+ "grad_norm": 2.5631027221679688,
162
+ "learning_rate": 9.86857142857143e-06,
163
+ "loss": 0.5246,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 71.93333333333334,
168
+ "grad_norm": 2.058468818664551,
169
+ "learning_rate": 9.797142857142858e-06,
170
+ "loss": 0.5065,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 75.0,
175
+ "grad_norm": 1.7559466361999512,
176
+ "learning_rate": 9.725714285714287e-06,
177
+ "loss": 0.4871,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 78.13333333333334,
182
+ "grad_norm": 2.653345823287964,
183
+ "learning_rate": 9.654285714285716e-06,
184
+ "loss": 0.4941,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 81.26666666666667,
189
+ "grad_norm": 2.612226724624634,
190
+ "learning_rate": 9.582857142857143e-06,
191
+ "loss": 0.4796,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 84.4,
196
+ "grad_norm": 1.7446099519729614,
197
+ "learning_rate": 9.511428571428572e-06,
198
+ "loss": 0.487,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 87.53333333333333,
203
+ "grad_norm": 2.627315044403076,
204
+ "learning_rate": 9.440000000000001e-06,
205
+ "loss": 0.4731,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 90.66666666666667,
210
+ "grad_norm": 2.4315383434295654,
211
+ "learning_rate": 9.368571428571428e-06,
212
+ "loss": 0.4812,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 93.8,
217
+ "grad_norm": 2.4056336879730225,
218
+ "learning_rate": 9.297142857142857e-06,
219
+ "loss": 0.468,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 96.93333333333334,
224
+ "grad_norm": 2.153116464614868,
225
+ "learning_rate": 9.225714285714286e-06,
226
+ "loss": 0.4829,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 100.0,
231
+ "grad_norm": 2.9421756267547607,
232
+ "learning_rate": 9.154285714285715e-06,
233
+ "loss": 0.4555,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 103.13333333333334,
238
+ "grad_norm": 1.6771883964538574,
239
+ "learning_rate": 9.082857142857143e-06,
240
+ "loss": 0.462,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 106.26666666666667,
245
+ "grad_norm": 2.9711899757385254,
246
+ "learning_rate": 9.011428571428572e-06,
247
+ "loss": 0.471,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 109.4,
252
+ "grad_norm": 1.922980546951294,
253
+ "learning_rate": 8.94e-06,
254
+ "loss": 0.4673,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 112.53333333333333,
259
+ "grad_norm": 2.49945068359375,
260
+ "learning_rate": 8.86857142857143e-06,
261
+ "loss": 0.4611,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 115.66666666666667,
266
+ "grad_norm": 2.646510362625122,
267
+ "learning_rate": 8.797142857142857e-06,
268
+ "loss": 0.4574,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 118.8,
273
+ "grad_norm": 1.7943354845046997,
274
+ "learning_rate": 8.725714285714286e-06,
275
+ "loss": 0.4658,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 121.93333333333334,
280
+ "grad_norm": 2.171827793121338,
281
+ "learning_rate": 8.654285714285715e-06,
282
+ "loss": 0.4561,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 125.0,
287
+ "grad_norm": 7.516489505767822,
288
+ "learning_rate": 8.582857142857144e-06,
289
+ "loss": 0.4472,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 125.0,
294
+ "eval_loss": 0.5154594779014587,
295
+ "eval_runtime": 0.7837,
296
+ "eval_samples_per_second": 33.175,
297
+ "eval_steps_per_second": 5.104,
298
+ "step": 1000
299
+ },
300
+ {
301
+ "epoch": 128.13333333333333,
302
+ "grad_norm": 2.5167343616485596,
303
+ "learning_rate": 8.511428571428571e-06,
304
+ "loss": 0.457,
305
+ "step": 1025
306
+ },
307
+ {
308
+ "epoch": 131.26666666666668,
309
+ "grad_norm": 3.3089983463287354,
310
+ "learning_rate": 8.44e-06,
311
+ "loss": 0.4456,
312
+ "step": 1050
313
+ },
314
+ {
315
+ "epoch": 134.4,
316
+ "grad_norm": 2.778348445892334,
317
+ "learning_rate": 8.36857142857143e-06,
318
+ "loss": 0.4612,
319
+ "step": 1075
320
+ },
321
+ {
322
+ "epoch": 137.53333333333333,
323
+ "grad_norm": 2.529778480529785,
324
+ "learning_rate": 8.297142857142859e-06,
325
+ "loss": 0.4429,
326
+ "step": 1100
327
+ },
328
+ {
329
+ "epoch": 140.66666666666666,
330
+ "grad_norm": 1.76685631275177,
331
+ "learning_rate": 8.225714285714288e-06,
332
+ "loss": 0.4399,
333
+ "step": 1125
334
+ },
335
+ {
336
+ "epoch": 143.8,
337
+ "grad_norm": 1.8449666500091553,
338
+ "learning_rate": 8.154285714285715e-06,
339
+ "loss": 0.4329,
340
+ "step": 1150
341
+ },
342
+ {
343
+ "epoch": 146.93333333333334,
344
+ "grad_norm": 1.9097468852996826,
345
+ "learning_rate": 8.082857142857144e-06,
346
+ "loss": 0.4527,
347
+ "step": 1175
348
+ },
349
+ {
350
+ "epoch": 150.0,
351
+ "grad_norm": 3.892838716506958,
352
+ "learning_rate": 8.011428571428573e-06,
353
+ "loss": 0.4448,
354
+ "step": 1200
355
+ },
356
+ {
357
+ "epoch": 153.13333333333333,
358
+ "grad_norm": 2.1518826484680176,
359
+ "learning_rate": 7.94e-06,
360
+ "loss": 0.4412,
361
+ "step": 1225
362
+ },
363
+ {
364
+ "epoch": 156.26666666666668,
365
+ "grad_norm": 1.5322662591934204,
366
+ "learning_rate": 7.86857142857143e-06,
367
+ "loss": 0.4388,
368
+ "step": 1250
369
+ },
370
+ {
371
+ "epoch": 159.4,
372
+ "grad_norm": 1.4961107969284058,
373
+ "learning_rate": 7.797142857142858e-06,
374
+ "loss": 0.4363,
375
+ "step": 1275
376
+ },
377
+ {
378
+ "epoch": 162.53333333333333,
379
+ "grad_norm": 1.8992841243743896,
380
+ "learning_rate": 7.725714285714286e-06,
381
+ "loss": 0.4474,
382
+ "step": 1300
383
+ },
384
+ {
385
+ "epoch": 165.66666666666666,
386
+ "grad_norm": 1.5015554428100586,
387
+ "learning_rate": 7.654285714285715e-06,
388
+ "loss": 0.4327,
389
+ "step": 1325
390
+ },
391
+ {
392
+ "epoch": 168.8,
393
+ "grad_norm": 2.0730693340301514,
394
+ "learning_rate": 7.5828571428571444e-06,
395
+ "loss": 0.4348,
396
+ "step": 1350
397
+ },
398
+ {
399
+ "epoch": 171.93333333333334,
400
+ "grad_norm": 2.0838747024536133,
401
+ "learning_rate": 7.511428571428572e-06,
402
+ "loss": 0.4393,
403
+ "step": 1375
404
+ },
405
+ {
406
+ "epoch": 175.0,
407
+ "grad_norm": 4.3804030418396,
408
+ "learning_rate": 7.440000000000001e-06,
409
+ "loss": 0.4386,
410
+ "step": 1400
411
+ },
412
+ {
413
+ "epoch": 178.13333333333333,
414
+ "grad_norm": 1.8927189111709595,
415
+ "learning_rate": 7.36857142857143e-06,
416
+ "loss": 0.4318,
417
+ "step": 1425
418
+ },
419
+ {
420
+ "epoch": 181.26666666666668,
421
+ "grad_norm": 1.5456620454788208,
422
+ "learning_rate": 7.297142857142858e-06,
423
+ "loss": 0.4336,
424
+ "step": 1450
425
+ },
426
+ {
427
+ "epoch": 184.4,
428
+ "grad_norm": 2.722612142562866,
429
+ "learning_rate": 7.225714285714286e-06,
430
+ "loss": 0.4281,
431
+ "step": 1475
432
+ },
433
+ {
434
+ "epoch": 187.53333333333333,
435
+ "grad_norm": 1.9484314918518066,
436
+ "learning_rate": 7.154285714285715e-06,
437
+ "loss": 0.4312,
438
+ "step": 1500
439
+ },
440
+ {
441
+ "epoch": 190.66666666666666,
442
+ "grad_norm": 2.101043224334717,
443
+ "learning_rate": 7.082857142857143e-06,
444
+ "loss": 0.427,
445
+ "step": 1525
446
+ },
447
+ {
448
+ "epoch": 193.8,
449
+ "grad_norm": 1.9785490036010742,
450
+ "learning_rate": 7.011428571428572e-06,
451
+ "loss": 0.4298,
452
+ "step": 1550
453
+ },
454
+ {
455
+ "epoch": 196.93333333333334,
456
+ "grad_norm": 2.319054126739502,
457
+ "learning_rate": 6.9400000000000005e-06,
458
+ "loss": 0.4376,
459
+ "step": 1575
460
+ },
461
+ {
462
+ "epoch": 200.0,
463
+ "grad_norm": 1.3612741231918335,
464
+ "learning_rate": 6.868571428571429e-06,
465
+ "loss": 0.4217,
466
+ "step": 1600
467
+ },
468
+ {
469
+ "epoch": 203.13333333333333,
470
+ "grad_norm": 2.128363847732544,
471
+ "learning_rate": 6.797142857142858e-06,
472
+ "loss": 0.4217,
473
+ "step": 1625
474
+ },
475
+ {
476
+ "epoch": 206.26666666666668,
477
+ "grad_norm": 1.7985234260559082,
478
+ "learning_rate": 6.725714285714287e-06,
479
+ "loss": 0.4147,
480
+ "step": 1650
481
+ },
482
+ {
483
+ "epoch": 209.4,
484
+ "grad_norm": 1.3478573560714722,
485
+ "learning_rate": 6.654285714285716e-06,
486
+ "loss": 0.4357,
487
+ "step": 1675
488
+ },
489
+ {
490
+ "epoch": 212.53333333333333,
491
+ "grad_norm": 1.5389248132705688,
492
+ "learning_rate": 6.582857142857143e-06,
493
+ "loss": 0.419,
494
+ "step": 1700
495
+ },
496
+ {
497
+ "epoch": 215.66666666666666,
498
+ "grad_norm": 1.9558783769607544,
499
+ "learning_rate": 6.511428571428572e-06,
500
+ "loss": 0.4289,
501
+ "step": 1725
502
+ },
503
+ {
504
+ "epoch": 218.8,
505
+ "grad_norm": 1.756585955619812,
506
+ "learning_rate": 6.440000000000001e-06,
507
+ "loss": 0.4168,
508
+ "step": 1750
509
+ },
510
+ {
511
+ "epoch": 221.93333333333334,
512
+ "grad_norm": 1.8744903802871704,
513
+ "learning_rate": 6.368571428571429e-06,
514
+ "loss": 0.4296,
515
+ "step": 1775
516
+ },
517
+ {
518
+ "epoch": 225.0,
519
+ "grad_norm": 1.133415699005127,
520
+ "learning_rate": 6.297142857142857e-06,
521
+ "loss": 0.4162,
522
+ "step": 1800
523
+ },
524
+ {
525
+ "epoch": 228.13333333333333,
526
+ "grad_norm": 2.819840908050537,
527
+ "learning_rate": 6.225714285714286e-06,
528
+ "loss": 0.4275,
529
+ "step": 1825
530
+ },
531
+ {
532
+ "epoch": 231.26666666666668,
533
+ "grad_norm": 1.5150210857391357,
534
+ "learning_rate": 6.1542857142857145e-06,
535
+ "loss": 0.4244,
536
+ "step": 1850
537
+ },
538
+ {
539
+ "epoch": 234.4,
540
+ "grad_norm": 2.184819459915161,
541
+ "learning_rate": 6.0828571428571435e-06,
542
+ "loss": 0.4282,
543
+ "step": 1875
544
+ },
545
+ {
546
+ "epoch": 237.53333333333333,
547
+ "grad_norm": 3.293454170227051,
548
+ "learning_rate": 6.011428571428572e-06,
549
+ "loss": 0.4215,
550
+ "step": 1900
551
+ },
552
+ {
553
+ "epoch": 240.66666666666666,
554
+ "grad_norm": 1.210433006286621,
555
+ "learning_rate": 5.94e-06,
556
+ "loss": 0.4103,
557
+ "step": 1925
558
+ },
559
+ {
560
+ "epoch": 243.8,
561
+ "grad_norm": 2.5027923583984375,
562
+ "learning_rate": 5.868571428571429e-06,
563
+ "loss": 0.4186,
564
+ "step": 1950
565
+ },
566
+ {
567
+ "epoch": 246.93333333333334,
568
+ "grad_norm": 1.9649789333343506,
569
+ "learning_rate": 5.797142857142858e-06,
570
+ "loss": 0.427,
571
+ "step": 1975
572
+ },
573
+ {
574
+ "epoch": 250.0,
575
+ "grad_norm": 5.899420261383057,
576
+ "learning_rate": 5.725714285714287e-06,
577
+ "loss": 0.4113,
578
+ "step": 2000
579
+ },
580
+ {
581
+ "epoch": 250.0,
582
+ "eval_loss": 0.4833647906780243,
583
+ "eval_runtime": 0.7095,
584
+ "eval_samples_per_second": 36.646,
585
+ "eval_steps_per_second": 5.638,
586
+ "step": 2000
587
+ },
588
+ {
589
+ "epoch": 253.13333333333333,
590
+ "grad_norm": 2.0845134258270264,
591
+ "learning_rate": 5.654285714285714e-06,
592
+ "loss": 0.4168,
593
+ "step": 2025
594
+ },
595
+ {
596
+ "epoch": 256.26666666666665,
597
+ "grad_norm": 1.3729593753814697,
598
+ "learning_rate": 5.582857142857143e-06,
599
+ "loss": 0.4099,
600
+ "step": 2050
601
+ },
602
+ {
603
+ "epoch": 259.4,
604
+ "grad_norm": 1.8317629098892212,
605
+ "learning_rate": 5.511428571428572e-06,
606
+ "loss": 0.4136,
607
+ "step": 2075
608
+ },
609
+ {
610
+ "epoch": 262.53333333333336,
611
+ "grad_norm": 1.6238123178482056,
612
+ "learning_rate": 5.4400000000000004e-06,
613
+ "loss": 0.4204,
614
+ "step": 2100
615
+ },
616
+ {
617
+ "epoch": 265.6666666666667,
618
+ "grad_norm": 1.6968961954116821,
619
+ "learning_rate": 5.368571428571429e-06,
620
+ "loss": 0.4119,
621
+ "step": 2125
622
+ },
623
+ {
624
+ "epoch": 268.8,
625
+ "grad_norm": 2.1868855953216553,
626
+ "learning_rate": 5.297142857142858e-06,
627
+ "loss": 0.4114,
628
+ "step": 2150
629
+ },
630
+ {
631
+ "epoch": 271.93333333333334,
632
+ "grad_norm": 1.3070896863937378,
633
+ "learning_rate": 5.225714285714286e-06,
634
+ "loss": 0.4108,
635
+ "step": 2175
636
+ },
637
+ {
638
+ "epoch": 275.0,
639
+ "grad_norm": 1.977940559387207,
640
+ "learning_rate": 5.154285714285715e-06,
641
+ "loss": 0.4045,
642
+ "step": 2200
643
+ },
644
+ {
645
+ "epoch": 278.1333333333333,
646
+ "grad_norm": 1.6485978364944458,
647
+ "learning_rate": 5.082857142857144e-06,
648
+ "loss": 0.4119,
649
+ "step": 2225
650
+ },
651
+ {
652
+ "epoch": 281.26666666666665,
653
+ "grad_norm": 1.9459550380706787,
654
+ "learning_rate": 5.011428571428571e-06,
655
+ "loss": 0.411,
656
+ "step": 2250
657
+ },
658
+ {
659
+ "epoch": 284.4,
660
+ "grad_norm": 1.5531017780303955,
661
+ "learning_rate": 4.94e-06,
662
+ "loss": 0.4083,
663
+ "step": 2275
664
+ },
665
+ {
666
+ "epoch": 287.53333333333336,
667
+ "grad_norm": 1.232640027999878,
668
+ "learning_rate": 4.868571428571429e-06,
669
+ "loss": 0.4121,
670
+ "step": 2300
671
+ },
672
+ {
673
+ "epoch": 290.6666666666667,
674
+ "grad_norm": 7.107569217681885,
675
+ "learning_rate": 4.800000000000001e-06,
676
+ "loss": 0.4013,
677
+ "step": 2325
678
+ },
679
+ {
680
+ "epoch": 293.8,
681
+ "grad_norm": 1.387934684753418,
682
+ "learning_rate": 4.728571428571429e-06,
683
+ "loss": 0.4135,
684
+ "step": 2350
685
+ },
686
+ {
687
+ "epoch": 296.93333333333334,
688
+ "grad_norm": 1.8122384548187256,
689
+ "learning_rate": 4.657142857142857e-06,
690
+ "loss": 0.4025,
691
+ "step": 2375
692
+ },
693
+ {
694
+ "epoch": 300.0,
695
+ "grad_norm": 3.2206528186798096,
696
+ "learning_rate": 4.585714285714286e-06,
697
+ "loss": 0.4055,
698
+ "step": 2400
699
+ },
700
+ {
701
+ "epoch": 303.1333333333333,
702
+ "grad_norm": 1.6222842931747437,
703
+ "learning_rate": 4.514285714285714e-06,
704
+ "loss": 0.4125,
705
+ "step": 2425
706
+ },
707
+ {
708
+ "epoch": 306.26666666666665,
709
+ "grad_norm": 1.4375584125518799,
710
+ "learning_rate": 4.442857142857143e-06,
711
+ "loss": 0.4033,
712
+ "step": 2450
713
+ },
714
+ {
715
+ "epoch": 309.4,
716
+ "grad_norm": 1.173034906387329,
717
+ "learning_rate": 4.371428571428572e-06,
718
+ "loss": 0.4081,
719
+ "step": 2475
720
+ },
721
+ {
722
+ "epoch": 312.53333333333336,
723
+ "grad_norm": 1.9508713483810425,
724
+ "learning_rate": 4.3e-06,
725
+ "loss": 0.4126,
726
+ "step": 2500
727
+ },
728
+ {
729
+ "epoch": 315.6666666666667,
730
+ "grad_norm": 1.6111533641815186,
731
+ "learning_rate": 4.228571428571429e-06,
732
+ "loss": 0.3956,
733
+ "step": 2525
734
+ },
735
+ {
736
+ "epoch": 318.8,
737
+ "grad_norm": 2.0711958408355713,
738
+ "learning_rate": 4.1571428571428575e-06,
739
+ "loss": 0.4079,
740
+ "step": 2550
741
+ },
742
+ {
743
+ "epoch": 321.93333333333334,
744
+ "grad_norm": 2.312619924545288,
745
+ "learning_rate": 4.0857142857142865e-06,
746
+ "loss": 0.4172,
747
+ "step": 2575
748
+ },
749
+ {
750
+ "epoch": 325.0,
751
+ "grad_norm": 8.329635620117188,
752
+ "learning_rate": 4.014285714285715e-06,
753
+ "loss": 0.3956,
754
+ "step": 2600
755
+ },
756
+ {
757
+ "epoch": 328.1333333333333,
758
+ "grad_norm": 6.655773639678955,
759
+ "learning_rate": 3.942857142857143e-06,
760
+ "loss": 0.3998,
761
+ "step": 2625
762
+ },
763
+ {
764
+ "epoch": 331.26666666666665,
765
+ "grad_norm": 1.7531079053878784,
766
+ "learning_rate": 3.871428571428572e-06,
767
+ "loss": 0.4023,
768
+ "step": 2650
769
+ },
770
+ {
771
+ "epoch": 334.4,
772
+ "grad_norm": 2.5502614974975586,
773
+ "learning_rate": 3.8000000000000005e-06,
774
+ "loss": 0.4026,
775
+ "step": 2675
776
+ },
777
+ {
778
+ "epoch": 337.53333333333336,
779
+ "grad_norm": 1.471871256828308,
780
+ "learning_rate": 3.7285714285714286e-06,
781
+ "loss": 0.3981,
782
+ "step": 2700
783
+ },
784
+ {
785
+ "epoch": 340.6666666666667,
786
+ "grad_norm": 2.094290018081665,
787
+ "learning_rate": 3.6571428571428576e-06,
788
+ "loss": 0.4006,
789
+ "step": 2725
790
+ },
791
+ {
792
+ "epoch": 343.8,
793
+ "grad_norm": 1.3232810497283936,
794
+ "learning_rate": 3.5857142857142862e-06,
795
+ "loss": 0.4013,
796
+ "step": 2750
797
+ },
798
+ {
799
+ "epoch": 346.93333333333334,
800
+ "grad_norm": 1.5902683734893799,
801
+ "learning_rate": 3.5142857142857144e-06,
802
+ "loss": 0.4042,
803
+ "step": 2775
804
+ },
805
+ {
806
+ "epoch": 350.0,
807
+ "grad_norm": 5.186419486999512,
808
+ "learning_rate": 3.4428571428571434e-06,
809
+ "loss": 0.3843,
810
+ "step": 2800
811
+ },
812
+ {
813
+ "epoch": 353.1333333333333,
814
+ "grad_norm": 2.3405115604400635,
815
+ "learning_rate": 3.3714285714285716e-06,
816
+ "loss": 0.409,
817
+ "step": 2825
818
+ },
819
+ {
820
+ "epoch": 356.26666666666665,
821
+ "grad_norm": 1.1804980039596558,
822
+ "learning_rate": 3.3000000000000006e-06,
823
+ "loss": 0.405,
824
+ "step": 2850
825
+ },
826
+ {
827
+ "epoch": 359.4,
828
+ "grad_norm": 1.596712589263916,
829
+ "learning_rate": 3.2285714285714288e-06,
830
+ "loss": 0.4098,
831
+ "step": 2875
832
+ },
833
+ {
834
+ "epoch": 362.53333333333336,
835
+ "grad_norm": 1.9429064989089966,
836
+ "learning_rate": 3.1571428571428573e-06,
837
+ "loss": 0.413,
838
+ "step": 2900
839
+ },
840
+ {
841
+ "epoch": 365.6666666666667,
842
+ "grad_norm": 1.3636008501052856,
843
+ "learning_rate": 3.085714285714286e-06,
844
+ "loss": 0.394,
845
+ "step": 2925
846
+ },
847
+ {
848
+ "epoch": 368.8,
849
+ "grad_norm": 1.2349225282669067,
850
+ "learning_rate": 3.0142857142857145e-06,
851
+ "loss": 0.3964,
852
+ "step": 2950
853
+ },
854
+ {
855
+ "epoch": 371.93333333333334,
856
+ "grad_norm": 1.3793219327926636,
857
+ "learning_rate": 2.9428571428571427e-06,
858
+ "loss": 0.3958,
859
+ "step": 2975
860
+ },
861
+ {
862
+ "epoch": 375.0,
863
+ "grad_norm": 7.785330772399902,
864
+ "learning_rate": 2.8714285714285717e-06,
865
+ "loss": 0.3906,
866
+ "step": 3000
867
+ },
868
+ {
869
+ "epoch": 375.0,
870
+ "eval_loss": 0.46799278259277344,
871
+ "eval_runtime": 0.7043,
872
+ "eval_samples_per_second": 36.916,
873
+ "eval_steps_per_second": 5.679,
874
+ "step": 3000
875
+ }
876
+ ],
877
+ "logging_steps": 25,
878
+ "max_steps": 4000,
879
+ "num_input_tokens_seen": 0,
880
+ "num_train_epochs": 572,
881
+ "save_steps": 1000,
882
+ "stateful_callbacks": {
883
+ "TrainerControl": {
884
+ "args": {
885
+ "should_epoch_stop": false,
886
+ "should_evaluate": false,
887
+ "should_log": false,
888
+ "should_save": true,
889
+ "should_training_stop": false
890
+ },
891
+ "attributes": {}
892
+ }
893
+ },
894
+ "total_flos": 7721257243235184.0,
895
+ "train_batch_size": 16,
896
+ "trial_name": null,
897
+ "trial_params": null
898
+ }
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e653cd3b83482a0939ee4c8a207df9a996f44ce9dd82197c4ab6cde60cf2bb
3
+ size 5432
checkpoint-4000/added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<ctc_blank>": 80,
3
+ "<mask>": 79
4
+ }
checkpoint-4000/config.json ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SpeechT5ForTextToSpeech"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 0,
9
+ "conv_bias": false,
10
+ "conv_dim": [
11
+ 512,
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512
18
+ ],
19
+ "conv_kernel": [
20
+ 10,
21
+ 3,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 2,
26
+ 2
27
+ ],
28
+ "conv_stride": [
29
+ 5,
30
+ 2,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2
36
+ ],
37
+ "decoder_attention_heads": 12,
38
+ "decoder_ffn_dim": 3072,
39
+ "decoder_layerdrop": 0.1,
40
+ "decoder_layers": 6,
41
+ "decoder_start_token_id": 2,
42
+ "encoder_attention_heads": 12,
43
+ "encoder_ffn_dim": 3072,
44
+ "encoder_layerdrop": 0.1,
45
+ "encoder_layers": 12,
46
+ "encoder_max_relative_position": 160,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "guided_attention_loss_num_heads": 2,
52
+ "guided_attention_loss_scale": 10.0,
53
+ "guided_attention_loss_sigma": 0.4,
54
+ "hidden_act": "gelu",
55
+ "hidden_dropout": 0.1,
56
+ "hidden_size": 768,
57
+ "initializer_range": 0.02,
58
+ "is_encoder_decoder": true,
59
+ "layer_norm_eps": 1e-05,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_min_masks": 0,
62
+ "mask_feature_prob": 0.0,
63
+ "mask_time_length": 10,
64
+ "mask_time_min_masks": 2,
65
+ "mask_time_prob": 0.05,
66
+ "max_length": null,
67
+ "max_speech_positions": 1876,
68
+ "max_text_positions": 600,
69
+ "model_type": "speecht5",
70
+ "num_conv_pos_embedding_groups": 16,
71
+ "num_conv_pos_embeddings": 128,
72
+ "num_feat_extract_layers": 7,
73
+ "num_mel_bins": 80,
74
+ "pad_token_id": 1,
75
+ "positional_dropout": 0.1,
76
+ "reduction_factor": 2,
77
+ "scale_embedding": false,
78
+ "speaker_embedding_dim": 512,
79
+ "speech_decoder_postnet_dropout": 0.5,
80
+ "speech_decoder_postnet_kernel": 5,
81
+ "speech_decoder_postnet_layers": 5,
82
+ "speech_decoder_postnet_units": 256,
83
+ "speech_decoder_prenet_dropout": 0.5,
84
+ "speech_decoder_prenet_layers": 2,
85
+ "speech_decoder_prenet_units": 256,
86
+ "torch_dtype": "float32",
87
+ "transformers_version": "4.52.0.dev0",
88
+ "use_cache": false,
89
+ "use_guided_attention_loss": true,
90
+ "vocab_size": 81
91
+ }
checkpoint-4000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "max_length": 1876,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.52.0.dev0"
9
+ }
checkpoint-4000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:055e59911edf63563dfc4431a4301a869cc74effda4ba5c905ace376e831bd5d
3
+ size 577789320
checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e74b50ecc65d621a0c3fd3c8d0516e0345843175eb2a55af482c69d69da162e7
3
+ size 1155772233
checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ff95056306e75aaca85257572ca65ded44b3fc874ae842724682d1ad4067c2
3
+ size 14244
checkpoint-4000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:030f559b5aedef78935dd7632eb67ec4527791e9aca3eb758b902243f597abd2
3
+ size 988
checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7efbf80617c96c78286826ce59d9a12c86da62d7631874b3d6364a8e993ada60
3
+ size 1064
checkpoint-4000/special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "mask_token": {
5
+ "content": "<mask>",
6
+ "lstrip": true,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "pad_token": "<pad>",
12
+ "unk_token": "<unk>"
13
+ }