miosipof committed (verified)
Commit 5b017ec · 1 Parent(s): a832978

Training in progress, epoch 1

Files changed (5)
  1. README.md +82 -0
  2. config.json +160 -0
  3. model.safetensors +3 -0
  4. preprocessor_config.json +14 -0
  5. training_args.bin +3 -0
README.md ADDED
@@ -0,0 +1,82 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: openai/whisper-small
+ tags:
+ - generated_from_trainer
+ datasets:
+ - balbus-classifier
+ metrics:
+ - accuracy
+ model-index:
+ - name: miosipof/whisper-small-ft-balbus-sep28k-v1
+   results:
+   - task:
+       name: Audio Classification
+       type: audio-classification
+     dataset:
+       name: Apple dataset
+       type: balbus-classifier
+       config: default
+       split: train
+       args: default
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.7953596287703016
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # miosipof/whisper-small-ft-balbus-sep28k-v1
+
+ This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the Apple dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.5668
+ - Accuracy: 0.7954
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (a `TrainingArguments` sketch follows this list):
+ - learning_rate: 0.0001
+ - train_batch_size: 32
+ - eval_batch_size: 32
+ - seed: 42
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 64
+ - optimizer: adamw_torch with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_ratio: 0.2
+ - num_epochs: 3
+ - mixed_precision_training: Native AMP
+
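For readers who want to reproduce this setup, here is a minimal sketch of how the values above map onto `transformers.TrainingArguments`. The `output_dir` name is an assumption, and the surrounding training script (model, dataset, metric wiring) is not part of this card:

```python
# Sketch only: the hyperparameters above expressed as TrainingArguments.
# betas=(0.9, 0.999) and epsilon=1e-08 are the adamw_torch defaults.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="whisper-small-ft-balbus-sep28k-v1",  # assumed name
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    seed=42,
    gradient_accumulation_steps=2,  # 32 x 2 = total train batch size 64
    optim="adamw_torch",
    lr_scheduler_type="linear",
    warmup_ratio=0.2,
    num_train_epochs=3,
    fp16=True,  # "Native AMP" mixed precision
)
```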
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | No log        | 1.0   | 404  | 0.4751          | 0.7748   |
+ | 0.494         | 2.0   | 808  | 0.4533          | 0.7901   |
+ | 0.3256        | 3.0   | 1212 | 0.5668          | 0.7954   |
+
+
+ ### Framework versions
+
+ - Transformers 4.48.1
+ - PyTorch 2.2.0
+ - Datasets 3.2.0
+ - Tokenizers 0.21.0
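Since the card's "Intended uses & limitations" section is still a stub, here is a minimal inference sketch, assuming the model is used through the standard `audio-classification` pipeline. The audio path is a placeholder, and the returned labels are the raw `"0"`/`"1"` ids from the config.json below:

```python
# Minimal sketch, assuming pipeline-based inference; "speech.wav" is a placeholder.
from transformers import pipeline

classifier = pipeline(
    "audio-classification",
    model="miosipof/whisper-small-ft-balbus-sep28k-v1",
)
print(classifier("speech.wav"))  # e.g. [{"label": "1", "score": ...}, ...]
```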
config.json ADDED
@@ -0,0 +1,160 @@
+ {
+   "_name_or_path": "openai/whisper-small",
+   "activation_dropout": 0.0,
+   "activation_function": "gelu",
+   "apply_spec_augment": false,
+   "architectures": [
+     "WhisperForAudioClassification"
+   ],
+   "attention_dropout": 0.0,
+   "begin_suppress_tokens": [
+     220,
+     50257
+   ],
+   "bos_token_id": 50257,
+   "classifier_proj_size": 256,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 3072,
+   "decoder_layerdrop": 0.0,
+   "decoder_layers": 12,
+   "decoder_start_token_id": 50258,
+   "dropout": 0.0,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 3072,
+   "encoder_layerdrop": 0.0,
+   "encoder_layers": 12,
+   "eos_token_id": 50257,
+   "forced_decoder_ids": [
+     [
+       1,
+       50259
+     ],
+     [
+       2,
+       50359
+     ],
+     [
+       3,
+       50363
+     ]
+   ],
+   "id2label": {
+     "0": "0",
+     "1": "1"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "0": "0",
+     "1": "1"
+   },
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "max_length": 448,
+   "max_source_positions": 1500,
+   "max_target_positions": 448,
+   "median_filter_width": 7,
+   "model_type": "whisper",
+   "num_hidden_layers": 12,
+   "num_mel_bins": 80,
+   "pad_token_id": 50257,
+   "scale_embedding": false,
+   "suppress_tokens": [
+     1,
+     2,
+     7,
+     8,
+     9,
+     10,
+     14,
+     25,
+     26,
+     27,
+     28,
+     29,
+     31,
+     58,
+     59,
+     60,
+     61,
+     62,
+     63,
+     90,
+     91,
+     92,
+     93,
+     359,
+     503,
+     522,
+     542,
+     873,
+     893,
+     902,
+     918,
+     922,
+     931,
+     1350,
+     1853,
+     1982,
+     2460,
+     2627,
+     3246,
+     3253,
+     3268,
+     3536,
+     3846,
+     3961,
+     4183,
+     4667,
+     6585,
+     6647,
+     7273,
+     9061,
+     9383,
+     10428,
+     10929,
+     11938,
+     12033,
+     12331,
+     12562,
+     13793,
+     14157,
+     14635,
+     15265,
+     15618,
+     16553,
+     16604,
+     18362,
+     18956,
+     20075,
+     21675,
+     22520,
+     26130,
+     26161,
+     26435,
+     28279,
+     29464,
+     31650,
+     32302,
+     32470,
+     36865,
+     42863,
+     47425,
+     49870,
+     50254,
+     50258,
+     50360,
+     50361,
+     50362
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.1",
+   "use_cache": true,
+   "use_weighted_layer_sum": false,
+   "vocab_size": 51865
+ }
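Two details worth noting in this config: `architectures` is `WhisperForAudioClassification`, which uses the Whisper encoder plus a classification head (the decoder fields are inherited from the base checkpoint), and `id2label`/`label2id` define a 2-way head. A minimal sketch, assuming the usual way such a config comes about:

```python
# Sketch: attaching a 2-label classification head to openai/whisper-small
# typically produces the id2label/label2id maps and classifier_proj_size above.
from transformers import WhisperForAudioClassification

model = WhisperForAudioClassification.from_pretrained(
    "openai/whisper-small",
    num_labels=2,
)
print(model.config.num_labels, model.config.classifier_proj_size)  # 2 256
```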
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa99db766664cbe909c0a8cae719531750ee0cb7345c4ad4ec3ac9b6231d72b8
+ size 353426360
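These three lines are a Git LFS pointer, not the weights themselves; the 353,426,360-byte safetensors file is resolved at download time. A short sketch, assuming `huggingface_hub` is used to fetch it directly:

```python
# Sketch: download the actual weights behind the LFS pointer above.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="miosipof/whisper-small-ft-balbus-sep28k-v1",
    filename="model.safetensors",
)
print(path)  # local cache path; the file size should match the pointer (353426360)
```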
preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "chunk_length": 30,
+   "feature_extractor_type": "WhisperFeatureExtractor",
+   "feature_size": 80,
+   "hop_length": 160,
+   "n_fft": 400,
+   "n_samples": 480000,
+   "nb_max_frames": 3000,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "WhisperProcessor",
+   "return_attention_mask": false,
+   "sampling_rate": 16000
+ }
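These values describe the standard Whisper front end: 80 log-mel features over 30-second windows at 16 kHz, i.e. 480,000 samples and 3,000 frames at a 160-sample hop. A short sketch, assuming a silent 5-second stand-in waveform:

```python
# Sketch: apply the preprocessor settings above to a dummy 16 kHz waveform.
import numpy as np
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "miosipof/whisper-small-ft-balbus-sep28k-v1"
)
waveform = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of silence (stand-in)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(inputs.input_features.shape)  # torch.Size([1, 80, 3000]): padded to 30 s
```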
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1947f1971528c3ca413572ff2ae5a1e2aa4e282216590c7946978a3765076158
+ size 5368