LAP-DEV committed on
Commit
f2e3ed5
·
verified ·
1 Parent(s): ac71e94

Delete modules/whisper/whisper_parameter.py

Browse files
Files changed (1) hide show
  1. modules/whisper/whisper_parameter.py +0 -381
modules/whisper/whisper_parameter.py DELETED
@@ -1,381 +0,0 @@
1
- from dataclasses import dataclass, fields
2
- import gradio as gr
3
- from typing import Optional, Dict
4
- import yaml
5
-
6
-
7
@dataclass
class WhisperParameters:
    """
    Container for the Gradio input components of every Whisper parameter.

    Holds the components themselves, i.e. the objects used "before" Gradio pre-processing.
    This data class is used to mitigate the key-value problem between Gradio components
    and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Int value that is used for decoding option.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        Compute type for transcription.
        See more info: https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    prompt_reset_on_temperature: gr.Slider
        Temperature threshold above which the prompt is reset (see the faster-whisper
        `prompt_reset_on_temperature` option). Presumably only meaningful when
        `condition_on_previous_text` is True.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those words correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step is using the Silero VAD model
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related with Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk,
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks shorter than
        min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related with Silero VAD. In the end of each speech chunk wait for
        min_silence_duration_ms before separating it.

    speech_pad_ms: gr.Number
        This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side.

    batch_size: gr.Number
        This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe.

    is_diarize: gr.Checkbox
        This parameter is related with whisperx. Boolean value that determines whether to diarize or not.

    hf_token: gr.Textbox
        This parameter is related with whisperx. Huggingface token is needed to download diarization models.
        Read more about: https://huggingface.co/pyannote/speaker-diarization-3.1#requirements

    diarization_device: gr.Dropdown
        This parameter is related with whisperx. Device to run diarization model.

    length_penalty: gr.Number
        This parameter is related to faster-whisper. Exponential length penalty constant.

    repetition_penalty: gr.Number
        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
        (set > 1 to penalize).

    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).

    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.

    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.

    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.

    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.

    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.

    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.

    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.

    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.

    chunk_length: gr.Number
        This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
        If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.

    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than
        this threshold (in seconds) when a possible hallucination is detected.

    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with.
        Has no effect if prefix is not None.

    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher
        than this value, the language is detected.

    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.

    is_bgm_separate: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.

    uvr_model_size: gr.Dropdown
        This parameter is related to UVR. UVR model size.

    uvr_device: gr.Dropdown
        This parameter is related to UVR. Device to run UVR model.

    uvr_segment_size: gr.Number
        This parameter is related to UVR. Segment size for UVR model.

    uvr_save_file: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to save the file or not.

    uvr_enable_offload: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
        after each transcription.

    whisper_enable_offload: gr.Checkbox
        This parameter is related to Whisper. Boolean value that determines whether to offload the Whisper model.

    diarization_enable_offload: gr.Checkbox
        This parameter is related to diarization. Boolean value that determines whether to offload the
        diarization model.
    """

    # NOTE: the declaration order below is significant. `as_list` emits the
    # components in this order and `as_value` forwards values positionally to
    # WhisperValues, so both classes must keep their fields aligned.
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    prompt_reset_on_temperature: gr.Slider
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    speech_pad_ms: gr.Number
    batch_size: gr.Number
    is_diarize: gr.Checkbox
    hf_token: gr.Textbox
    diarization_device: gr.Dropdown
    length_penalty: gr.Number
    repetition_penalty: gr.Number
    no_repeat_ngram_size: gr.Number
    prefix: gr.Textbox
    suppress_blank: gr.Checkbox
    suppress_tokens: gr.Textbox
    max_initial_timestamp: gr.Number
    word_timestamps: gr.Checkbox
    prepend_punctuations: gr.Textbox
    append_punctuations: gr.Textbox
    max_new_tokens: gr.Number
    chunk_length: gr.Number
    hallucination_silence_threshold: gr.Number
    hotwords: gr.Textbox
    language_detection_threshold: gr.Number
    language_detection_segments: gr.Number
    is_bgm_separate: gr.Checkbox
    uvr_model_size: gr.Dropdown
    uvr_device: gr.Dropdown
    uvr_segment_size: gr.Number
    uvr_save_file: gr.Checkbox
    uvr_enable_offload: gr.Checkbox
    whisper_enable_offload: gr.Checkbox
    diarization_enable_offload: gr.Checkbox

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list. Use in the Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components, in field declaration order
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def as_value(*args) -> 'WhisperValues':
        """
        Builds a WhisperValues from positional parameter values.
        The values are forwarded positionally, so they must follow the field order of WhisperValues.
        See more about Gradio processing: https://www.gradio.app/docs/components

        Returns
        ----------
        WhisperValues
            Data class that has values of parameters
        """
        return WhisperValues(*args)
259
-
260
-
261
@dataclass
class WhisperValues:
    """
    A data class to use Whisper parameters.

    Holds plain Python values (not Gradio components) for every Whisper, VAD, diarization and
    background-music-separation setting, each with a default.
    """

    # NOTE: field order matters, because instances may be built positionally
    # (e.g. `WhisperValues(*args)`) and `as_list` returns values in this order.
    model_size: str = "large-v2"
    lang: Optional[str] = None
    is_translate: bool = False
    beam_size: int = 5
    log_prob_threshold: float = -1.0
    no_speech_threshold: float = 0.6
    compute_type: str = "float16"
    best_of: int = 5
    patience: float = 1.0
    condition_on_previous_text: bool = True
    prompt_reset_on_temperature: float = 0.5
    initial_prompt: Optional[str] = None
    temperature: float = 0.0
    compression_ratio_threshold: float = 2.4
    vad_filter: bool = False
    threshold: float = 0.5
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    speech_pad_ms: int = 400
    batch_size: int = 24
    is_diarize: bool = False
    hf_token: str = ""
    diarization_device: str = "cuda"
    length_penalty: float = 1.0
    repetition_penalty: float = 1.0
    no_repeat_ngram_size: int = 0
    prefix: Optional[str] = None
    suppress_blank: bool = True
    suppress_tokens: Optional[str] = "[-1]"
    max_initial_timestamp: float = 0.0
    word_timestamps: bool = False
    prepend_punctuations: Optional[str] = "\"'“¿([{-"
    append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
    max_new_tokens: Optional[int] = None
    chunk_length: Optional[int] = 30
    hallucination_silence_threshold: Optional[float] = None
    hotwords: Optional[str] = None
    language_detection_threshold: Optional[float] = None
    language_detection_segments: int = 1
    is_bgm_separate: bool = False
    uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
    uvr_device: str = "cuda"
    uvr_segment_size: int = 256
    uvr_save_file: bool = False
    uvr_enable_offload: bool = True
    whisper_enable_offload: bool = True
    diarization_enable_offload: bool = True

    def to_yaml(self) -> Dict:
        """
        Groups the parameter values into a nested dictionary, one section per feature.

        Despite the name, this returns a plain dict, not YAML text; serialising it is left to the caller.
        A `lang` of None is written as "Automatic Detection", and empty `initial_prompt`, `prefix`
        and `hotwords` are written as None.

        NOTE(review): `compute_type`, `diarization_device` and `uvr_device` are not part of the
        output, while `hf_token` is included verbatim. Whether that is intentional cannot be
        determined from this class alone.

        Returns
        ----------
        A dict with the sections "whisper", "vad", "diarization" and "bgm_separation"
        """
        whisper_section = {
            "model_size": self.model_size,
            "lang": "Automatic Detection" if self.lang is None else self.lang,
            "is_translate": self.is_translate,
            "beam_size": self.beam_size,
            "log_prob_threshold": self.log_prob_threshold,
            "no_speech_threshold": self.no_speech_threshold,
            "best_of": self.best_of,
            "patience": self.patience,
            "condition_on_previous_text": self.condition_on_previous_text,
            "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
            # Empty strings are normalised to None for the free-text fields.
            "initial_prompt": self.initial_prompt or None,
            "temperature": self.temperature,
            "compression_ratio_threshold": self.compression_ratio_threshold,
            "batch_size": self.batch_size,
            "length_penalty": self.length_penalty,
            "repetition_penalty": self.repetition_penalty,
            "no_repeat_ngram_size": self.no_repeat_ngram_size,
            "prefix": self.prefix or None,
            "suppress_blank": self.suppress_blank,
            "suppress_tokens": self.suppress_tokens,
            "max_initial_timestamp": self.max_initial_timestamp,
            "word_timestamps": self.word_timestamps,
            "prepend_punctuations": self.prepend_punctuations,
            "append_punctuations": self.append_punctuations,
            "max_new_tokens": self.max_new_tokens,
            "chunk_length": self.chunk_length,
            "hallucination_silence_threshold": self.hallucination_silence_threshold,
            "hotwords": self.hotwords or None,
            "language_detection_threshold": self.language_detection_threshold,
            "language_detection_segments": self.language_detection_segments,
            "enable_offload": self.whisper_enable_offload,
        }
        vad_section = {
            "vad_filter": self.vad_filter,
            "threshold": self.threshold,
            "min_speech_duration_ms": self.min_speech_duration_ms,
            "max_speech_duration_s": self.max_speech_duration_s,
            "min_silence_duration_ms": self.min_silence_duration_ms,
            "speech_pad_ms": self.speech_pad_ms,
        }
        diarization_section = {
            "is_diarize": self.is_diarize,
            "hf_token": self.hf_token,
            "enable_offload": self.diarization_enable_offload,
        }
        bgm_section = {
            "is_separate_bgm": self.is_bgm_separate,
            "model_size": self.uvr_model_size,
            "segment_size": self.uvr_segment_size,
            "save_file": self.uvr_save_file,
            "enable_offload": self.uvr_enable_offload,
        }
        return {
            "whisper": whisper_section,
            "vad": vad_section,
            "diarization": diarization_section,
            "bgm_separation": bgm_section,
        }

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list

        Returns
        ----------
        A list of Whisper parameters, in field declaration order
        """
        return [getattr(self, f.name) for f in fields(self)]