Add word timestamps to Simple and reorder
Browse files
app.py
CHANGED
|
@@ -84,44 +84,49 @@ class WhisperTranscriber:
|
|
| 84 |
print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
|
| 85 |
|
| 86 |
# Entry function for the simple tab
|
| 87 |
-
def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# Entry function for the simple tab progress
|
| 91 |
-
def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 92 |
-
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize,
|
| 95 |
|
| 96 |
-
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
|
|
|
| 97 |
|
| 98 |
# Entry function for the full tab
|
| 99 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
append_punctuations: str, highlight_words: bool = False):
|
| 107 |
|
| 108 |
return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 109 |
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
|
|
|
| 110 |
initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
|
| 111 |
condition_on_previous_text, fp16, temperature_increment_on_fallback,
|
| 112 |
-
compression_ratio_threshold, logprob_threshold, no_speech_threshold
|
| 113 |
-
word_timestamps, prepend_punctuations, append_punctuations, highlight_words)
|
| 114 |
|
| 115 |
# Entry function for the full tab with progress
|
| 116 |
def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
progress=gr.Progress()):
|
| 125 |
|
| 126 |
# Handle temperature_increment_on_fallback
|
| 127 |
if temperature_increment_on_fallback is not None:
|
|
@@ -469,24 +474,34 @@ def create_ui(app_config: ApplicationConfig):
|
|
| 469 |
|
| 470 |
whisper_models = app_config.get_model_names()
|
| 471 |
|
| 472 |
-
|
| 473 |
gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
|
| 474 |
gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
|
| 475 |
gr.Text(label="URL (YouTube, etc.)"),
|
| 476 |
gr.File(label="Upload Files", file_count="multiple"),
|
| 477 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
| 478 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
|
|
|
|
|
|
|
|
|
|
| 479 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
|
| 480 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
|
| 481 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
|
| 482 |
-
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
| 484 |
]
|
| 485 |
|
| 486 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
| 487 |
|
| 488 |
simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
|
| 489 |
-
description=ui_description, article=ui_article, inputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
gr.File(label="Download"),
|
| 491 |
gr.Text(label="Transcription"),
|
| 492 |
gr.Text(label="Segments")
|
|
@@ -496,8 +511,17 @@ def create_ui(app_config: ApplicationConfig):
|
|
| 496 |
|
| 497 |
full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
|
| 498 |
description=full_description, article=ui_article, inputs=[
|
| 499 |
-
*
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
gr.TextArea(label="Initial Prompt"),
|
| 502 |
gr.Number(label="Temperature", value=app_config.temperature),
|
| 503 |
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
|
|
@@ -511,13 +535,6 @@ def create_ui(app_config: ApplicationConfig):
|
|
| 511 |
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
|
| 512 |
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
|
| 513 |
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
|
| 514 |
-
|
| 515 |
-
# Word timestamps
|
| 516 |
-
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
|
| 517 |
-
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
|
| 518 |
-
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
|
| 519 |
-
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
|
| 520 |
-
|
| 521 |
], outputs=[
|
| 522 |
gr.File(label="Download"),
|
| 523 |
gr.Text(label="Transcription"),
|
|
|
|
| 84 |
print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
|
| 85 |
|
| 86 |
# Entry function for the simple tab
|
| 87 |
+
def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                            vad, vadMergeWindow, vadMaxMergeSize,
                            word_timestamps: bool = False, highlight_words: bool = False):
    """Entry function for the simple tab when no progress tracker is supplied.

    Forwards every argument, in the same order, to
    ``transcribe_webui_simple_progress`` and returns its result unchanged.
    No ``progress`` argument is passed, so the callee uses its own default.
    """
    return self.transcribe_webui_simple_progress(
        modelName, languageName, urlData, multipleFiles, microphoneData, task,
        vad, vadMergeWindow, vadMaxMergeSize,
        word_timestamps, highlight_words)
|
| 93 |
|
| 94 |
# Entry function for the simple tab progress
|
| 95 |
+
def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                     vad, vadMergeWindow, vadMaxMergeSize,
                                     word_timestamps: bool = False, highlight_words: bool = False,
                                     progress=gr.Progress()):
    """Entry function for the simple tab with progress reporting.

    Builds a ``VadOptions`` from the three VAD values passed in plus the
    padding, prompt window and initial-prompt mode read from
    ``self.app_config``, then delegates to ``self.transcribe_webui`` and
    returns its result.
    """
    # Only three VAD settings arrive as arguments here; the remaining three
    # are taken from the application config.
    vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize,
                            self.app_config.vad_padding,
                            self.app_config.vad_prompt_window,
                            self.app_config.vad_initial_prompt_mode)

    return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
                                 word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
|
| 104 |
|
| 105 |
# Entry function for the full tab
|
| 106 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                          vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                          # Word timestamps
                          word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
                          initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                          condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                          compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
    """Entry function for the full tab when no progress tracker is supplied.

    Forwards every argument positionally, in declaration order, to
    ``transcribe_webui_full_progress`` and returns its result unchanged.
    The order must stay in step with that method's signature, because
    nothing is passed by keyword.
    """
    return self.transcribe_webui_full_progress(
        modelName, languageName, urlData, multipleFiles, microphoneData, task,
        vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
        word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
        initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
        condition_on_previous_text, fp16, temperature_increment_on_fallback,
        compression_ratio_threshold, logprob_threshold, no_speech_threshold)
|
|
|
|
| 120 |
|
| 121 |
# Entry function for the full tab with progress
|
| 122 |
def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
| 123 |
+
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
| 124 |
+
# Word timestamps
|
| 125 |
+
word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
|
| 126 |
+
initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
|
| 127 |
+
condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
|
| 128 |
+
compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
|
| 129 |
+
progress=gr.Progress()):
|
|
|
|
| 130 |
|
| 131 |
# Handle temperature_increment_on_fallback
|
| 132 |
if temperature_increment_on_fallback is not None:
|
|
|
|
| 474 |
|
| 475 |
whisper_models = app_config.get_model_names()
|
| 476 |
|
| 477 |
+
def common_inputs():
    """Return a newly built list of the model/source/task input widgets.

    A new list of components is created on every call.
    NOTE(review): presumably so each gr.Interface gets its own component
    instances -- confirm against Gradio's component-reuse rules.
    """
    return [
        gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
        gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
        gr.Text(label="URL (YouTube, etc.)"),
        gr.File(label="Upload Files", file_count="multiple"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
    ]
|
| 485 |
+
|
| 486 |
+
def common_vad_inputs():
    """Return a newly built list of the three basic VAD input widgets.

    The list holds the VAD mode dropdown, the merge window and the max merge
    size, each defaulting to the matching ``app_config`` value.
    """
    return [
        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
        gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
    ]
|
| 491 |
+
|
| 492 |
+
def common_word_timestamps_inputs():
    """Return a newly built list of the two word-timestamp checkboxes.

    The order (word timestamps, then highlight words) is the order in which
    the values reach the transcribe entry functions as positional arguments.
    """
    return [
        gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
        gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
    ]
|
| 496 |
|
| 497 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
| 498 |
|
| 499 |
simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
|
| 500 |
+
description=ui_description, article=ui_article, inputs=[
|
| 501 |
+
*common_inputs(),
|
| 502 |
+
*common_vad_inputs(),
|
| 503 |
+
*common_word_timestamps_inputs(),
|
| 504 |
+
], outputs=[
|
| 505 |
gr.File(label="Download"),
|
| 506 |
gr.Text(label="Transcription"),
|
| 507 |
gr.Text(label="Segments")
|
|
|
|
| 511 |
|
| 512 |
full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
|
| 513 |
description=full_description, article=ui_article, inputs=[
|
| 514 |
+
*common_inputs(),
|
| 515 |
+
|
| 516 |
+
*common_vad_inputs(),
|
| 517 |
+
gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
|
| 518 |
+
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
|
| 519 |
gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
|
| 520 |
+
|
| 521 |
+
*common_word_timestamps_inputs(),
|
| 522 |
+
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
|
| 523 |
+
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
|
| 524 |
+
|
| 525 |
gr.TextArea(label="Initial Prompt"),
|
| 526 |
gr.Number(label="Temperature", value=app_config.temperature),
|
| 527 |
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
|
|
|
|
| 535 |
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
|
| 536 |
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
|
| 537 |
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
], outputs=[
|
| 539 |
gr.File(label="Download"),
|
| 540 |
gr.Text(label="Transcription"),
|