Spaces:
Build error
Build error
Make it easier to use the old segmentation strategy
Browse files- app.py +24 -20
- cli.py +3 -3
- src/vad.py +58 -22
app.py
CHANGED
|
@@ -14,7 +14,7 @@ import gradio as gr
|
|
| 14 |
|
| 15 |
from src.download import ExceededMaximumDuration, download_url
|
| 16 |
from src.utils import slugify, write_srt, write_vtt
|
| 17 |
-
from src.vad import VadPeriodicTranscription, VadSileroTranscription
|
| 18 |
|
| 19 |
# Limitations (set to -1 to disable)
|
| 20 |
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
|
|
@@ -94,25 +94,17 @@ class WhisperTranscriber:
|
|
| 94 |
|
| 95 |
# The results
|
| 96 |
if (vad == 'silero-vad'):
|
| 97 |
-
#
|
| 98 |
-
|
| 99 |
-
self.vad_model = VadSileroTranscription()
|
| 100 |
-
|
| 101 |
-
process_gaps = VadSileroTranscription(transcribe_non_speech = True,
|
| 102 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
| 103 |
-
segment_padding_left=vadPadding, segment_padding_right=vadPadding,
|
| 104 |
-
max_prompt_window=vadPromptWindow, copy=self.vad_model)
|
| 105 |
result = process_gaps.transcribe(audio_path, whisperCallable)
|
| 106 |
elif (vad == 'silero-vad-skip-gaps'):
|
| 107 |
-
#
|
| 108 |
-
|
| 109 |
-
self.vad_model = VadSileroTranscription()
|
| 110 |
-
|
| 111 |
-
skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
|
| 112 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
| 113 |
-
segment_padding_left=vadPadding, segment_padding_right=vadPadding,
|
| 114 |
-
max_prompt_window=vadPromptWindow, copy=self.vad_model)
|
| 115 |
result = skip_gaps.transcribe(audio_path, whisperCallable)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
elif (vad == 'periodic-vad'):
|
| 117 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
| 118 |
# it may create a break in the middle of a sentence, causing some artifacts.
|
|
@@ -124,6 +116,18 @@ class WhisperTranscriber:
|
|
| 124 |
|
| 125 |
return result
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def write_result(self, result: dict, source_name: str, output_dir: str):
|
| 128 |
if not os.path.exists(output_dir):
|
| 129 |
os.makedirs(output_dir)
|
|
@@ -218,11 +222,11 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
|
|
| 218 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
| 219 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
| 220 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
| 221 |
-
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
|
| 222 |
-
gr.Number(label="VAD - Merge Window (s)", precision=0, value=
|
| 223 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
| 224 |
gr.Number(label="VAD - Padding (s)", precision=None, value=1),
|
| 225 |
-
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=
|
| 226 |
], outputs=[
|
| 227 |
gr.File(label="Download"),
|
| 228 |
gr.Text(label="Transcription"),
|
|
|
|
| 14 |
|
| 15 |
from src.download import ExceededMaximumDuration, download_url
|
| 16 |
from src.utils import slugify, write_srt, write_vtt
|
| 17 |
+
from src.vad import NonSpeechStrategy, VadPeriodicTranscription, VadSileroTranscription
|
| 18 |
|
| 19 |
# Limitations (set to -1 to disable)
|
| 20 |
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
|
|
|
|
| 94 |
|
| 95 |
# The results
|
| 96 |
if (vad == 'silero-vad'):
|
| 97 |
+
# Silero VAD where non-speech gaps are transcribed
|
| 98 |
+
process_gaps = self._create_silero_vad(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
result = process_gaps.transcribe(audio_path, whisperCallable)
|
| 100 |
elif (vad == 'silero-vad-skip-gaps'):
|
| 101 |
+
# Silero VAD where non-speech gaps are simply ignored
|
| 102 |
+
skip_gaps = self._create_silero_vad(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
result = skip_gaps.transcribe(audio_path, whisperCallable)
|
| 104 |
+
elif (vad == 'silero-vad-expand-into-gaps'):
|
| 105 |
+
# Use Silero VAD where speech-segments are expanded into non-speech gaps
|
| 106 |
+
expand_gaps = self._create_silero_vad(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
| 107 |
+
result = expand_gaps.transcribe(audio_path, whisperCallable)
|
| 108 |
elif (vad == 'periodic-vad'):
|
| 109 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
| 110 |
# it may create a break in the middle of a sentence, causing some artifacts.
|
|
|
|
| 116 |
|
| 117 |
return result
|
| 118 |
|
| 119 |
+
def _create_silero_vad(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
    """
    Build a Silero VAD transcription object configured for one request.

    A default VadSileroTranscription is created on first use and cached on
    ``self.vad_model``; every call then returns a new, request-specific
    instance that receives the cached one through ``copy``.

    Parameters
    ----------
    non_speech_strategy: NonSpeechStrategy
        Forwarded unchanged as ``non_speech_strategy``.
    vadMergeWindow: float
        Forwarded as ``max_silent_period``.
    vadMaxMergeSize: float
        Forwarded as ``max_merge_size``.
    vadPadding: float
        Used for both ``segment_padding_left`` and ``segment_padding_right``.
    vadPromptWindow: float
        Forwarded as ``max_prompt_window``.

    Returns
    -------
    The newly configured VadSileroTranscription instance.
    """
    shared_model = self.vad_model

    # Lazily create the cached instance the first time a Silero strategy is requested
    if shared_model is None:
        shared_model = VadSileroTranscription()
        self.vad_model = shared_model

    return VadSileroTranscription(
        non_speech_strategy=non_speech_strategy,
        max_silent_period=vadMergeWindow,
        max_merge_size=vadMaxMergeSize,
        segment_padding_left=vadPadding,
        segment_padding_right=vadPadding,
        max_prompt_window=vadPromptWindow,
        copy=shared_model,
    )
|
| 130 |
+
|
| 131 |
def write_result(self, result: dict, source_name: str, output_dir: str):
|
| 132 |
if not os.path.exists(output_dir):
|
| 133 |
os.makedirs(output_dir)
|
|
|
|
| 222 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
| 223 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
| 224 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
| 225 |
+
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
|
| 226 |
+
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
| 227 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
| 228 |
gr.Number(label="VAD - Padding (s)", precision=None, value=1),
|
| 229 |
+
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
|
| 230 |
], outputs=[
|
| 231 |
gr.File(label="Download"),
|
| 232 |
gr.Text(label="Transcription"),
|
cli.py
CHANGED
|
@@ -26,11 +26,11 @@ def cli():
|
|
| 26 |
parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
| 27 |
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
|
| 28 |
|
| 29 |
-
parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
|
| 30 |
parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
|
| 31 |
-
parser.add_argument("--vad_max_merge_size", type=optional_float, default=
|
| 32 |
parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
|
| 33 |
-
parser.add_argument("--vad_prompt_window", type=optional_float, default=
|
| 34 |
|
| 35 |
parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
|
| 36 |
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
|
|
|
|
| 26 |
parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
| 27 |
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
|
| 28 |
|
| 29 |
+
parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
|
| 30 |
parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
|
| 31 |
+
parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
|
| 32 |
parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
|
| 33 |
+
parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
|
| 34 |
|
| 35 |
parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
|
| 36 |
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
|
src/vad.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from abc import ABC, abstractmethod
|
| 2 |
from collections import Counter, deque
|
| 3 |
-
|
|
|
|
| 4 |
|
| 5 |
from pprint import pprint
|
| 6 |
|
|
@@ -19,6 +20,20 @@ import numpy as np
|
|
| 19 |
from src.utils import format_timestamp
|
| 20 |
from enum import Enum
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Defaults for Silero
|
| 23 |
SPEECH_TRESHOLD = 0.3
|
| 24 |
MAX_SILENT_PERIOD = 10 # seconds
|
|
@@ -28,9 +43,6 @@ MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
|
|
| 28 |
SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
|
| 29 |
SEGMENT_PADDING_RIGHT = 1 # End detected segments late
|
| 30 |
|
| 31 |
-
# Whether to attempt to transcribe non-speech
|
| 32 |
-
TRANSCRIBE_NON_SPEECH = False
|
| 33 |
-
|
| 34 |
# Minimum size of segments to process
|
| 35 |
MIN_SEGMENT_DURATION = 1
|
| 36 |
|
|
@@ -46,13 +58,13 @@ VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
|
|
| 46 |
|
| 47 |
class AbstractTranscription(ABC):
|
| 48 |
def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
|
| 49 |
-
max_merge_size: float = None,
|
| 50 |
self.sampling_rate = 16000
|
| 51 |
self.segment_padding_left = segment_padding_left
|
| 52 |
self.segment_padding_right = segment_padding_right
|
| 53 |
self.max_silent_period = max_silent_period
|
| 54 |
self.max_merge_size = max_merge_size
|
| 55 |
-
self.
|
| 56 |
self.max_prompt_window = max_prompt_window
|
| 57 |
|
| 58 |
self.min_force_merge_gap = MIN_FORCE_MERGE_GAP
|
|
@@ -107,16 +119,18 @@ class AbstractTranscription(ABC):
|
|
| 107 |
print("Timestamps:")
|
| 108 |
pprint(merged)
|
| 109 |
|
| 110 |
-
if self.
|
| 111 |
max_audio_duration = get_audio_duration(audio)
|
| 112 |
|
| 113 |
# Expand segments to include the gaps between them
|
| 114 |
-
if (self.
|
| 115 |
# When we have a prompt window, we create speech segments between each segment if we exceed the merge size
|
| 116 |
merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=self.max_merge_size)
|
| 117 |
-
|
| 118 |
-
# With no prompt window, it is better to expand the segments
|
| 119 |
merged = self.expand_gaps(merged, total_duration=max_audio_duration)
|
|
|
|
|
|
|
| 120 |
|
| 121 |
print("Transcribing non-speech:")
|
| 122 |
pprint(merged)
|
|
@@ -150,6 +164,17 @@ class AbstractTranscription(ABC):
|
|
| 150 |
|
| 151 |
adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
# Append to output
|
| 154 |
result['text'] += segment_result['text']
|
| 155 |
result['segments'].extend(adjusted_segments)
|
|
@@ -158,20 +183,30 @@ class AbstractTranscription(ABC):
|
|
| 158 |
languageCounter[segment_result['language']] += 1
|
| 159 |
|
| 160 |
# Update prompt window
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
for segment in adjusted_segments:
|
| 164 |
-
if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
|
| 165 |
-
prompt_window.append(segment)
|
| 166 |
-
|
| 167 |
-
while (len(prompt_window) > 0 and prompt_window[0]['end'] < segment_end - self.max_prompt_window):
|
| 168 |
-
prompt_window.popleft()
|
| 169 |
-
|
| 170 |
if len(languageCounter) > 0:
|
| 171 |
result['language'] = languageCounter.most_common(1)[0][0]
|
| 172 |
|
| 173 |
return result
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
|
| 176 |
result = []
|
| 177 |
last_end_time = 0
|
|
@@ -360,7 +395,8 @@ class AbstractTranscription(ABC):
|
|
| 360 |
if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
|
| 361 |
# Regular merge
|
| 362 |
current_entry['end'] = entry['end']
|
| 363 |
-
elif min_force_merge_gap is not None and distance <= min_force_merge_gap and
|
|
|
|
| 364 |
# Force merge if the distance is small (up to a certain maximum size)
|
| 365 |
current_entry['end'] = entry['end']
|
| 366 |
else:
|
|
@@ -389,10 +425,10 @@ class AbstractTranscription(ABC):
|
|
| 389 |
|
| 390 |
class VadSileroTranscription(AbstractTranscription):
|
| 391 |
def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
|
| 392 |
-
max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE,
|
| 393 |
max_prompt_window=MAX_PROMPT_WINDOW, copy = None):
|
| 394 |
super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
|
| 395 |
-
max_silent_period=max_silent_period, max_merge_size=max_merge_size,
|
| 396 |
|
| 397 |
if copy:
|
| 398 |
self.model = copy.model
|
|
|
|
| 1 |
from abc import ABC, abstractmethod
|
| 2 |
from collections import Counter, deque
|
| 3 |
+
|
| 4 |
+
from typing import Any, Deque, Iterator, List, Dict
|
| 5 |
|
| 6 |
from pprint import pprint
|
| 7 |
|
|
|
|
| 20 |
from src.utils import format_timestamp
|
| 21 |
from enum import Enum
|
| 22 |
|
| 23 |
+
class NonSpeechStrategy(Enum):
    """
    Strategy for handling the non-speech gaps between detected speech segments.

    SKIP leaves the gaps out, CREATE_SEGMENT turns them into segments of
    their own, and EXPAND_SEGMENT grows the speech segments into them.
    """

    # Ignore non-speech segments.
    SKIP = 1

    # Just treat non-speech segments as speech.
    CREATE_SEGMENT = 2

    # Expand speech segments into subsequent non-speech segments.
    EXPAND_SEGMENT = 3
|
| 36 |
+
|
| 37 |
# Defaults for Silero
|
| 38 |
SPEECH_TRESHOLD = 0.3
|
| 39 |
MAX_SILENT_PERIOD = 10 # seconds
|
|
|
|
| 43 |
SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
|
| 44 |
SEGMENT_PADDING_RIGHT = 1 # End detected segments late
|
| 45 |
|
|
|
|
|
|
|
|
|
|
| 46 |
# Minimum size of segments to process
|
| 47 |
MIN_SEGMENT_DURATION = 1
|
| 48 |
|
|
|
|
| 58 |
|
| 59 |
class AbstractTranscription(ABC):
|
| 60 |
def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
|
| 61 |
+
max_merge_size: float = None, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP, max_prompt_window: float = None):
|
| 62 |
self.sampling_rate = 16000
|
| 63 |
self.segment_padding_left = segment_padding_left
|
| 64 |
self.segment_padding_right = segment_padding_right
|
| 65 |
self.max_silent_period = max_silent_period
|
| 66 |
self.max_merge_size = max_merge_size
|
| 67 |
+
self.non_speech_strategy = non_speech_strategy
|
| 68 |
self.max_prompt_window = max_prompt_window
|
| 69 |
|
| 70 |
self.min_force_merge_gap = MIN_FORCE_MERGE_GAP
|
|
|
|
| 119 |
print("Timestamps:")
|
| 120 |
pprint(merged)
|
| 121 |
|
| 122 |
+
if self.non_speech_strategy != NonSpeechStrategy.SKIP:
|
| 123 |
max_audio_duration = get_audio_duration(audio)
|
| 124 |
|
| 125 |
# Expand segments to include the gaps between them
|
| 126 |
+
if (self.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
|
| 127 |
# When we have a prompt window, we create speech segments between each segment if we exceed the merge size
|
| 128 |
merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=self.max_merge_size)
|
| 129 |
+
elif self.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
|
| 130 |
+
# With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
|
| 131 |
merged = self.expand_gaps(merged, total_duration=max_audio_duration)
|
| 132 |
+
else:
|
| 133 |
+
raise Exception("Unknown non-speech strategy: " + str(self.non_speech_strategy))
|
| 134 |
|
| 135 |
print("Transcribing non-speech:")
|
| 136 |
pprint(merged)
|
|
|
|
| 164 |
|
| 165 |
adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
|
| 166 |
|
| 167 |
+
# Propagate expand amount to the segments
|
| 168 |
+
if (segment_expand_amount > 0):
|
| 169 |
+
segment_without_expansion = segment_duration - segment_expand_amount
|
| 170 |
+
|
| 171 |
+
for adjusted_segment in adjusted_segments:
|
| 172 |
+
adjusted_segment_end = adjusted_segment['end']
|
| 173 |
+
|
| 174 |
+
# Add expand amount if the segment got expanded
|
| 175 |
+
if (adjusted_segment_end > segment_without_expansion):
|
| 176 |
+
adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
|
| 177 |
+
|
| 178 |
# Append to output
|
| 179 |
result['text'] += segment_result['text']
|
| 180 |
result['segments'].extend(adjusted_segments)
|
|
|
|
| 183 |
languageCounter[segment_result['language']] += 1
|
| 184 |
|
| 185 |
# Update prompt window
|
| 186 |
+
self.__update_prompt_window(prompt_window, adjusted_segments, segment_end)
|
| 187 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
if len(languageCounter) > 0:
|
| 189 |
result['language'] = languageCounter.most_common(1)[0][0]
|
| 190 |
|
| 191 |
return result
|
| 192 |
|
| 193 |
+
def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float):
    """
    Add newly transcribed segments to the prompt window and drop stale ones.

    Does nothing unless ``self.max_prompt_window`` is set to a positive value.

    Parameters
    ----------
    prompt_window: Deque
        The window to update in place, oldest segment first.
    adjusted_segments: List
        Segment dicts to append; a segment is only added when its
        'no_speech_prob' (default 0) does not exceed PROMPT_NO_SPEECH_PROB.
    segment_end: float
        End time of the segment that was just processed; together with
        ``self.max_prompt_window`` it defines the eviction cutoff.
    """
    window_size = self.max_prompt_window

    # The prompt window is disabled when no positive size is configured
    if window_size is None or not window_size > 0:
        return

    # Only keep segments that are likely enough to contain speech
    prompt_window.extend(
        candidate for candidate in adjusted_segments
        if candidate.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB
    )

    cutoff = segment_end - window_size

    # Evict from the left until the oldest entry is inside the window again
    while prompt_window:
        oldest = prompt_window[0]

        # Time expanded in the segments should be discounted from the prompt window
        effective_end = oldest.get('end', 0) - oldest.get('expand_amount', 0)

        if not effective_end < cutoff:
            break

        prompt_window.popleft()
|
| 209 |
+
|
| 210 |
def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
|
| 211 |
result = []
|
| 212 |
last_end_time = 0
|
|
|
|
| 395 |
if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
|
| 396 |
# Regular merge
|
| 397 |
current_entry['end'] = entry['end']
|
| 398 |
+
elif min_force_merge_gap is not None and distance <= min_force_merge_gap and \
|
| 399 |
+
(max_force_merge_size is None or current_entry_size <= max_force_merge_size):
|
| 400 |
# Force merge if the distance is small (up to a certain maximum size)
|
| 401 |
current_entry['end'] = entry['end']
|
| 402 |
else:
|
|
|
|
| 425 |
|
| 426 |
class VadSileroTranscription(AbstractTranscription):
|
| 427 |
def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
|
| 428 |
+
max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
|
| 429 |
max_prompt_window=MAX_PROMPT_WINDOW, copy = None):
|
| 430 |
super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
|
| 431 |
+
max_silent_period=max_silent_period, max_merge_size=max_merge_size, non_speech_strategy=non_speech_strategy, max_prompt_window=max_prompt_window)
|
| 432 |
|
| 433 |
if copy:
|
| 434 |
self.model = copy.model
|