Add configuration file and support for custom models

Custom models can be added to the configuration file, under the
"models" section. See the comments in config.json5 for more details.
- .gitignore +1 -0
- app.py +37 -17
- cli.py +72 -38
- config.json5 +62 -0
- requirements.txt +3 -1
- src/config.py +134 -0
- src/conversion/hf_converter.py +67 -0
- src/whisperContainer.py +29 -3
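
Taken together, the changes make both entry points read their defaults from one file. A minimal sketch of the new flow (assuming the modules from this diff are importable):

    import os

    from src.config import ApplicationConfig

    # Both app.py and cli.py load the config like this: config.json5 by default,
    # overridable through the WHISPER_WEBUI_CONFIG environment variable.
    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))

    # Names from the "models" section become the model choices in the UI and CLI.
    print(app_config.get_model_names())  # ['tiny', 'base', 'small', 'medium', 'large', 'large-v2']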
    	
.gitignore
CHANGED

@@ -1,5 +1,6 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
+.vscode/
 flagged/
 *.py[cod]
 *$py.class
    	
app.py
CHANGED

@@ -11,6 +11,7 @@ import zipfile
 import numpy as np
 
 import torch
+from src.config import ApplicationConfig
 from src.modelCache import ModelCache
 from src.source import get_audio_source_collection
 from src.vadParallel import ParallelContext, ParallelTranscription
@@ -62,7 +63,8 @@ WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v1", "large
 
 class WhisperTranscriber:
     def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, 
-                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES, output_dir: str = None):
+                 vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES, output_dir: str = None, 
+                 app_config: ApplicationConfig = None):
         self.model_cache = ModelCache()
         self.parallel_device_list = None
         self.gpu_parallel_context = None
@@ -75,6 +77,8 @@ class WhisperTranscriber:
         self.deleteUploadedFiles = delete_uploaded_files
         self.output_dir = output_dir
 
+        self.app_config = app_config
+
     def set_parallel_devices(self, vad_parallel_devices: str):
         self.parallel_device_list = [ device.strip() for device in vad_parallel_devices.split(",") ] if vad_parallel_devices else None
 
@@ -115,7 +119,7 @@ class WhisperTranscriber:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
 
-                model = WhisperContainer(model_name=selectedModel, cache=self.model_cache)
+                model = WhisperContainer(model_name=selectedModel, cache=self.model_cache, models=self.app_config.models)
 
                 # Result
                 download = []
@@ -360,8 +364,8 @@ class WhisperTranscriber:
 def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860, 
               default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, 
               vad_process_timeout: float = None, vad_cpu_cores: int = 1, auto_parallel: bool = False, 
-              output_dir: str = None):
-    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores, DELETE_UPLOADED_FILES, output_dir)
+              output_dir: str = None, app_config: ApplicationConfig = None):
+    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores, DELETE_UPLOADED_FILES, output_dir, app_config)
 
     # Specify a list of devices to use for parallel processing
     ui.set_parallel_devices(vad_parallel_devices)
@@ -378,8 +382,10 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
 
     ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)"
 
+    whisper_models = app_config.get_model_names()
+
     simple_inputs = lambda : [
-        gr.Dropdown(choices=WHISPER_MODELS, value=default_model_name, label="Model"),
+        gr.Dropdown(choices=whisper_models, value=default_model_name, label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
         gr.File(label="Upload Files", file_count="multiple"),
@@ -429,18 +435,32 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
     ui.close()
 
 if __name__ == '__main__':
+    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    whisper_models = app_config.get_model_names()
+
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--input_audio_max_duration", type=int, default=600, help="Maximum audio file length in seconds, or -1 for no limit.")
-    parser.add_argument("--share", type=bool, default=False, help="True to share the app on HuggingFace.")
-    parser.add_argument("--server_name", type=str, default=None, help="The host or IP to bind to. If None, bind to localhost.")
-    parser.add_argument("--server_port", type=int, default=7860, help="The port to bind to.")
-    parser.add_argument("--default_model_name", type=str, choices=WHISPER_MODELS, default="medium", help="The default model name.")
-    parser.add_argument("--default_vad", type=str, default="silero-vad", help="The default VAD.")
-    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
-    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
-    parser.add_argument("--vad_process_timeout", type=float, default=1800, help="The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout.")
-    parser.add_argument("--auto_parallel", type=bool, default=False, help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.")
-    parser.add_argument("--output_dir", "-o", type=str, default=None, help="directory to save the outputs")
+    parser.add_argument("--input_audio_max_duration", type=int, default=app_config.input_audio_max_duration, \
+                        help="Maximum audio file length in seconds, or -1 for no limit.") # 600
+    parser.add_argument("--share", type=bool, default=app_config.share, \
+                        help="True to share the app on HuggingFace.") # False
+    parser.add_argument("--server_name", type=str, default=app_config.server_name, \
+                        help="The host or IP to bind to. If None, bind to localhost.") # None
+    parser.add_argument("--server_port", type=int, default=app_config.server_port, \
+                        help="The port to bind to.") # 7860
+    parser.add_argument("--default_model_name", type=str, choices=whisper_models, default=app_config.default_model_name, \
+                        help="The default model name.") # medium
+    parser.add_argument("--default_vad", type=str, default=app_config.default_vad, \
+                        help="The default VAD.") # silero-vad
+    parser.add_argument("--vad_parallel_devices", type=str, default=app_config.vad_parallel_devices, \
+                        help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.") # ""
+    parser.add_argument("--vad_cpu_cores", type=int, default=app_config.vad_cpu_cores, \
+                        help="The number of CPU cores to use for VAD pre-processing.") # 1
+    parser.add_argument("--vad_process_timeout", type=float, default=app_config.vad_process_timeout, \
+                        help="The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout.") # 1800
+    parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
+                        help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
+    parser.add_argument("--output_dir", "-o", type=str, default=app_config.output_dir, \
+                        help="directory to save the outputs") # None
 
     args = parser.parse_args().__dict__
-    create_ui(**args)
+    create_ui(app_config=app_config, **args)
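
Note the pattern above: config.json5 supplies every argparse default, so an explicit command-line flag still wins. A small illustration (the port values are hypothetical, not part of the commit):

    import argparse

    from src.config import ApplicationConfig

    app_config = ApplicationConfig.parse_file("config.json5")

    parser = argparse.ArgumentParser()
    parser.add_argument("--server_port", type=int, default=app_config.server_port)

    # Without the flag, the config value (7860) is used; the flag overrides it.
    args = parser.parse_args(["--server_port", "8000"])
    print(args.server_port)  # 8000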
    	
cli.py
CHANGED

@@ -6,48 +6,81 @@ import warnings
 import numpy as np
 
 import torch
-from app import LANGUAGES, WHISPER_MODELS, WhisperTranscriber
+from app import LANGUAGES, WhisperTranscriber
+from src.config import ApplicationConfig
 from src.download import download_url
 
 from src.utils import optional_float, optional_int, str2bool
 from src.whisperContainer import WhisperContainer
 
 def cli():
+    app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+    whisper_models = app_config.get_model_names()
+
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
-    parser.add_argument("--model", default="medium", choices=WHISPER_MODELS, help="name of the Whisper model to use")
-    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
-    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
-    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
-    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
-
-    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
-    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
-
-    parser.add_argument("--vad", type=str, default="silero-vad", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
-    parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
-    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
-    parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
-    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
-    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
-    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
-    parser.add_argument("--auto_parallel", type=bool, default=False, help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.")
-
-    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
-    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
-    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
-    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
-    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
-
-    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
-    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
-    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
-    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
-
-    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
-    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
-    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
-    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
+    parser.add_argument("audio", nargs="+", type=str, \
+                        help="audio file(s) to transcribe")
+    parser.add_argument("--model", default=app_config.default_model_name, choices=whisper_models, \
+                        help="name of the Whisper model to use") # medium
+    parser.add_argument("--model_dir", type=str, default=None, \
+                        help="the path to save model files; uses ~/.cache/whisper by default")
+    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", \
+                        help="device to use for PyTorch inference")
+    parser.add_argument("--output_dir", "-o", type=str, default=".", \
+                        help="directory to save the outputs")
+    parser.add_argument("--verbose", type=str2bool, default=True, \
+                        help="whether to print out the progress and debug messages")
+
+    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], \
+                        help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
+    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), \
+                        help="language spoken in the audio, specify None to perform language detection")
+
+    parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
+                        help="The voice activity detection algorithm to use") # silero-vad
+    parser.add_argument("--vad_merge_window", type=optional_float, default=5, \
+                        help="The window size (in seconds) to merge voice segments")
+    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30,\
+                         help="The maximum size (in seconds) of a voice segment")
+    parser.add_argument("--vad_padding", type=optional_float, default=1, \
+                        help="The padding (in seconds) to add to each voice segment")
+    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, \
+                        help="The window size of the prompt to pass to Whisper")
+    parser.add_argument("--vad_cpu_cores", type=int, default=app_config.vad_cpu_cores, \
+                        help="The number of CPU cores to use for VAD pre-processing.") # 1
+    parser.add_argument("--vad_parallel_devices", type=str, default=app_config.vad_parallel_devices, \
+                        help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.") # ""
+    parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
+                        help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
+
+    parser.add_argument("--temperature", type=float, default=0, \
+                        help="temperature to use for sampling")
+    parser.add_argument("--best_of", type=optional_int, default=5, \
+                        help="number of candidates when sampling with non-zero temperature")
+    parser.add_argument("--beam_size", type=optional_int, default=5, \
+                        help="number of beams in beam search, only applicable when temperature is zero")
+    parser.add_argument("--patience", type=float, default=None, \
+                        help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
+    parser.add_argument("--length_penalty", type=float, default=None, \
+                        help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
+
+    parser.add_argument("--suppress_tokens", type=str, default="-1", \
+                        help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
+    parser.add_argument("--initial_prompt", type=str, default=None, \
+                        help="optional text to provide as a prompt for the first window.")
+    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, \
+                        help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
+    parser.add_argument("--fp16", type=str2bool, default=True, \
+                        help="whether to perform inference in fp16; True by default")
+
+    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, \
+                        help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
+    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, \
+                        help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
+    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, \
+                        help="if the average log probability is lower than this value, treat the decoding as failed")
+    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, \
+                        help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
 
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
@@ -74,12 +107,13 @@ def cli():
     vad_prompt_window = args.pop("vad_prompt_window")
     vad_cpu_cores = args.pop("vad_cpu_cores")
     auto_parallel = args.pop("auto_parallel")
-
-    model = WhisperContainer(model_name, device=device, download_root=model_dir)
-    transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores)
+
+    transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
     transcriber.set_auto_parallel(auto_parallel)
 
+    model = WhisperContainer(model_name, device=device, download_root=model_dir, models=app_config.models)
+
     if (transcriber._has_parallel_devices()):
         print("Using parallel devices:", transcriber.parallel_device_list)
 
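
The CLI now shares the web UI's configuration, including the model list used for the --model choices. A hedged usage sketch (the file names are examples, not from the commit):

    import os
    import sys

    # Point the CLI at an alternative config file (the default is config.json5).
    os.environ["WHISPER_WEBUI_CONFIG"] = "my-config.json5"

    # cli() parses sys.argv, so arguments can be staged before the call; this mirrors:
    #   WHISPER_WEBUI_CONFIG=my-config.json5 python cli.py audio.wav --model tiny
    sys.argv = ["cli.py", "audio.wav", "--model", "tiny"]

    from cli import cli
    cli()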
    	
config.json5
ADDED

@@ -0,0 +1,62 @@
+{
+    "models": [
+        // Configuration for the built-in models. You can remove any of these 
+        // if you don't want to use the default models.
+        {
+            "name": "tiny",
+            "url": "tiny" 
+        },
+        {
+            "name": "base",
+            "url": "base"
+        },
+        {
+            "name": "small",
+            "url": "small"
+        },
+        {
+            "name": "medium",
+            "url": "medium"
+        },
+        {
+            "name": "large",
+            "url": "large"
+        },
+        {
+            "name": "large-v2",
+            "url": "large-v2"
+        },
+        // Uncomment to add custom Japanese models
+        //{
+        //    "name": "whisper-large-v2-mix-jp",
+        //    "url": "vumichien/whisper-large-v2-mix-jp",
+        //    // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default.
+        //    // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models.
+        //    "type": "huggingface",
+        //}
+    ],
+    // Configuration options that will be used if they are not specified in the command line arguments.
+
+    // Maximum audio file length in seconds, or -1 for no limit.
+    "input_audio_max_duration": 600,
+    // True to share the app on HuggingFace.
+    "share": false,
+    // The host or IP to bind to. If None, bind to localhost.
+    "server_name": null,
+    // The port to bind to.
+    "server_port": 7860,
+    // The default model name.
+    "default_model_name": "medium",
+    // The default VAD.
+    "default_vad": "silero-vad",
+    // A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.
+    "vad_parallel_devices": "",
+    // The number of CPU cores to use for VAD pre-processing.
+    "vad_cpu_cores": 1,
+    // The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout.
+    "vad_process_timeout": 1800,
+    // True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.
+    "auto_parallel": false,
+    // Directory to save the outputs
+    "output_dir": null
+}
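
The commented-out block above is the template for custom models. The same entry, expressed with the new ModelConfig class from src/config.py (a sketch using the values from the comment):

    from src.config import ModelConfig

    # Equivalent of uncommenting the custom-model block in config.json5.
    custom_model = ModelConfig(
        name="whisper-large-v2-mix-jp",
        url="vumichien/whisper-large-v2-mix-jp",  # a HuggingFace repo id
        type="huggingface",                       # converted to Whisper format on first use
    )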
    	
requirements.txt
CHANGED

@@ -1,7 +1,9 @@
+git+https://github.com/huggingface/transformers
 git+https://github.com/openai/whisper.git
 transformers
 ffmpeg-python==0.2.0
 gradio==3.13.0
 yt-dlp
 torchaudio
-altair
\ No newline at end of file
+altair
+json5
    	
src/config.py
ADDED

@@ -0,0 +1,134 @@
+import urllib
+
+import os
+from typing import List
+from urllib.parse import urlparse
+
+from tqdm import tqdm
+
+from src.conversion.hf_converter import convert_hf_whisper
+
+class ModelConfig:
+    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper"):
+        """
+        Initialize a model configuration.
+
+        name: Name of the model
+        url: URL to download the model from
+        path: Path to the model file. If not set, the model will be downloaded from the URL.
+        type: Type of model. Can be whisper or huggingface.
+        """
+        self.name = name
+        self.url = url
+        self.path = path
+        self.type = type
+
+    def download_url(self, root_dir: str):
+        import whisper
+
+        # See if path is already set
+        if self.path is not None:
+            return self.path
+        
+        if root_dir is None:
+            root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")
+
+        model_type = self.type.lower() if self.type is not None else "whisper"
+
+        if model_type in ["huggingface", "hf"]:
+            self.path = self.url
+            destination_target = os.path.join(root_dir, self.name + ".pt")
+
+            # Convert from HuggingFace format to Whisper format
+            if os.path.exists(destination_target):
+                print(f"File {destination_target} already exists, skipping conversion")
+            else:
+                print("Saving HuggingFace model in Whisper format to " + destination_target)
+                convert_hf_whisper(self.url, destination_target)
+
+            self.path = destination_target
+
+        elif model_type in ["whisper", "w"]:
+            self.path = self.url
+
+            # See if URL is just a file
+            if self.url in whisper._MODELS:
+                # No need to download anything - Whisper will handle it
+                self.path = self.url
+            elif self.url.startswith("file://"):
+                # Get file path
+                self.path = urlparse(self.url).path
+            # See if it is an URL
+            elif self.url.startswith("http://") or self.url.startswith("https://"):
+                # Extension (or file name)
+                extension = os.path.splitext(self.url)[-1]
+                download_target = os.path.join(root_dir, self.name + extension)
+
+                if os.path.exists(download_target) and not os.path.isfile(download_target):
+                    raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+                if not os.path.isfile(download_target):
+                    self._download_file(self.url, download_target)
+                else:
+                    print(f"File {download_target} already exists, skipping download")
+
+                self.path = download_target
+            # Must be a local file
+            else:
+                self.path = self.url
+
+        else:
+            raise ValueError(f"Unknown model type {model_type}")
+
+        return self.path
+
+    def _download_file(self, url: str, destination: str):
+        with urllib.request.urlopen(url) as source, open(destination, "wb") as output:
+            with tqdm(
+                total=int(source.info().get("Content-Length")),
+                ncols=80,
+                unit="iB",
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as loop:
+                while True:
+                    buffer = source.read(8192)
+                    if not buffer:
+                        break
+
+                    output.write(buffer)
+                    loop.update(len(buffer))
+
+class ApplicationConfig:
+    def __init__(self, models: List[ModelConfig] = [], input_audio_max_duration: int = 600, 
+                 share: bool = False, server_name: str = None, server_port: int = 7860, default_model_name: str = "medium", 
+                 default_vad: str = "silero-vad", vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800, 
+                 auto_parallel: bool = False, output_dir: str = None):
+        self.models = models
+        self.input_audio_max_duration = input_audio_max_duration
+        self.share = share
+        self.server_name = server_name
+        self.server_port = server_port
+        self.default_model_name = default_model_name
+        self.default_vad = default_vad
+        self.vad_parallel_devices = vad_parallel_devices
+        self.vad_cpu_cores = vad_cpu_cores
+        self.vad_process_timeout = vad_process_timeout
+        self.auto_parallel = auto_parallel
+        self.output_dir = output_dir
+
+    def get_model_names(self):
+        return [ x.name for x in self.models ]
+
+    @staticmethod
+    def parse_file(config_path: str):
+        import json5
+
+        with open(config_path, "r") as f:
+            # Load using json5
+            data = json5.load(f)
+            data_models = data.pop("models", [])
+
+            models = [ ModelConfig(**x) for x in data_models ]
+
+            return ApplicationConfig(models, **data)
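
How ModelConfig.download_url resolves the different "url" forms, following the branches above (a sketch; it assumes whisper is installed, and the file path is made up):

    from src.config import ModelConfig

    # A built-in name is returned unchanged - Whisper downloads it itself.
    print(ModelConfig("tiny", "tiny").download_url(root_dir=None))  # "tiny"

    # A file:// URL is reduced to its local path.
    print(ModelConfig("custom", "file:///models/custom.pt").download_url(root_dir=None))  # "/models/custom.pt"

    # An http(s):// URL is downloaded into root_dir (~/.cache/whisper by default),
    # and a "huggingface" type is converted to a <name>.pt file there instead.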
    	
src/conversion/hf_converter.py
ADDED

@@ -0,0 +1,67 @@
+# https://github.com/bayartsogt-ya/whisper-multiple-hf-datasets
+
+from copy import deepcopy
+import torch
+from transformers import WhisperForConditionalGeneration
+
+WHISPER_MAPPING = {
+    "layers": "blocks",
+    "fc1": "mlp.0",
+    "fc2": "mlp.2",
+    "final_layer_norm": "mlp_ln",
+    "layers": "blocks",
+    ".self_attn.q_proj": ".attn.query",
+    ".self_attn.k_proj": ".attn.key",
+    ".self_attn.v_proj": ".attn.value",
+    ".self_attn_layer_norm": ".attn_ln",
+    ".self_attn.out_proj": ".attn.out",
+    ".encoder_attn.q_proj": ".cross_attn.query",
+    ".encoder_attn.k_proj": ".cross_attn.key",
+    ".encoder_attn.v_proj": ".cross_attn.value",
+    ".encoder_attn_layer_norm": ".cross_attn_ln",
+    ".encoder_attn.out_proj": ".cross_attn.out",
+    "decoder.layer_norm.": "decoder.ln.",
+    "encoder.layer_norm.": "encoder.ln_post.",
+    "embed_tokens": "token_embedding",
+    "encoder.embed_positions.weight": "encoder.positional_embedding",
+    "decoder.embed_positions.weight": "decoder.positional_embedding",
+    "layer_norm": "ln_post",
+}
+
+
+def rename_keys(s_dict):
+    keys = list(s_dict.keys())
+    for key in keys:
+        new_key = key
+        for k, v in WHISPER_MAPPING.items():
+            if k in key:
+                new_key = new_key.replace(k, v)
+
+        print(f"{key} -> {new_key}")
+
+        s_dict[new_key] = s_dict.pop(key)
+    return s_dict
+
+
+def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
+    transformer_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)
+    config = transformer_model.config
+
+    # first build dims
+    dims = {
+        'n_mels': config.num_mel_bins,
+        'n_vocab': config.vocab_size,
+        'n_audio_ctx': config.max_source_positions,
+        'n_audio_state': config.d_model,
+        'n_audio_head': config.encoder_attention_heads,
+        'n_audio_layer': config.encoder_layers,
+        'n_text_ctx': config.max_target_positions,
+        'n_text_state': config.d_model,
+        'n_text_head': config.decoder_attention_heads,
+        'n_text_layer': config.decoder_layers
+    }
+
+    state_dict = deepcopy(transformer_model.model.state_dict())
+    state_dict = rename_keys(state_dict)
+
+    torch.save({"dims": dims, "model_state_dict": state_dict}, whisper_state_path)
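
The converter also works standalone; its output is an ordinary Whisper checkpoint. A sketch (the model id comes from the config.json5 example above, the output path is arbitrary):

    import whisper

    from src.conversion.hf_converter import convert_hf_whisper

    # Download the HuggingFace checkpoint and save it in Whisper's .pt format.
    convert_hf_whisper("vumichien/whisper-large-v2-mix-jp", "whisper-large-v2-mix-jp.pt")

    # The result loads like any local Whisper model file.
    model = whisper.load_model("whisper-large-v2-mix-jp.pt")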
    	
        src/whisperContainer.py
    CHANGED
    
@@ -1,11 +1,14 @@
 # External programs
 import os
+from typing import List
 import whisper
+from src.config import ModelConfig
 
 from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 
 class WhisperContainer:
-    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: ModelCache = None):
+    def __init__(self, model_name: str, device: str = None, download_root: str = None, 
+                 cache: ModelCache = None, models: List[ModelConfig] = []):
         self.model_name = model_name
         self.device = device
         self.download_root = download_root
@@ -13,6 +16,9 @@ class WhisperContainer:
 
         # Will be created on demand
         self.model = None
+
+        # List of known models
+        self.models = models
 
     def get_model(self):
         if self.model is None:
@@ -32,21 +38,40 @@ class WhisperContainer:
         # Warning: Using private API here
         try:
             root_dir = self.download_root
+            model_config = self.get_model_config()
 
             if root_dir is None:
                 root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")
 
             if self.model_name in whisper._MODELS:
                 whisper._download(whisper._MODELS[self.model_name], root_dir, False)
+            else:
+                # If the model is not in the official list, see if it needs to be downloaded
+                model_config.download_url(root_dir)
             return True
+
         except Exception as e:
             # Given that the API is private, it could change at any time. We don't want to crash the program
             print("Error pre-downloading model: " + str(e))
             return False
 
+    def get_model_config(self) -> ModelConfig:
+        """
+        Get the model configuration for the model.
+        """
+        for model in self.models:
+            if model.name == self.model_name:
+                return model
+        return None
+
     def _create_model(self):
         print("Loading whisper model " + self.model_name)
-        return whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
+
+        model_config = self.get_model_config()
+        # Note that the model will not be downloaded in the case of an official Whisper model
+        model_path = model_config.download_url(self.download_root)
+
+        return whisper.load_model(model_path, device=self.device, download_root=self.download_root)
 
     def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
         """
@@ -71,12 +96,13 @@ class WhisperContainer:
 
     # This is required for multiprocessing
     def __getstate__(self):
-        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root }
+        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root, "models": self.models }
 
     def __setstate__(self, state):
         self.model_name = state["model_name"]
         self.device = state["device"]
         self.download_root = state["download_root"]
+        self.models = state["models"]
         self.model = None
         # Depickled objects must use the global cache
         self.cache = GLOBAL_MODEL_CACHE
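Taken together, a custom model is simply another entry in the models list: get_model_config() resolves the configured name, and _create_model() loads whatever path download_url() returns, while official model names still go through whisper._MODELS. A minimal usage sketch, assuming ModelConfig can be built from a name and a download URL (its real signature lives in src/config.py, added by this commit):

    import pickle

    from src.config import ModelConfig
    from src.whisperContainer import WhisperContainer

    # Hypothetical fine-tuned checkpoint; the name and URL are examples only.
    custom_models = [
        ModelConfig(name="whisper-large-nb", url="https://example.com/whisper-large-nb.pt"),
    ]

    container = WhisperContainer(model_name="whisper-large-nb", models=custom_models)
    model = container.get_model()  # lazily calls _create_model() on first use

    # __getstate__/__setstate__ keep the container picklable for the parallel
    # workers: the models list travels with it, the loaded model does not.
    clone = pickle.loads(pickle.dumps(container))
    assert clone.model is None
    assert clone.models[0].name == "whisper-large-nb"

One caveat in the new signature: models: List[ModelConfig] = [] is a mutable default argument, shared by every container constructed without an explicit list; passing the list explicitly, as above, avoids that Python pitfall.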