danelkay committed on
Commit
bac20d5
·
1 Parent(s): cd0f407

🔥 chore: remove unused CLI and alternative requirements files

Browse files

The commit removes legacy CLI files and alternative requirements files, consolidating dependencies into pyproject.toml.

This change:
- Removes app-local.py, app-network.py and app-shared.py
- Removes cli.py, as its functionality is now handled differently
- Removes requirements-fasterWhisper.txt and requirements-whisper.txt
- Adds pyproject.toml with a proper dependency configuration

app-local.py DELETED
@@ -1,5 +0,0 @@
1
- # Run the app with no audio file restrictions
2
- from app import create_ui
3
- from src.config import ApplicationConfig
4
-
5
- create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1))
 
 
 
 
 
 
app-network.py DELETED
@@ -1,5 +0,0 @@
1
- # Run the app with no audio file restrictions, and make it available on the network
2
- from app import create_ui
3
- from src.config import ApplicationConfig
4
-
5
- create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, server_name="0.0.0.0"))
 
 
 
 
 
 
app-shared.py DELETED
@@ -1,5 +0,0 @@
1
- # Run the app with no audio file restrictions
2
- from app import create_ui
3
- from src.config import ApplicationConfig
4
-
5
- create_ui(ApplicationConfig.create_default(input_audio_max_duration=-1, share=True))
 
 
 
 
 
 
cli.py DELETED
@@ -1,206 +0,0 @@
1
- import argparse
2
- import os
3
- import pathlib
4
- from urllib.parse import urlparse
5
- import warnings
6
- import numpy as np
7
-
8
- import torch
9
- from app import VadOptions, WhisperTranscriber
10
- from src.config import VAD_INITIAL_PROMPT_MODE_VALUES, ApplicationConfig, VadInitialPromptMode
11
- from src.diarization.diarization import Diarization
12
- from src.download import download_url
13
- from src.languages import get_language_names
14
-
15
- from src.utils import optional_float, optional_int, str2bool
16
- from src.whisper.whisperFactory import create_whisper_container
17
-
18
- def cli():
19
- app_config = ApplicationConfig.create_default()
20
- whisper_models = app_config.get_model_names()
21
-
22
- # For the CLI, we fallback to saving the output to the current directory
23
- output_dir = app_config.output_dir if app_config.output_dir is not None else "."
24
-
25
- # Environment variable overrides
26
- default_whisper_implementation = os.environ.get("WHISPER_IMPLEMENTATION", app_config.whisper_implementation)
27
-
28
- parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
29
- parser.add_argument("audio", nargs="+", type=str, \
30
- help="audio file(s) to transcribe")
31
- parser.add_argument("--model", default=app_config.default_model_name, choices=whisper_models, \
32
- help="name of the Whisper model to use") # medium
33
- parser.add_argument("--model_dir", type=str, default=app_config.model_dir, \
34
- help="the path to save model files; uses ~/.cache/whisper by default")
35
- parser.add_argument("--device", default=app_config.device, \
36
- help="device to use for PyTorch inference")
37
- parser.add_argument("--output_dir", "-o", type=str, default=output_dir, \
38
- help="directory to save the outputs")
39
- parser.add_argument("--verbose", type=str2bool, default=app_config.verbose, \
40
- help="whether to print out the progress and debug messages")
41
- parser.add_argument("--whisper_implementation", type=str, default=default_whisper_implementation, choices=["whisper", "faster-whisper"],\
42
- help="the Whisper implementation to use")
43
-
44
- parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
45
- help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
46
- parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(get_language_names()), \
47
- help="language spoken in the audio, specify None to perform language detection")
48
-
49
- parser.add_argument("--vad", type=str, default=app_config.default_vad, choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], \
50
- help="The voice activity detection algorithm to use") # silero-vad
51
- parser.add_argument("--vad_initial_prompt_mode", type=str, default=app_config.vad_initial_prompt_mode, choices=VAD_INITIAL_PROMPT_MODE_VALUES, \
52
- help="Whether or not to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment)") # prepend_first_segment
53
- parser.add_argument("--vad_merge_window", type=optional_float, default=app_config.vad_merge_window, \
54
- help="The window size (in seconds) to merge voice segments")
55
- parser.add_argument("--vad_max_merge_size", type=optional_float, default=app_config.vad_max_merge_size,\
56
- help="The maximum size (in seconds) of a voice segment")
57
- parser.add_argument("--vad_padding", type=optional_float, default=app_config.vad_padding, \
58
- help="The padding (in seconds) to add to each voice segment")
59
- parser.add_argument("--vad_prompt_window", type=optional_float, default=app_config.vad_prompt_window, \
60
- help="The window size of the prompt to pass to Whisper")
61
- parser.add_argument("--vad_cpu_cores", type=int, default=app_config.vad_cpu_cores, \
62
- help="The number of CPU cores to use for VAD pre-processing.") # 1
63
- parser.add_argument("--vad_parallel_devices", type=str, default=app_config.vad_parallel_devices, \
64
- help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.") # ""
65
- parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
66
- help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
67
-
68
- parser.add_argument("--temperature", type=float, default=app_config.temperature, \
69
- help="temperature to use for sampling")
70
- parser.add_argument("--best_of", type=optional_int, default=app_config.best_of, \
71
- help="number of candidates when sampling with non-zero temperature")
72
- parser.add_argument("--beam_size", type=optional_int, default=app_config.beam_size, \
73
- help="number of beams in beam search, only applicable when temperature is zero")
74
- parser.add_argument("--patience", type=float, default=app_config.patience, \
75
- help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
76
- parser.add_argument("--length_penalty", type=float, default=app_config.length_penalty, \
77
- help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple lengt normalization by default")
78
-
79
- parser.add_argument("--suppress_tokens", type=str, default=app_config.suppress_tokens, \
80
- help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
81
- parser.add_argument("--initial_prompt", type=str, default=app_config.initial_prompt, \
82
- help="optional text to provide as a prompt for the first window.")
83
- parser.add_argument("--condition_on_previous_text", type=str2bool, default=app_config.condition_on_previous_text, \
84
- help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
85
- parser.add_argument("--fp16", type=str2bool, default=app_config.fp16, \
86
- help="whether to perform inference in fp16; True by default")
87
- parser.add_argument("--compute_type", type=str, default=app_config.compute_type, choices=["default", "auto", "int8", "int8_float16", "int16", "float16", "float32"], \
88
- help="the compute type to use for inference")
89
-
90
- parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=app_config.temperature_increment_on_fallback, \
91
- help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
92
- parser.add_argument("--compression_ratio_threshold", type=optional_float, default=app_config.compression_ratio_threshold, \
93
- help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
94
- parser.add_argument("--logprob_threshold", type=optional_float, default=app_config.logprob_threshold, \
95
- help="if the average log probability is lower than this value, treat the decoding as failed")
96
- parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
97
- help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
98
-
99
- parser.add_argument("--word_timestamps", type=str2bool, default=app_config.word_timestamps,
100
- help="(experimental) extract word-level timestamps and refine the results based on them")
101
- parser.add_argument("--prepend_punctuations", type=str, default=app_config.prepend_punctuations,
102
- help="if word_timestamps is True, merge these punctuation symbols with the next word")
103
- parser.add_argument("--append_punctuations", type=str, default=app_config.append_punctuations,
104
- help="if word_timestamps is True, merge these punctuation symbols with the previous word")
105
- parser.add_argument("--highlight_words", type=str2bool, default=app_config.highlight_words,
106
- help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
107
- parser.add_argument("--threads", type=optional_int, default=0,
108
- help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
109
-
110
- # Diarization
111
- parser.add_argument('--auth_token', type=str, default=app_config.auth_token, help='HuggingFace API Token (optional)')
112
- parser.add_argument("--diarization", type=str2bool, default=app_config.diarization, \
113
- help="whether to perform speaker diarization")
114
- parser.add_argument("--diarization_num_speakers", type=int, default=app_config.diarization_speakers, help="Number of speakers")
115
- parser.add_argument("--diarization_min_speakers", type=int, default=app_config.diarization_min_speakers, help="Minimum number of speakers")
116
- parser.add_argument("--diarization_max_speakers", type=int, default=app_config.diarization_max_speakers, help="Maximum number of speakers")
117
-
118
- args = parser.parse_args().__dict__
119
- model_name: str = args.pop("model")
120
- model_dir: str = args.pop("model_dir")
121
- output_dir: str = args.pop("output_dir")
122
- device: str = args.pop("device")
123
- os.makedirs(output_dir, exist_ok=True)
124
-
125
- if (threads := args.pop("threads")) > 0:
126
- torch.set_num_threads(threads)
127
-
128
- whisper_implementation = args.pop("whisper_implementation")
129
- print(f"Using {whisper_implementation} for Whisper")
130
-
131
- if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
132
- warnings.warn(f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead.")
133
- args["language"] = "en"
134
-
135
- temperature = args.pop("temperature")
136
- temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
137
- if temperature_increment_on_fallback is not None:
138
- temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
139
- else:
140
- temperature = [temperature]
141
-
142
- vad = args.pop("vad")
143
- vad_initial_prompt_mode = args.pop("vad_initial_prompt_mode")
144
- vad_merge_window = args.pop("vad_merge_window")
145
- vad_max_merge_size = args.pop("vad_max_merge_size")
146
- vad_padding = args.pop("vad_padding")
147
- vad_prompt_window = args.pop("vad_prompt_window")
148
- vad_cpu_cores = args.pop("vad_cpu_cores")
149
- auto_parallel = args.pop("auto_parallel")
150
-
151
- compute_type = args.pop("compute_type")
152
- highlight_words = args.pop("highlight_words")
153
-
154
- auth_token = args.pop("auth_token")
155
- diarization = args.pop("diarization")
156
- num_speakers = args.pop("diarization_num_speakers")
157
- min_speakers = args.pop("diarization_min_speakers")
158
- max_speakers = args.pop("diarization_max_speakers")
159
-
160
- transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
161
- transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
162
- transcriber.set_auto_parallel(auto_parallel)
163
-
164
- if diarization:
165
- transcriber.set_diarization(auth_token=auth_token, enable_daemon_process=False, num_speakers=num_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
166
-
167
- model = create_whisper_container(whisper_implementation=whisper_implementation, model_name=model_name,
168
- device=device, compute_type=compute_type, download_root=model_dir, models=app_config.models)
169
-
170
- if (transcriber._has_parallel_devices()):
171
- print("Using parallel devices:", transcriber.parallel_device_list)
172
-
173
- for audio_path in args.pop("audio"):
174
- sources = []
175
-
176
- # Detect URL and download the audio
177
- if (uri_validator(audio_path)):
178
- # Download from YouTube/URL directly
179
- for source_path in download_url(audio_path, maxDuration=-1, destinationDirectory=output_dir, playlistItems=None):
180
- source_name = os.path.basename(source_path)
181
- sources.append({ "path": source_path, "name": source_name })
182
- else:
183
- sources.append({ "path": audio_path, "name": os.path.basename(audio_path) })
184
-
185
- for source in sources:
186
- source_path = source["path"]
187
- source_name = source["name"]
188
-
189
- vadOptions = VadOptions(vad, vad_merge_window, vad_max_merge_size, vad_padding, vad_prompt_window,
190
- VadInitialPromptMode.from_string(vad_initial_prompt_mode))
191
-
192
- result = transcriber.transcribe_file(model, source_path, temperature=temperature, vadOptions=vadOptions, **args)
193
-
194
- transcriber.write_result(result, source_name, output_dir, highlight_words)
195
-
196
- transcriber.close()
197
-
198
- def uri_validator(x):
199
- try:
200
- result = urlparse(x)
201
- return all([result.scheme, result.netloc])
202
- except:
203
- return False
204
-
205
- if __name__ == '__main__':
206
- cli()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "whatshutup"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "faster-whisper>=1.1.1",
8
+ "ffmpeg>=1.4",
9
+ "gradio>=5.14.0",
10
+ "huggingface-hub>=0.28.1",
11
+ "pydub>=0.25.1",
12
+ "sentencepiece>=0.2.0",
13
+ "torch>=2.6.0",
14
+ "transformers>=4.48.2",
15
+ ]
16
+
17
+ [dependency-groups]
18
+ app = [
19
+ "aiofiles==23.2.1",
20
+ "annotated-types==0.7.0",
21
+ "anyio==4.8.0",
22
+ "audioop-lts==0.2.1 ; python_full_version >= '3.13'",
23
+ "av==14.1.0",
24
+ "certifi==2025.1.31",
25
+ "charset-normalizer==3.4.1",
26
+ "click==8.1.8 ; sys_platform != 'emscripten'",
27
+ "colorama==0.4.6 ; sys_platform == 'win32'",
28
+ "coloredlogs==15.0.1",
29
+ "ctranslate2==4.5.0",
30
+ "fastapi==0.115.8",
31
+ "faster-whisper==1.1.1",
32
+ "ffmpeg==1.4",
33
+ "ffmpy==0.5.0",
34
+ "filelock==3.17.0",
35
+ "flatbuffers==25.1.24",
36
+ "fsspec==2024.12.0",
37
+ "gradio==5.14.0",
38
+ "gradio-client==1.7.0",
39
+ "h11==0.14.0",
40
+ "httpcore==1.0.7",
41
+ "httpx==0.28.1",
42
+ "huggingface-hub==0.28.1",
43
+ "humanfriendly==10.0",
44
+ "idna==3.10",
45
+ "jinja2==3.1.5",
46
+ "markdown-it-py==3.0.0 ; sys_platform != 'emscripten'",
47
+ "markupsafe==2.1.5",
48
+ "mdurl==0.1.2 ; sys_platform != 'emscripten'",
49
+ "mpmath==1.3.0",
50
+ "networkx==3.4.2",
51
+ "numpy==2.2.2",
52
+ "nvidia-cublas-cu12==12.4.5.8 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
53
+ "nvidia-cuda-cupti-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
54
+ "nvidia-cuda-nvrtc-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
55
+ "nvidia-cuda-runtime-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
56
+ "nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
57
+ "nvidia-cufft-cu12==11.2.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
58
+ "nvidia-curand-cu12==10.3.5.147 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
59
+ "nvidia-cusolver-cu12==11.6.1.9 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
60
+ "nvidia-cusparse-cu12==12.3.1.170 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
61
+ "nvidia-cusparselt-cu12==0.6.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
62
+ "nvidia-nccl-cu12==2.21.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
63
+ "nvidia-nvjitlink-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
64
+ "nvidia-nvtx-cu12==12.4.127 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
65
+ "onnxruntime==1.20.1",
66
+ "orjson==3.10.15",
67
+ "packaging==24.2",
68
+ "pandas==2.2.3",
69
+ "pillow==11.1.0",
70
+ "protobuf==5.29.3",
71
+ "pydantic==2.10.6",
72
+ "pydantic-core==2.27.2",
73
+ "pydub==0.25.1",
74
+ "pygments==2.19.1 ; sys_platform != 'emscripten'",
75
+ "pyreadline3==3.5.4 ; sys_platform == 'win32'",
76
+ "python-dateutil==2.9.0.post0",
77
+ "python-multipart==0.0.20",
78
+ "pytz==2025.1",
79
+ "pyyaml==6.0.2",
80
+ "regex==2024.11.6",
81
+ "requests==2.32.3",
82
+ "rich==13.9.4 ; sys_platform != 'emscripten'",
83
+ "ruff==0.9.4 ; sys_platform != 'emscripten'",
84
+ "safehttpx==0.1.6",
85
+ "safetensors==0.5.2",
86
+ "semantic-version==2.10.0",
87
+ "sentencepiece==0.2.0",
88
+ "setuptools==75.8.0",
89
+ "shellingham==1.5.4 ; sys_platform != 'emscripten'",
90
+ "six==1.17.0",
91
+ "sniffio==1.3.1",
92
+ "starlette==0.45.3",
93
+ "sympy==1.13.1",
94
+ "tokenizers==0.21.0",
95
+ "tomlkit==0.13.2",
96
+ "torch==2.6.0",
97
+ "tqdm==4.67.1",
98
+ "transformers==4.48.2",
99
+ "triton==3.2.0 ; platform_machine == 'x86_64' and sys_platform == 'linux'",
100
+ "typer==0.15.1 ; sys_platform != 'emscripten'",
101
+ "typing-extensions==4.12.2",
102
+ "tzdata==2025.1",
103
+ "urllib3==2.3.0",
104
+ "uvicorn==0.34.0 ; sys_platform != 'emscripten'",
105
+ "websockets==14.2",
106
+ ]
requirements-fasterWhisper.txt DELETED
@@ -1,16 +0,0 @@
1
- ctranslate2
2
- faster-whisper
3
- ffmpeg-python==0.2.0
4
- gradio==3.38.0
5
- gradio-client==0.8.1
6
- yt-dlp
7
- json5
8
- torch
9
- torchaudio
10
- more_itertools
11
-
12
- # Needed by diarization
13
- intervaltree
14
- srt
15
- torch
16
- https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements-whisper.txt DELETED
@@ -1,16 +0,0 @@
1
- git+https://github.com/huggingface/transformers
2
- git+https://github.com/openai/whisper.git
3
- transformers
4
- ffmpeg-python==0.2.0
5
- gradio==3.38.0
6
- gradio-client==0.8.1
7
- yt-dlp
8
- torchaudio
9
- altair
10
- json5
11
-
12
- # Needed by diarization
13
- intervaltree
14
- srt
15
- torch
16
- https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  # This file was autogenerated by uv via the following command:
2
- # uv export
3
  aiofiles==23.2.1 \
4
  --hash=sha256:19297512c647d4b27a2cf7c34caa7e405c0d60b5560618a29a9fe027b18b0107 \
5
  --hash=sha256:84ec2218d8419404abcb9f0c02df3f34c6e0a68ed41072acfb1cef5cbc29051a
 
1
  # This file was autogenerated by uv via the following command:
2
+ # uv export --output-file .\requirements.txt
3
  aiofiles==23.2.1 \
4
  --hash=sha256:19297512c647d4b27a2cf7c34caa7e405c0d60b5560618a29a9fe027b18b0107 \
5
  --hash=sha256:84ec2218d8419404abcb9f0c02df3f34c6e0a68ed41072acfb1cef5cbc29051a
uv.lock ADDED
The diff for this file is too large to render. See raw diff