import logging
import os
import time
from pathlib import Path

import gradio as gr
import psutil
import pynvml
from faster_whisper import WhisperModel

from languages import get_language_names, get_language_from_name
from subtitle_manager import Subtitle

logging.basicConfig(level=logging.INFO)

# Cached model state so repeated requests with the same model skip reloading.
last_model = None
model = None
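
# Query free VRAM via NVML. Note: only device index 0 is checked;
# multi-GPU setups would need to iterate over all devices.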
def get_free_gpu_memory():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    pynvml.nvmlShutdown()
    return meminfo.free
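
# Estimate a concurrency limit from available memory (free VRAM if NVML
# works, otherwise system RAM), assuming roughly 2 GB per worker.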
def get_workers_count():
    try:
        memory = get_free_gpu_memory()
        logging.info("Using free CUDA memory")
    except Exception:
        memory = psutil.virtual_memory().available
        logging.info("Using available RAM")
    logging.info(f"memory: {memory / 1_000_000_000} GB")
    # Keep at least one worker even when less than 2 GB is free.
    workers = max(1, int(memory / 2_000_000_000))
    logging.info(f"workers: {workers}")
    return workers
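
# Main Gradio handler: resolves the requested model, collects all input
# sources (uploads, URL, microphone), transcribes each file with
# faster-whisper, and writes SRT subtitles.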
def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                     chunk_length, compute_type, beam_size, vad_filter, min_silence_duration_ms,
                                     progress=gr.Progress()):
    global last_model
    global model

    progress(0, desc="Loading Audio..")
    logging.info(f"languageName: {languageName}")
    logging.info(f"urlData: {urlData}")
    logging.info(f"multipleFiles: {multipleFiles}")
    logging.info(f"microphoneData: {microphoneData}")
    logging.info(f"task: {task}")
    logging.info(f"chunk_length: {chunk_length}")

    # Reload only when the requested model differs from the cached one.
    if last_model is None or modelName != last_model:
        logging.info("First or new model")
        progress(0.1, desc="Loading Model..")
        model = None  # release the previous model before loading the new one
        model = WhisperModel(modelName, device="auto", compute_type=compute_type, cpu_threads=os.cpu_count())
        logging.info("Model loaded")
    else:
        logging.info("Model not changed")
    last_model = modelName
    srt_sub = Subtitle("srt")
    # vtt_sub = Subtitle("vtt")
    # txt_sub = Subtitle("txt")

    # Collect every provided input source into one list of file paths.
    files = []
    if multipleFiles:
        files += multipleFiles
    if urlData:
        files.append(urlData)
    if microphoneData:
        files.append(microphoneData)
    logging.info(files)

    languageName = None if languageName == "Automatic Detection" else get_language_from_name(languageName).code

    files_out = []
    vtt = ""
    txt = ""
    for file in progress.tqdm(files, desc="Working..."):
        start_time = time.time()
        segments, info = model.transcribe(
            file,
            beam_size=beam_size,
            vad_filter=vad_filter,
            language=languageName,
            vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
            # max_new_tokens=128,
            condition_on_previous_text=False,
            chunk_length=chunk_length,
        )
        file_name = Path(file).stem
        files_out_srt = srt_sub.write_subtitle(segments, file_name, modelName, progress)
        # txt = txt_sub.get_subtitle(segments, progress)
        logging.info(f"transcribe: {time.time() - start_time} sec.")
        files_out += [files_out_srt]
    return files_out, vtt, txt
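
# Build the web UI: a gr.Interface wired to the handler above, hosted
# inside a gr.Blocks container.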
with gr.Blocks(title="Fast Whisper WebUI") as demo:
    description = "faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models."
    article = "Read the [documentation here](https://github.com/SYSTRAN/faster-whisper)."
    whisper_models = [
        "tiny", "tiny.en",
        "base", "base.en", "Systran/faster-whisper-base.en", "Systran/faster-whisper-base",
        "small", "small.en", "distil-small.en", "Systran/faster-distil-whisper-small.en",
        "medium", "medium.en", "distil-medium.en", "Systran/faster-distil-whisper-medium.en", "Systran/faster-whisper-medium",
        "large",
        "large-v1", "Systran/faster-whisper-large-v1",
        "large-v2", "distil-large-v2", "Systran/faster-distil-whisper-large-v2",
        "large-v3", "distil-large-v3", "Systran/faster-distil-whisper-large-v3", "distil-whisper/distil-large-v3-ct2",
    ]
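    # compute_type selects the CTranslate2 quantization/precision used for inference.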
    compute_types = [
        "auto", "default", "int8", "int8_float32",
        "int8_float16", "int8_bfloat16", "int16",
        "float16", "float32", "bfloat16",
    ]
    # Settings
    # (can't put Dropdown components created here into the Interface inputs)
    # with gr.Accordion("Settings", open=False):
    #     task = gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
    #     chunk_length = gr.Number(label='chunk_length', value=30, interactive=True),
    #     compute_type = gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
    #     beam_size = gr.Number(label='beam_size', value=5, interactive=True),
    #     vad_filter = gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
    #     vad_min_silence_duration_ms = gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),
    gr.Interface(
        fn=transcribe_webui_simple_progress,
        description=description,
        article=article,
        inputs=[
            gr.Dropdown(choices=whisper_models, value="distil-large-v2", label="Model", info="Select whisper model", interactive=True),
            gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", info="Select audio voice language", interactive=True),
            gr.Text(label="URL", info="(YouTube, etc.)", interactive=True),
            gr.File(label="Upload Files", file_count="multiple"),
            gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio"),
            gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
            gr.Number(label='chunk_length', value=30, interactive=True),
            gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
            gr.Number(label='beam_size', value=5, interactive=True),
            gr.Checkbox(label='vad_filter', info='Use vad_filter', value=True),
            gr.Number(label='Vad min_silence_duration_ms', value=500, interactive=True),
        ],
        outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"),
            gr.Text(label="Segments"),
        ],
    )
if __name__ == "__main__":
    # Size the request queue from available memory, then launch with a public share link.
    demo.queue(default_concurrency_limit=get_workers_count())
    demo.launch(share=True)