import time
import logging
import os
import gradio as gr
from faster_whisper import WhisperModel
from languages import get_language_names, get_language_from_name
from subtitle_manager import Subtitle
from pathlib import Path
import psutil
import pynvml

logging.basicConfig(level=logging.INFO)
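# Cache the most recently loaded model so repeated requests with the same model name reuse it.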
last_model = None
model = None


def get_free_gpu_memory():
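    """Return the free memory (in bytes) of the first CUDA device, queried via NVML."""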
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    pynvml.nvmlShutdown()
    return meminfo.free

def get_workers_count():
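    """Estimate the worker count, allowing roughly 2 GB of free GPU (or system) memory per worker."""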
    try:
        memory = get_free_gpu_memory()
        logging.info("Using free CUDA memory")
    except Exception:
        memory = psutil.virtual_memory().available
        logging.info("No CUDA device found, using free RAM")

    logging.info(f"memory: {memory / 1_000_000_000:.2f} GB")
    workers = max(1, int(memory / 2_000_000_000))  # ~2 GB per worker, at least one
    logging.info(f"workers: {workers}")
    return workers

def transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                    chunk_length, compute_type, beam_size, vad_filter, min_silence_duration_ms,
                                    progress=gr.Progress()):
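    """Transcribe audio from a URL, uploaded files and/or the microphone with faster-whisper.

    Returns the generated SRT files plus the (currently unused) vtt/txt text outputs.
    """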
    global last_model
    global model

    progress(0, desc="Loading Audio..")
    logging.info(f"languageName:{languageName}")
    logging.info(f"urlData:{urlData}")
    logging.info(f"multipleFiles:{multipleFiles}")
    logging.info(f"microphoneData:{microphoneData}")
    logging.info(f"task: {task}")
    logging.info(f"chunk_length: {chunk_length}")

    if last_model is None or modelName != last_model:
        logging.info("Loading model (first run or model changed)")
        progress(0.1, desc="Loading Model..")
        model = None  # release the previous model before loading the new one
        model = WhisperModel(modelName, device="auto", compute_type=compute_type, cpu_threads=os.cpu_count())
        logging.info("Model loaded")
    else:
        logging.info("Model not changed")
    last_model = modelName

    srt_sub = Subtitle("srt")
    # vtt_sub = Subtitle("vtt")
    # txt_sub = Subtitle("txt")

    files = []
    if multipleFiles:
        files += multipleFiles
    if urlData:
        files.append(urlData)
    if microphoneData:
        files.append(microphoneData)
    logging.info(files)

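    # Map the UI language name to its language code; None means automatic detection.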
    languageName = None if languageName == "Automatic Detection" else get_language_from_name(languageName).code

    files_out = []
    vtt = ""
    txt = ""
    for file in progress.tqdm(files, desc="Working..."):

        start_time = time.time()
        segments, info = model.transcribe(
            file,
            beam_size=beam_size,
            vad_filter=vad_filter,
            language=languageName,
            vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
            # max_new_tokens=128,
            condition_on_previous_text=False,
            chunk_length=chunk_length,
            )

        file_name = Path(file).stem
        files_out_srt = srt_sub.write_subtitle(segments, file_name, modelName, progress)
        # txt = txt_sub.get_subtitle(segments, progress)
        logging.info(f"transcribe: {time.time() - start_time:.1f} sec.")
        files_out.append(files_out_srt)

    return files_out, vtt, txt


with gr.Blocks(title="Fast Whisper WebUI") as demo:
    description = "faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is a fast inference engine for Transformer models."
    article = "Read the [documentation here](https://github.com/SYSTRAN/faster-whisper)."
    whisper_models = [
        "tiny", "tiny.en",
        "base", "base.en", "Systran/faster-whisper-base.en", "Systran/faster-whisper-base",
        "small", "small.en", "distil-small.en", "Systran/faster-distil-whisper-small.en", 
        "medium", "medium.en", "distil-medium.en", "Systran/faster-distil-whisper-medium.en", "Systran/faster-whisper-medium",
        "large",
        "large-v1", "Systran/faster-whisper-large-v1",
        "large-v2", "distil-large-v2", "Systran/faster-distil-whisper-large-v2",
        "large-v3", "distil-large-v3", "Systran/faster-distil-whisper-large-v3", "distil-whisper/distil-large-v3-ct2",
    ]
    compute_types = [
        "auto", "default", "int8", "int8_float32",
        "int8_float16", "int8_bfloat16", "int16",
        "float16", "float32", "bfloat16"
    ]


    # settings
    # can't pass Dropdowns defined inside the Accordion to the Interface inputs
    # with gr.Accordion("Settings", open=False):
        # task = gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive = True),
        # chunk_length = gr.Number(label='chunk_length',value=30, interactive = True),
        # compute_type = gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive = True),
        # beam_size = gr.Number(label='beam_size',value=5, interactive = True),
        # vad_filter = gr.Checkbox(label='vad_filter',info='Use vad_filter', value=True),
        # vad_min_silence_duration_ms = gr.Number(label='Vad min_silence_duration_ms',value=500, interactive = True),

    gr.Interface(
        fn=transcribe_webui_simple_progress,
        description=description,
        article=article,
        inputs=[
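            # The components below map positionally onto transcribe_webui_simple_progress's parameters.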
            gr.Dropdown(choices=whisper_models, value="distil-large-v2", label="Model", info="Select whisper model", interactive=True),
            gr.Dropdown(choices=["Automatic Detection"] + sorted(get_language_names()), value="Automatic Detection", label="Language", info="Select audio voice language", interactive=True),
            gr.Text(label="URL", info="(YouTube, etc.)", interactive=True),
            gr.File(label="Upload Files", file_count="multiple"),
            gr.Audio(sources=["upload", "microphone"], type="filepath", label="Input Audio"),

            gr.Dropdown(choices=["transcribe", "translate"], label="Task", value="transcribe", interactive=True),
            gr.Number(label="chunk_length", value=30, interactive=True),
            gr.Dropdown(label="compute_type", choices=compute_types, value="auto", interactive=True),
            gr.Number(label="beam_size", value=5, interactive=True),
            gr.Checkbox(label="vad_filter", info="Use vad_filter", value=True),
            gr.Number(label="Vad min_silence_duration_ms", value=500, interactive=True),
        ],
        outputs=[
            gr.File(label="Download"),
            gr.Text(label="Transcription"), 
            gr.Text(label="Segments"),
        ]
    )

if __name__ == "__main__":
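    # Size the queue's default concurrency from available GPU/CPU memory (~2 GB per worker).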
    demo.queue(default_concurrency_limit=get_workers_count())
    demo.launch(share=True)