File size: 1,327 Bytes
bbd4e37
 
 
 
00f6f1d
b22bcbc
82d8c86
bbd4e37
 
 
 
 
 
 
0ae9155
bbd4e37
0ae9155
bbd4e37
 
b22bcbc
 
00f6f1d
b22bcbc
 
00f6f1d
82d8c86
 
 
 
bbd4e37
82d8c86
bbd4e37
 
 
0ae9155
 
00f6f1d
b22bcbc
0ae9155
 
 
82d8c86
0ae9155
bbd4e37
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from faster_whisper import WhisperModel

# Module-level transcription state shared by the functions below.
# `model` and `model_size` are filled in by load_model(); `initial_prompt`
# is set through set_prompt(). All three start out unset.
model = model_size = initial_prompt = None
# Language passed to every transcribe() call; changed via set_prompt().
language = 'ja'
# Extra keyword arguments expanded into model.transcribe() by speech_to_text().
transcribe_kwargs = dict()

def load_model(_model_size):
    """Ensure the module-level ``model`` is a WhisperModel of the requested size.

    The model is only (re)built when none is loaded yet or when a different
    size is requested; repeated calls with the same size are no-ops, so the
    per-transcription call from ``speech_to_text`` does not reload it.

    Args:
        _model_size: Size/identifier handed to ``WhisperModel``. A falsy value
            means "keep using the currently configured ``model_size``".

    Raises:
        ValueError: If no size was passed and none has been configured yet.
        Exception: Whatever ``WhisperModel`` raises when the CPU fallback
            also fails.
    """
    global model_size, model

    requested_size = _model_size or model_size
    if not requested_size:
        raise ValueError("load_model() needs a model size: none given and none configured")

    # Already holding the right model: skip the expensive reload.
    if model is not None and requested_size == model_size:
        return

    try:
        loaded = WhisperModel(requested_size, device="cuda", compute_type="float16")
    except Exception:
        # Best-effort fallback when the CUDA model cannot be created.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        loaded = WhisperModel(requested_size, device="cpu", compute_type="int8")

    # Commit both globals together, and only after a successful load, so a
    # failure never leaves `model_size` describing a model that was not built.
    model_size = requested_size
    model = loaded

def set_prompt(prompt, _language=None):
    """Record the prompt, and optionally the language, for later transcriptions.

    Args:
        prompt: Stored unconditionally in the module-level ``initial_prompt``
            (so passing ``None`` clears it).
        _language: New value for the module-level ``language``. Falsy values
            (``None``, ``""``) leave the current language unchanged.
    """
    global initial_prompt, language

    if _language:
        language = _language
    initial_prompt = prompt

def set_transcribe_kwargs(args):
    """Replace the extra keyword arguments forwarded to ``model.transcribe``.

    Args:
        args: Mapping of keyword arguments. ``None`` or an empty mapping
            resets the extras to ``{}``.
    """
    global transcribe_kwargs
    # Store a copy: later mutation of the caller's dict must not silently
    # change the options, and None must not reach the `**` expansion in
    # speech_to_text (which would raise TypeError).
    transcribe_kwargs = dict(args) if args else {}

def speech_to_text(audio_file, _model_size = None):
    """Transcribe ``audio_file`` and return the text with and without timestamps.

    Args:
        audio_file: Audio input passed straight through to ``model.transcribe``.
        _model_size: Optional model size forwarded to ``load_model`` before
            transcribing.

    Returns:
        A ``(text_only, text_with_timestamps)`` tuple of strings. Each segment
        contributes one newline-terminated line: its text alone in the first
        string, and ``start<TAB>end<TAB>text`` (times with two decimals) in
        the second.
    """
    load_model(_model_size)

    # Build one options dict so entries from `transcribe_kwargs` override the
    # defaults. Passing the defaults as explicit keywords next to
    # `**transcribe_kwargs` raised "got multiple values for keyword argument"
    # as soon as a caller tried to override e.g. beam_size.
    options = {
        "initial_prompt": initial_prompt,
        "language": language,
        "beam_size": 5,
        "vad_filter": True,
        "without_timestamps": False,
    }
    options.update(transcribe_kwargs or {})

    segments, _info = model.transcribe(audio_file, **options)

    # Collect lines and join once instead of repeated string concatenation.
    plain_lines = []
    stamped_lines = []
    for segment in segments:
        plain_lines.append(f"{segment.text}\n")
        stamped_lines.append(f"{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n")

    return "".join(plain_lines), "".join(stamped_lines)