File size: 1,479 Bytes
e2c81e7
 
 
 
 
 
 
 
 
bbd4e37
 
e2c81e7
7935c8d
bbd4e37
21e6f0a
 
bbd4e37
e2c81e7
 
 
 
 
 
 
 
 
 
 
 
 
bbd4e37
7935c8d
 
 
 
bbd4e37
7935c8d
bbd4e37
21e6f0a
 
bbd4e37
e2c81e7
7935c8d
 
e68b7ad
7935c8d
bbd4e37
e2c81e7
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import json
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# config
# Hugging Face checkpoint id for the Kotoba Whisper Japanese ASR model.
model_id = "kotoba-tech/kotoba-whisper-v1.0"
# Half precision on GPU for speed/memory; full precision on CPU (fp16 is slow/unsupported there).
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Lazily-initialized module singletons: populated by load_model() / set_prompt().
model = None
pipe = None
initial_prompt = None

def load_model():
    """Load the Kotoba Whisper checkpoint and build the ASR pipeline.

    Populates the module-level ``model`` and ``pipe`` singletons so that
    subsequent transcription calls reuse the same loaded model.
    """
    global model, pipe

    # Instantiate the seq2seq speech model and move it onto the target device.
    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    asr_model.to(device)
    model = asr_model

    # Tokenizer and feature extractor come from the same checkpoint.
    processor = AutoProcessor.from_pretrained(model_id)
    pipeline_kwargs = dict(
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )
    pipe = pipeline("automatic-speech-recognition", **pipeline_kwargs)

def set_prompt(prompt):
    """Record *prompt* as the initial prompt used to bias later transcriptions.

    The value is stored in the module-level ``initial_prompt`` singleton and
    consumed by ``speech_to_text``; pass a falsy value to clear it.
    """
    global initial_prompt
    initial_prompt = prompt

def speech_to_text(audio_file, _model_size = None):
    """Transcribe *audio_file* with the Kotoba Whisper pipeline.

    Lazily loads the model on first use. If an initial prompt has been set via
    ``set_prompt``, its token ids are passed to generation to bias decoding.

    Args:
        audio_file: Path (or pipeline-compatible input) of the audio to transcribe.
        _model_size: Ignored; kept for interface compatibility with callers.

    Returns:
        A ``(text, json_str)`` tuple: the transcribed text and the full pipeline
        result serialized as JSON (empty string if the result is not serializable).
    """
    global model, pipe, initial_prompt

    # Lazy one-time initialization of the module-level model/pipeline singletons.
    if not model:
        load_model()

    # run inference
    generate_kwargs = {}
    if initial_prompt:
        # Bias decoding with the configured prompt; tensor must live on the model's device.
        generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(initial_prompt, return_tensors="pt").to(device)
    result = pipe(audio_file, generate_kwargs=generate_kwargs)

    # Best-effort serialization: the pipeline result may contain non-JSON values
    # (e.g. tensors). Catch only json.dumps failures instead of a bare `except:`,
    # which would also swallow KeyboardInterrupt/SystemExit.
    try:
        res = json.dumps(result)
    except (TypeError, ValueError):
        res = ''

    return result["text"], res