aka7774 committed on
Commit
e2c81e7
·
verified ·
1 Parent(s): 1e29629

Update fn.py

Browse files
Files changed (1) hide show
  1. fn.py +33 -21
fn.py CHANGED
@@ -1,36 +1,48 @@
1
- from faster_whisper import WhisperModel
 
 
 
 
 
 
 
 
2
 
3
  model = None
4
  model_size = None
 
5
 
6
  def load_model(_model_size):
7
- global model_size, model
8
 
9
  if _model_size and model_size != _model_size:
10
  model_size = _model_size
11
 
12
- try:
13
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
14
- except:
15
- model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
 
 
 
 
 
 
 
 
16
 
17
  def speech_to_text(audio_file, _model_size = None):
18
- global model_size, model
19
 
20
  load_model(_model_size)
21
 
22
- segments, info = model.transcribe(
23
- audio_file,
24
- language='ja',
25
- beam_size=5,
26
- vad_filter=True,
27
- without_timestamps=False,
28
- )
29
-
30
- text_only = ''
31
- text_with_timestamps = ''
32
- for segment in segments:
33
- text_only += f"{segment.text}\n"
34
- text_with_timestamps += f"{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n"
35
 
36
- return text_only, text_with_timestamps
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
+ from datasets import load_dataset
5
+
# Checkpoint and runtime placement shared by every pipeline build.
model_id = "kotoba-tech/kotoba-whisper-v1.0"
if torch.cuda.is_available():
    # Half precision on the first CUDA device.
    torch_dtype = torch.float16
    device = "cuda:0"
else:
    # Full precision on the CPU.
    torch_dtype = torch.float32
    device = "cpu"

# Lazily populated state; load_model() assigns all three.
model_size = None
model = None
pipe = None
14
 
def load_model(_model_size):
    """Build the speech-recognition pipeline if it is missing or stale.

    The checkpoint is always ``model_id``; ``_model_size`` does not select
    different weights. It is only remembered in the module-level
    ``model_size`` and, when it changes, triggers a rebuild.

    Args:
        _model_size: Optional size label. A falsy value (``None``, ``""``)
            means "keep whatever is loaded", but the pipeline is still
            built on the first call so that ``pipe`` is never left unset.

    Side effects:
        Rebinds the module-level ``model``, ``pipe`` and (when a new label
        is given) ``model_size``.
    """
    global model_size, model, pipe

    size_changed = bool(_model_size) and model_size != _model_size

    # Nothing to do when a pipeline exists and no new size was requested.
    if pipe is not None and not size_changed:
        return

    # Load the weights and move them to the configured device.
    loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    loaded_model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    loaded_pipe = pipeline(
        "automatic-speech-recognition",
        model=loaded_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Publish the new state only after everything loaded successfully, so a
    # failed load is retried on the next call instead of being cached.
    model = loaded_model
    pipe = loaded_pipe
    if size_changed:
        model_size = _model_size
34
 
def speech_to_text(audio_file, _model_size = None):
    """Transcribe ``audio_file`` with the shared recognition pipeline.

    Args:
        audio_file: Audio input handed unchanged to the pipeline.
        _model_size: Optional size label forwarded to ``load_model``.

    Returns:
        A ``(text, raw_json)`` tuple: ``text`` is ``result["text"]`` and
        ``raw_json`` is the whole pipeline result serialized as JSON, or
        ``''`` when the result cannot be serialized.
    """
    load_model(_model_size)

    # Run inference.
    result = pipe(audio_file)

    try:
        # ensure_ascii=False keeps Japanese text readable instead of
        # escaping every character to \uXXXX.
        res = json.dumps(result, ensure_ascii=False)
    except (TypeError, ValueError):
        # Best effort: a non-serializable result must not fail the
        # transcription itself.
        res = ''

    return result["text"], res