import os

# Runtime installs kept from the original Space-style setup; `evaluate` is
# added because it is imported below, and `jiwer` is the backend its WER
# metric relies on.
os.system("pip install transformers==4.27.0")
os.system("pip install numpy==1.23")
os.system("pip install jiwer")
os.system("pip install datasets[audio]")
os.system("pip install evaluate")

import torch
import gradio as gr
from datasets import load_dataset, Audio
from evaluate import load
from transformers import (
    pipeline,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)
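
# Use the GPU when available and fall back to CPU otherwise, so the
# script also runs on CPU-only hardware.
device = "cuda" if torch.cuda.is_available() else "cpu"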


processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
model = WhisperForConditionalGeneration.from_pretrained("mskov/whisper-small-esc50").to(device)

def map_to_pred(batch):
    # Normalize the ESC-50 category label as the reference text and
    # transcribe the audio clip with the fine-tuned Whisper model.
    audio = batch["audio"]
    input_features = processor(
        audio["array"], sampling_rate=16000, return_tensors="pt"
    ).input_features
    batch["reference"] = processor.tokenizer._normalize(batch["category"])

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    print(batch["prediction"])
    return batch
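
# NOTE: the evaluation set is assumed to be ESC-50 (the `category` field and
# the checkpoint name both point to it; the original referenced an undefined
# `librispeech_test_clean`). "ashraq/esc50" is a community mirror of the
# dataset on the Hub; audio is resampled to the 16 kHz rate Whisper expects.
esc50_test = load_dataset("ashraq/esc50", split="train")
esc50_test = esc50_test.cast_column("audio", Audio(sampling_rate=16000))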

result = esc50_test.map(map_to_pred)

wer_metric = load("wer")
print(100 * wer_metric.compute(references=result["reference"], predictions=result["prediction"]))
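
# NOTE: `pipe` is used by the Gradio handler below but was never defined;
# building an ASR pipeline from the same checkpoint is an assumption about
# the demo's intended behavior.
pipe = pipeline(
    "automatic-speech-recognition",
    model="mskov/whisper-small-esc50",
    device=0 if torch.cuda.is_available() else -1,
)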


    
def transcribe(audio):
    # Run the microphone recording through the ASR pipeline and
    # return the transcribed text.
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(source="microphone", type="filepath"), 
    outputs="text",
    title="Whisper Small ESC50 Test",
)

iface.launch()


# Debugging scratch retained from the original script; it references an
# undefined `inputs`, so it stays commented out.
'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))
'''