import os

# Install pinned dependencies at runtime (Hugging Face Spaces style).
os.system("pip install transformers==4.27.0")
os.system("pip install numpy==1.23")
os.system("pip install jiwer")
os.system("pip install evaluate")
os.system("pip install datasets[audio]")

import torch
import gradio as gr
from datasets import load_dataset, Audio
from evaluate import load
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Load the fine-tuned Whisper checkpoint and its processor.
processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperForConditionalGeneration.from_pretrained("mskov/whisper-small-esc50").to(device)
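# A minimal sketch for loading an evaluation split. The dataset name
# ("ashraq/esc50"), its "fold" column, and the choice of fold 5 as the
# held-out set are assumptions, not taken from the original file; swap in
# whatever split this checkpoint was actually evaluated on.
esc50 = load_dataset("ashraq/esc50", split="train")
esc50 = esc50.cast_column("audio", Audio(sampling_rate=16000))
esc50_test = esc50.filter(lambda example: example["fold"] == 5)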
def map_to_pred(batch):
    # Featurize the raw waveform and normalize the reference label text.
    audio = batch["audio"]
    input_features = processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
    batch["reference"] = processor.tokenizer._normalize(batch["category"])

    # Greedy decode, then normalize the transcription the same way.
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    print(batch["prediction"])
    return batch
# Map over the ESC-50 test split (the original referenced an undefined
# librispeech_test_clean, but the code above reads ESC-50's "category" column).
result = esc50_test.map(map_to_pred)
# Score predictions against references with word error rate (WER).
wer_metric = load("wer")
print(100 * wer_metric.compute(references=result["reference"], predictions=result["prediction"]))
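# A minimal sketch of the ASR pipeline backing the Gradio demo below; `pipe`
# was undefined in the original file, and the task/device arguments here are
# assumptions.
pipe = pipeline(
    "automatic-speech-recognition",
    model="mskov/whisper-small-esc50",
    device=0 if torch.cuda.is_available() else -1,
)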
def transcribe(audio):
    # Run the recorded clip through the pipeline and return only the text
    # (the original returned an undefined second value, `test`).
    text = pipe(audio)["text"]
    return text
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small ESC50 Test",
)
iface.launch()
# Scratch code kept for reference: probing hidden states this way assumes a
# bare WhisperModel (WhisperForConditionalGeneration outputs have no
# last_hidden_state attribute), and `inputs` was never defined above.
'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))
'''