"""Evaluate and demo the ``mskov/whisper-small-esc50`` Whisper checkpoint.

On start-up the script:

1. installs its pinned third-party dependencies with pip,
2. loads the Whisper processor and model,
3. runs the model over an evaluation dataset and prints the word error
   rate (as a percentage) between normalized predictions and the
   normalized ``category`` labels, and
4. launches a Gradio microphone demo that transcribes recorded audio.
"""

import os
import subprocess
import sys


def _pip_install(requirement):
    """Best-effort install of *requirement* into the running interpreter.

    ``sys.executable -m pip`` is used so the package lands in the same
    environment that executes this script. A failed install only emits a
    warning; the import that follows will fail loudly if the package is
    really missing.
    """
    completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if completed.returncode != 0:
        print(
            f"warning: pip install {requirement!r} exited with code "
            f"{completed.returncode}",
            file=sys.stderr,
        )


# Runtime installs must happen before the third-party imports below, which is
# why those imports cannot sit at the very top of the file.
for _requirement in (
    "transformers==4.27.0",
    "numpy==1.23",
    "jiwer",
    "datasets[audio]",
    "evaluate",  # imported below; the original script never installed it
):
    _pip_install(_requirement)

import gradio as gr
import torch
# NOTE(review): `set_caching_enabled` is unused here and may be missing from
# recent `datasets` releases (the install above is unpinned) - confirm.
from datasets import Audio, disable_caching, load_dataset, set_caching_enabled
from evaluate import evaluator, load
from jiwer import wer
from transformers import (
    AutoFeatureExtractor,
    AutoModelForSequenceClassification,
    AutoProcessor,
    AutoTokenizer,
    BertTokenizer,
    GPT2Model,
    WhisperConfig,
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,
    WhisperModel,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)

MODEL_ID = "mskov/whisper-small-esc50"

# Sampling rate the evaluation audio is resampled to before feature
# extraction (the original script hard-coded this value in the processor call).
TARGET_SAMPLING_RATE = 16000

# NOTE(review): the original script evaluated an undefined variable
# (`librispeech_test_clean`), so the intended dataset is unknown. The defaults
# below are an assumption; the dataset must expose an "audio" column and a
# text "category" column. Override via the environment if they are wrong.
EVAL_DATASET_ID = os.environ.get("ESC50_EVAL_DATASET", "ashraq/esc50")
EVAL_DATASET_SPLIT = os.environ.get("ESC50_EVAL_SPLIT", "train")

# Fall back to CPU so the script still runs on hosts without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)

# Speech-recognition pipeline used by the Gradio demo; it reuses the model and
# processor loaded above instead of downloading them a second time.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if DEVICE == "cuda" else -1,
)


def map_to_pred(batch):
    """Add normalized ``reference`` and ``prediction`` fields to one example.

    Args:
        batch: A single dataset example. It must provide ``batch["audio"]``
            (a mapping with ``"array"`` and ``"sampling_rate"``) and
            ``batch["category"]`` (the reference text).

    Returns:
        The same mapping with ``"reference"`` (normalized category) and
        ``"prediction"`` (normalized model output) added.
    """
    audio = batch["audio"]
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features

    # `_normalize` is a private tokenizer helper; it is available in the
    # transformers version pinned above.
    batch["reference"] = processor.tokenizer._normalize(batch["category"])

    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(DEVICE))[0]
    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    print(batch["prediction"])
    return batch


eval_dataset = load_dataset(EVAL_DATASET_ID, split=EVAL_DATASET_SPLIT)
# Resample on the fly so every clip is decoded at TARGET_SAMPLING_RATE.
eval_dataset = eval_dataset.cast_column(
    "audio", Audio(sampling_rate=TARGET_SAMPLING_RATE)
)

result = eval_dataset.map(map_to_pred)

# Named `wer_metric` so it no longer shadows the `wer` imported from jiwer.
wer_metric = load("wer")
print(
    100
    * wer_metric.compute(
        references=result["reference"],
        predictions=result["prediction"],
    )
)


def transcribe(audio):
    """Transcribe a recorded clip for the Gradio interface.

    Args:
        audio: Path to the recorded audio file, or ``None`` when the user
            submitted without recording anything.

    Returns:
        The transcribed text, or an empty string when no audio was given.
    """
    if audio is None:
        return ""
    return pipe(audio)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small ESC50 Test",
)

iface.launch()