import os
import sys

# Runtime dependency installs (a Hugging Face Spaces workaround; a
# requirements.txt would be the usual alternative).
os.system("pip install transformers==4.27.0")
os.system("pip install numpy==1.23")

from transformers import (
    pipeline,
    WhisperModel,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    AutoFeatureExtractor,
    AutoProcessor,
    WhisperConfig,
)

os.system("pip install jiwer")
from jiwer import wer

os.system("pip install datasets[audio]")
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
import gradio as gr

set_caching_enabled(False)  # deprecated alias of disable_caching(); kept for older datasets versions
disable_caching()

huggingface_token = os.environ["huggingface_token"]

# ASR pipeline backed by the fine-tuned Whisper checkpoint.
pipe = pipeline(model="mskov/whisper-small-esc50")
print(pipe)

'''
model = WhisperModel.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
'''

# Load the test split and resample its audio column to the 16 kHz Whisper expects.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
print(
    dataset,
    "and at 0[audio][array] ",
    dataset[0]["audio"]["array"],
    type(dataset[0]["audio"]["array"]),
    "and at audio : ",
    dataset[0]["audio"],
)


def transcribe(audio):
    """Transcribe a recorded audio file with the Whisper pipeline."""
    text = pipe(audio)["text"]
    return text


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Miso Test",
)
iface.launch()

'''
inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
print("inputs ::: ", inputs, "and dataset type for good measure: ", type(dataset))
tempDataset = dataset[0]["audio"]["array"].tobytes()  # tostring() is deprecated in NumPy
tokenized_dataset = miso_tokenizer(tempDataset)  # Tokenize the dataset
input_ids = tokenized_dataset.input_ids
attention_mask = tokenized_dataset.attention_mask
'''

'''
# Evaluate the model (also requires `import torch` and the commented-out
# model/tokenizer loading above).
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Convert predicted token IDs back to text
predicted_text = miso_tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)

# Get ground-truth labels from the dataset ("audio" stands in for the
# appropriate transcript column)
labels = dataset["audio"]

# Compute and report WER
wer_score = wer(labels, predicted_text)
print(f"Word Error Rate (WER): {wer_score}")
'''

'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))
'''
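
# --- Optional sketch: scoring the pipeline with jiwer ------------------------
# `wer` is imported above but never used in the active code. A minimal sketch
# of how the pipeline could be scored against the test set is left commented
# out below. It assumes the dataset exposes a ground-truth transcript column;
# the column name "text" and the helper `evaluate_wer` are illustrative, not
# part of the original app.
'''
def evaluate_wer(dataset, pipe, label_column="text"):
    references, hypotheses = [], []
    for sample in dataset:
        # The ASR pipeline accepts a raw waveform plus sampling rate directly.
        prediction = pipe({
            "raw": sample["audio"]["array"],
            "sampling_rate": sample["audio"]["sampling_rate"],
        })
        hypotheses.append(prediction["text"])
        references.append(sample[label_column])
    return wer(references, hypotheses)

print("WER:", evaluate_wer(dataset, pipe))
'''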