File size: 1,180 Bytes
de07127 b4725a8 decc59e 664eb76 0b5b7f4 8a965da 3b57b43 3826e01 973bb39 62ac43e dca4d0e db75012 75a25c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# Smoke test for the "mskov/whisper_miso" Whisper checkpoint: load the model
# and its feature extractor, run the first sample of the "mskov/miso_test"
# test split through the model, and print the shape of the last hidden state.
import os
import subprocess
import sys

# Best-effort runtime installs, done before the third-party imports below so
# that `evaluate` / `datasets` see the freshly installed extras. Going through
# `sys.executable -m pip` installs into the interpreter that runs this script,
# and the argument list avoids a shell. A failed install is not fatal here
# (the original ignored pip's exit status as well); a later import will fail
# loudly if something required is really missing.
for requirement in ("jiwer", "datasets[audio]"):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )

import torch  # needed for torch.tensor / torch.no_grad below
from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor, AutoFeatureExtractor, AutoProcessor, WhisperConfig
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled

# Turn off the datasets cache.
# NOTE(review): both calls appear to do the same thing; `set_caching_enabled`
# looks like the legacy spelling -- confirm against the installed `datasets`
# version before dropping either one.
set_caching_enabled(False)
disable_caching()

# Raises KeyError when the variable is unset, which fails fast before any
# download is attempted.
huggingface_token = os.environ["huggingface_token"]

model = WhisperModel.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)

# Resample the audio column to 16 kHz on access.
ds = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))

# Read the first row once: every `ds[0]` access decodes the audio again.
sample = ds[0]
print(ds, "and at 0 ", sample)

# Pass the sampling rate explicitly so the extractor can check it against the
# rate it was configured for instead of assuming it silently.
inputs = feature_extractor(
    sample["audio"]["array"],
    sampling_rate=sample["audio"]["sampling_rate"],
    return_tensors="pt",
)
print("check check")
print(inputs)
input_features = inputs.input_features

# Two decoder positions, both set to the configured decoder start token id.
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

# Only the output shape is inspected, so skip gradient tracking.
with torch.no_grad():
    last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))
|