import os
import sys

# Runtime dependency installs (a Hugging Face Spaces workaround; a
# requirements.txt would be the usual alternative).
os.system("pip install transformers==4.27.0")
os.system("pip install numpy==1.23")

from transformers import (
    pipeline,
    WhisperModel,
    WhisperTokenizer,
    WhisperFeatureExtractor,
    AutoFeatureExtractor,
    AutoProcessor,
    WhisperConfig,
)

os.system("pip install jiwer")
from jiwer import wer

os.system("pip install datasets[audio]")
from evaluate import evaluator
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
import gradio as gr

set_caching_enabled(False)  # deprecated alias of disable_caching(); kept for older datasets versions
disable_caching()

huggingface_token = os.environ["huggingface_token"]

# ASR pipeline backed by the fine-tuned Whisper checkpoint.
pipe = pipeline(model="mskov/whisper-small-esc50")
print(pipe)

'''
model = WhisperModel.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
feature_extractor = AutoFeatureExtractor.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper-small-miso", use_auth_token=huggingface_token)
'''

# Load the test split and resample its audio column to the 16 kHz Whisper expects.
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
print(
    dataset,
    "and at 0[audio][array] ",
    dataset[0]["audio"]["array"],
    type(dataset[0]["audio"]["array"]),
    "and at audio : ",
    dataset[0]["audio"],
)


def transcribe(audio):
    """Transcribe a recorded audio file with the Whisper pipeline."""
    text = pipe(audio)["text"]
    return text


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Miso Test",
)
iface.launch()

'''
inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
print("inputs ::: ", inputs, "and dataset type for good measure: ", type(dataset))
tempDataset = dataset[0]["audio"]["array"].tobytes()  # tostring() is deprecated in NumPy
tokenized_dataset = miso_tokenizer(tempDataset)  # Tokenize the dataset
input_ids = tokenized_dataset.input_ids
attention_mask = tokenized_dataset.attention_mask
'''

'''
# Evaluate the model (also requires `import torch` and the commented-out
# model/tokenizer loading above).
model.eval()
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Convert predicted token IDs back to text
predicted_text = miso_tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)

# Get ground-truth labels from the dataset ("audio" stands in for the
# appropriate transcript column)
labels = dataset["audio"]

# Compute and report WER
wer_score = wer(labels, predicted_text)
print(f"Word Error Rate (WER): {wer_score}")
'''

'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
print(list(last_hidden_state.shape))
'''
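
# --- Optional sketch: scoring the pipeline with jiwer ------------------------
# `wer` is imported above but never used in the active code. A minimal sketch
# of how the pipeline could be scored against the test set is left commented
# out below. It assumes the dataset exposes a ground-truth transcript column;
# the column name "text" and the helper `evaluate_wer` are illustrative, not
# part of the original app.
'''
def evaluate_wer(dataset, pipe, label_column="text"):
    references, hypotheses = [], []
    for sample in dataset:
        # The ASR pipeline accepts a raw waveform plus sampling rate directly.
        prediction = pipe({
            "raw": sample["audio"]["array"],
            "sampling_rate": sample["audio"]["sampling_rate"],
        })
        hypotheses.append(prediction["text"])
        references.append(sample[label_column])
    return wer(references, hypotheses)

print("WER:", evaluate_wer(dataset, pipe))
'''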