In [None]:
# Show which `python` executable the shell resolves first on PATH.
# NOTE(review): this may differ from the interpreter the kernel runs on — confirm.
!which python

In [None]:
pip_ouput = !pip install accelerate evaluate torch transformers
#print(pip_ouput)

In [None]:
from datasets import load_dataset

# Load every split of the dataset identified below.
dataset_id = "monadical-labs/acc_dataset_v3"
acc_dataset = load_dataset(dataset_id)

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Sized collection that accepts a list of integer row indices
            via ``dataset[indices]`` and returns something ``pd.DataFrame``
            can consume.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a retry loop or a linear "already picked?" scan.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# Preview random training rows; the "audio" column is dropped before rendering.
show_random_elements(acc_dataset["train"].remove_columns(["audio"]))

In [None]:
# Display the dataset object as the cell output.
acc_dataset

In [None]:
# Replace each split's "text" column with the contents of "text_with_digits".
# The guard keeps the cell safe to re-run: without it, a second run would drop
# the already-renamed "text" column and then fail on the missing source column.
for split in acc_dataset:
    if "text_with_digits" not in acc_dataset[split].column_names:
        continue  # this split was already converted by an earlier run
    acc_dataset[split] = (
        acc_dataset[split]
        .remove_columns(["text"])
        .rename_column("text_with_digits", "text")
    )

In [None]:
# Preview random training rows again; "text" now holds the former "text_with_digits" values.
show_random_elements(acc_dataset["train"].remove_columns(["audio"]))

In [None]:
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperProcessor, WhisperTokenizer

# Checkpoint identifier shared by the model and its processor.
model_name = "openai/whisper-medium.en"

# NOTE(review): the checkpoint name ends in ".en"; confirm that passing
# language/task to the processor is intended for this checkpoint.
processor = WhisperProcessor.from_pretrained(
    model_name,
    language="English",
    task="transcribe",
)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Round-trip one training transcript through the tokenizer and report whether
# decoding (with special tokens skipped) reproduces the original text.
input_str = acc_dataset["train"][9]["text"]
labels = processor.tokenizer(input_str).input_ids

decoded_with_special = processor.tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = processor.tokenizer.decode(labels, skip_special_tokens=True)

print(
    f"Input: {input_str}",
    f"Decoded w/ special: {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal: {input_str == decoded_str}",
    sep="\n",
)

In [None]:
# Inspect the "audio" entry of the first training example.
acc_dataset['train'][0]["audio"]

In [None]:
import IPython.display as ipd
import numpy as np
import random

# random.randrange excludes its upper bound. random.randint(0, n) can return n,
# which is one past the last valid row and raises IndexError.
rand_int = random.randrange(len(acc_dataset["train"]))

print(acc_dataset["train"][rand_int]["text"])
# Uncomment to listen to the sample (IPython.display is imported as `ipd`):
#ipd.Audio(data=np.asarray(acc_dataset["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

In [None]:
# random.randrange excludes its upper bound. random.randint(0, n) can return n,
# which is one past the last valid row and raises IndexError.
rand_int = random.randrange(len(acc_dataset["train"]))

# Fetch the row once and reuse it for all three printouts.
sample = acc_dataset["train"][rand_int]

print("Target text:", sample["text"])
print("Input array shape:", np.asarray(sample["audio"]["array"]).shape)
print("Sampling rate:", sample["audio"]["sampling_rate"])

In [None]:
def prepare_dataset(batch):
    """Add model inputs and label ids to one dataset example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["text"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]``.

    Returns:
        The same ``batch`` mapping with the two new keys set.
    """
    clip = batch["audio"]

    # The feature extractor returns a batched result; keep its first entry.
    extracted = processor.feature_extractor(
        clip["array"],
        sampling_rate=clip["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    token_ids = processor.tokenizer(batch["text"]).input_ids
    batch["labels"] = token_ids

    return batch

In [None]:
acc_dataset = acc_dataset.map(prepare_dataset)

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of prepared examples into one batch of inputs and labels."""

    # Object exposing ``feature_extractor`` and ``tokenizer``, each offering ``pad``.
    processor: Any
    # Token id removed from the front of the labels when every row starts with it.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: Examples carrying ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        # Audio features and label ids are padded independently by their own component.
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # Positions whose attention mask is not 1 are overwritten with -100.
        padding_positions = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(padding_positions, -100)

        # Drop the leading column only when every row begins with the start token.
        all_start_with_token = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_start_with_token:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance for the trainer; the start token id is read from the model config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)

In [None]:
import evaluate 


# Load the word-error-rate and character-error-rate metrics by name.
wer_metric, cer_metric = (evaluate.load(metric_name) for metric_name in ("wer", "cer"))

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` arrays of
            token ids. ``label_ids`` is modified in place.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is what ``wer_metric.compute`` returns.
    """
    references = pred.label_ids
    # -100 entries are replaced (in place) by the pad token id so the labels can be decoded.
    references[references == -100] = processor.tokenizer.pad_token_id

    hypothesis_text = processor.tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    reference_text = processor.tokenizer.batch_decode(references, skip_special_tokens=True)

    return {"wer": wer_metric.compute(predictions=hypothesis_text, references=reference_text)}

In [None]:
from transformers import Seq2SeqTrainingArguments

# model_name contains a "/", so this resolves to a nested directory path
# ("training-artifacts-openai/whisper-medium.en").
dir_for_training_artifacts = "training-artifacts-" + model_name

# Evaluation, logging, warm-up and checkpointing all share this interval.
eval_step_count = 25
max_step_count = 300

training_args = Seq2SeqTrainingArguments(
    output_dir=dir_for_training_artifacts,
    # Optimisation schedule.
    learning_rate=5e-5,
    warmup_steps=eval_step_count,
    max_steps=max_step_count,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=4,
    fp16=True,
    gradient_checkpointing=True,
    # Evaluation and logging.
    evaluation_strategy="steps",
    eval_steps=eval_step_count,
    logging_steps=eval_step_count,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpointing: save on the same cadence as evaluation. With the library
    # default (save_steps=500) no checkpoint is written at any evaluation step
    # of a 300-step run, so load_best_model_at_end has no per-evaluation
    # checkpoint to restore.
    save_strategy="steps",
    save_steps=eval_step_count,
    # Bound disk usage; the best checkpoint is retained alongside the latest.
    save_total_limit=2,
    # Best-model selection: lower WER is better.
    metric_for_best_model="wer",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Hub upload: do not push at every intermediate checkpoint save.
    push_to_hub=True,
    hub_strategy="end",
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, data splits, collator and metric function into a trainer.
# The "tokenizer" argument receives the processor's feature extractor.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=acc_dataset["train"],
    eval_dataset=acc_dataset["validate"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
# Authenticate with HF if you haven't already. 

#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
# Run training with the trainer configured above.
trainer.train()

In [None]:
# Upload the training result through the trainer's push_to_hub().
trainer.push_to_hub()

In [None]:
def map_to_result(batch):
    """Transcribe one prepared example and store the text in ``batch["pred_str"]``.

    Args:
        batch: Example carrying ``"input_features"`` as produced by
            ``prepare_dataset``.

    Returns:
        The same ``batch`` with ``"pred_str"`` set to the decoded prediction.
    """
    with torch.no_grad():
        # Build the input on the model's own device rather than assuming CUDA.
        input_values = torch.tensor(batch["input_features"], device=model.device).unsqueeze(0)
        predicted_ids = model.generate(input_values)

    # Special tokens are stripped so the prediction is comparable with the
    # plain reference text when WER/CER are computed, matching compute_metrics.
    batch["pred_str"] = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return batch

In [None]:
# Split to transcribe; switch to "validate" or "train" to inspect another split.
eval_split = "test"
results = acc_dataset[eval_split].map(map_to_result)

In [None]:
import evaluate 


# Load the "wer" and "cer" metrics again; the same two names are already
# assigned by an earlier cell of this notebook.
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

In [None]:
# Show the first decoded prediction.
results["pred_str"][0]

In [None]:
# Score the decoded predictions against the reference text and report each
# metric to three decimal places.
for label, metric in (("WER", wer_metric), ("CER", cer_metric)):
    score = metric.compute(predictions=results["pred_str"], references=results["text"])
    print(f"{label}: {score:.3f}")

In [None]:
# This re-defines the helper of the same name from an earlier cell.
def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Sized collection that accepts a list of integer row indices
            via ``dataset[indices]`` and returns something ``pd.DataFrame``
            can consume.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` is larger than ``len(dataset)``.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a retry loop or a linear "already picked?" scan.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
# Compare reference text and prediction side by side for random result rows.
show_random_elements(results.select_columns(["text", "pred_str"]))

In [None]:
# Generate for the first training example and inspect the raw output ids.
with torch.no_grad():
    first_features = acc_dataset["train"][:1]["input_features"]
    # Place the input on the model's own device rather than assuming CUDA.
    predicted_ids = model.generate(torch.tensor(first_features, device=model.device))

print(predicted_ids)

# Decode with special tokens kept, so the full generated sequence is visible.
processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]